(crawler) Make the use of virtual threads in the crawler configurable via system properties

(crawler) Add crawling metadata to domainstate db
Revert "(crawler) Further rearrange crawl order"
2025-10-06 17:32:39 +02:00 · 2025-03-27 21:26:05 +01:00 · 2025-03-27 16:38:37 +01:00 · 2025-03-27 11:25:08 +01:00
6 changed files with 151 additions and 16 deletions
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
@@ -103,10 +103,18 @@ public class CrawlerMain extends ProcessMainClass {
        this.blacklist = blacklist;
        this.node = processConfiguration.node();
        SimpleBlockingThreadPool.ThreadType threadType;
        if (Boolean.getBoolean("crawler.useVirtualThreads")) {
            threadType = SimpleBlockingThreadPool.ThreadType.VIRTUAL;
        }
        else {
            threadType = SimpleBlockingThreadPool.ThreadType.PLATFORM;
        }
        pool = new SimpleBlockingThreadPool("CrawlerPool",
                Integer.getInteger("crawler.poolSize", 256),
                1,
-                SimpleBlockingThreadPool.ThreadType.VIRTUAL);
+                threadType);
        // Wait for the blacklist to be loaded before starting the crawl
@@ -302,8 +310,8 @@ public class CrawlerMain extends ProcessMainClass {
    }
    /** Create a comparator that sorts the crawl specs in a way that is beneficial for the crawl,
-     * we want to enqueue domains that tend ro be large and have common top domains first,
+     * we want to enqueue domains that have common top domains first, but otherwise have a random
-     * but otherwise have a random order.
+     * order.
     * <p></p>
     * Note, we can't use hash codes for randomization as it is not desirable to have the same order
     * every time the process is restarted (and CrawlSpecRecord is a record, which defines equals and
@@ -311,13 +319,15 @@ public class CrawlerMain extends ProcessMainClass {
     * */
    private Comparator<CrawlSpecRecord> crawlSpecArrangement(List<CrawlSpecRecord> records) {
        Random r = new Random();
        Map<String, Integer> topDomainCounts = new HashMap<>(4 + (int) Math.sqrt(records.size()));
        Map<String, Integer> randomOrder = new HashMap<>(records.size());
        for (var spec : records) {
            topDomainCounts.merge(EdgeDomain.getTopDomain(spec.domain), 1, Integer::sum);
            randomOrder.put(spec.domain, r.nextInt());
        }
-        return Comparator.comparing((CrawlSpecRecord spec) -> spec.domain.contains(".edu"))
+        return Comparator.comparing((CrawlSpecRecord spec) -> topDomainCounts.getOrDefault(EdgeDomain.getTopDomain(spec.domain), 0) >= 8)
                .reversed()
                .thenComparing(spec -> randomOrder.get(spec.domain))
                .thenComparing(Record::hashCode); // non-deterministic tie-breaker to
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/DomainStateDb.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/DomainStateDb.java
@@ -11,6 +11,7 @@ import java.nio.file.Path;
 import java.sql.Connection;
 import java.sql.DriverManager;
 import java.sql.SQLException;
 import java.time.Duration;
 import java.time.Instant;
 import java.util.Objects;
 import java.util.Optional;
@@ -24,6 +25,17 @@ public class DomainStateDb implements AutoCloseable {
    private final Connection connection;
    /**
     * Per-domain crawl statistics, persisted as one row of the {@code crawl_meta} table
     * (see {@code save(CrawlMeta)} and {@code getMeta(String)}).
     *
     * @param domainName     domain the row belongs to; stored in the {@code domain} primary key column
     * @param lastFullCrawl  timestamp of the crawl; persisted as epoch milliseconds, so any
     *                       sub-millisecond precision is lost on a round trip
     * @param recrawlTime    duration of the recrawl step; persisted as whole milliseconds
     * @param crawlTime      duration of the crawl step; persisted as whole milliseconds
     * @param recrawlErrors  error count reported by the recrawl step
     * @param crawlChanges   change count for the crawl -- NOTE(review): the crawler appears to pass
     *                       the number of successfully fetched new documents here; confirm
     * @param totalCrawlSize overall crawl size -- NOTE(review): the crawler appears to pass the
     *                       recrawl size plus the number of newly fetched documents; confirm
     */
    public record CrawlMeta(
            String domainName,
            Instant lastFullCrawl,
            Duration recrawlTime,
            Duration crawlTime,
            int recrawlErrors,
            int crawlChanges,
            int totalCrawlSize
    ) {}
    public record SummaryRecord(
            String domainName,
            Instant lastUpdated,
@@ -102,6 +114,17 @@ public class DomainStateDb implements AutoCloseable {
                        feedUrl TEXT
                    )
                    """);
            stmt.executeUpdate("""
                    CREATE TABLE IF NOT EXISTS crawl_meta (
                        domain TEXT PRIMARY KEY,
                        lastFullCrawlEpochMs LONG NOT NULL,
                        recrawlTimeMs LONG NOT NULL,
                        recrawlErrors INTEGER NOT NULL,
                        crawlTimeMs LONG NOT NULL,
                        crawlChanges INTEGER NOT NULL,
                        totalCrawlSize INTEGER NOT NULL
                    )
                    """);
            stmt.executeUpdate("""
                    CREATE TABLE IF NOT EXISTS favicon (
                        domain TEXT PRIMARY KEY,
@@ -164,6 +187,26 @@ public class DomainStateDb implements AutoCloseable {
        return Optional.empty();
    }
    /**
     * Writes the given crawl metadata to the {@code crawl_meta} table, replacing any row
     * that already exists for the same domain.
     * <p>
     * SQL failures are logged and not propagated to the caller.
     *
     * @param crawlMeta the metadata record to persist
     * @throws IllegalStateException if there is no connection to the domain state database
     */
    public void save(CrawlMeta crawlMeta) {
        if (connection == null) {
            throw new IllegalStateException("No connection to domainstate db");
        }

        // Parameter positions below follow the column order of this statement
        final String upsertSql = """
                INSERT OR REPLACE INTO crawl_meta (domain, lastFullCrawlEpochMs, recrawlTimeMs, recrawlErrors, crawlTimeMs, crawlChanges, totalCrawlSize)
                VALUES (?, ?, ?, ?, ?, ?, ?)
                """;

        try (var upsert = connection.prepareStatement(upsertSql)) {
            upsert.setString(1, crawlMeta.domainName());
            upsert.setLong(2, crawlMeta.lastFullCrawl().toEpochMilli());
            upsert.setLong(3, crawlMeta.recrawlTime().toMillis());
            upsert.setInt(4, crawlMeta.recrawlErrors());
            upsert.setLong(5, crawlMeta.crawlTime().toMillis());
            upsert.setInt(6, crawlMeta.crawlChanges());
            upsert.setInt(7, crawlMeta.totalCrawlSize());

            upsert.executeUpdate();
        }
        catch (SQLException sqlError) {
            logger.error("Failed to insert crawl meta record", sqlError);
        }
    }
    public void save(SummaryRecord record) {
        if (connection == null) throw new IllegalStateException("No connection to domainstate db");
@@ -182,7 +225,35 @@ public class DomainStateDb implements AutoCloseable {
        }
    }
-    public Optional<SummaryRecord> get(String domainName) {
+    public Optional<CrawlMeta> getMeta(String domainName) {
        // Looks up the crawl_meta row for domainName. The result is Optional.empty() when
        // there is no connection, when no row matches, or when the query fails (the failure
        // is logged rather than thrown).
        if (connection == null)
            return Optional.empty();
        try (var stmt = connection.prepareStatement("""
                SELECT domain, lastFullCrawlEpochMs, recrawlTimeMs, recrawlErrors, crawlTimeMs, crawlChanges, totalCrawlSize
                FROM crawl_meta
                WHERE domain = ?
                """)) {
            stmt.setString(1, domainName);
            var rs = stmt.executeQuery();
            if (rs.next()) {
                // Columns are read by name, so the argument order here follows the CrawlMeta
                // record components (recrawlTime, crawlTime, recrawlErrors, ...) rather than
                // the SELECT column order.
                return Optional.of(new CrawlMeta(
                        rs.getString("domain"),
                        Instant.ofEpochMilli(rs.getLong("lastFullCrawlEpochMs")),
                        Duration.ofMillis(rs.getLong("recrawlTimeMs")),
                        Duration.ofMillis(rs.getLong("crawlTimeMs")),
                        rs.getInt("recrawlErrors"),
                        rs.getInt("crawlChanges"),
                        rs.getInt("totalCrawlSize")
                ));
            }
        } catch (SQLException ex) {
            logger.error("Failed to get crawl meta record", ex);
        }
        // No matching row, or the lookup failed
        return Optional.empty();
    }
    public Optional<SummaryRecord> getSummary(String domainName) {
        if (connection == null)
            return Optional.empty();
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java
@@ -29,6 +29,7 @@ import java.net.http.HttpResponse;
 import java.net.http.HttpTimeoutException;
 import java.time.Duration;
 import java.util.*;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Semaphore;
 import java.util.zip.GZIPInputStream;
@@ -56,12 +57,21 @@ public class HttpFetcherImpl implements HttpFetcher {
    private final HttpClient client;
    /**
     * Builds the {@link HttpClient} used by this fetcher.
     * <p>
     * The executor backing the client is chosen by the boolean system property
     * {@code crawler.httpclient.useVirtualThreads}: a virtual-thread-per-task executor when
     * the property is "true", a cached platform thread pool otherwise (including when the
     * property is unset).
     *
     * @return a client that follows redirects with the NORMAL policy, uses the shared cookie
     *         handler and has an 8 second connect timeout
     */
    private HttpClient createClient() {
        final ExecutorService executorService;
        // Boolean.getBoolean reads a system property; it is false unless the value equals "true"
        if (Boolean.getBoolean("crawler.httpclient.useVirtualThreads")) {
            executorService = Executors.newVirtualThreadPerTaskExecutor();
        }
        else {
            executorService = Executors.newCachedThreadPool();
        }
        // NOTE(review): NoSecuritySSL presumably disables certificate validation -- confirm
        return HttpClient.newBuilder()
                .sslContext(NoSecuritySSL.buildSslContext())
                .cookieHandler(cookies)
                .followRedirects(HttpClient.Redirect.NORMAL)
                .connectTimeout(Duration.ofSeconds(8))
-                .executor(Executors.newVirtualThreadPerTaskExecutor())
+                .executor(executorService)
                .build();
    }
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
@@ -26,6 +26,8 @@ import java.io.IOException;
 import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.nio.file.Path;
 import java.time.Duration;
 import java.time.Instant;
 import java.util.List;
 import java.util.Optional;
 import java.util.concurrent.TimeUnit;
@@ -113,15 +115,19 @@ public class CrawlerRetreiver implements AutoCloseable {
                        throw new InterruptedException();
                    }
                    Instant recrawlStart = Instant.now();
                    CrawlerRevisitor.RecrawlMetadata recrawlMetadata = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
                    Duration recrawlTime = Duration.between(recrawlStart, Instant.now());
                    // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
-                    if (crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer) > 0) {
+                    if (recrawlMetadata.size() > 0) {
                        // If we have reference data, we will always grow the crawl depth a bit
                        crawlFrontier.increaseDepth(1.5, 2500);
                    }
                    oldCrawlData.close(); // proactively close the crawl data reference here to not hold onto expensive resources
-                    yield crawlDomain(probedUrl, robotsRules, delayTimer, domainLinks);
+                    yield crawlDomain(probedUrl, robotsRules, delayTimer, domainLinks, recrawlMetadata, recrawlTime);
                }
                case HttpFetcher.DomainProbeResult.Redirect(EdgeDomain domain1) -> {
                    domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, "Redirect", domain1.toString()));
@@ -143,7 +149,11 @@ public class CrawlerRetreiver implements AutoCloseable {
    private int crawlDomain(EdgeUrl rootUrl,
                            SimpleRobotRules robotsRules,
                            CrawlDelayTimer delayTimer,
-                            DomainLinks domainLinks) {
+                            DomainLinks domainLinks,
                            CrawlerRevisitor.RecrawlMetadata recrawlMetadata,
                            Duration recrawlTime) {
        Instant crawlStart = Instant.now();
        // Add external links to the crawl frontier
        crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));
@@ -153,6 +163,8 @@ public class CrawlerRetreiver implements AutoCloseable {
            crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
        }
        int crawlerAdditions = 0;
        while (!crawlFrontier.isEmpty()
            && !crawlFrontier.isCrawlDepthReached()
            && errorCount < MAX_ERRORS
@@ -184,7 +196,11 @@ public class CrawlerRetreiver implements AutoCloseable {
                continue;
            try {
-                fetchContentWithReference(top, delayTimer, DocumentWithReference.empty());
+                var result = fetchContentWithReference(top, delayTimer, DocumentWithReference.empty());
                if (result.isOk()) {
                    crawlerAdditions++;
                }
            }
            catch (InterruptedException ex) {
                Thread.currentThread().interrupt();
@@ -192,6 +208,17 @@ public class CrawlerRetreiver implements AutoCloseable {
            }
        }
        Duration crawlTime = Duration.between(crawlStart, Instant.now());
        domainStateDb.save(new DomainStateDb.CrawlMeta(
                domain,
                Instant.now(),
                recrawlTime,
                crawlTime,
                recrawlMetadata.errors(),
                crawlerAdditions,
                recrawlMetadata.size() + crawlerAdditions
        ));
        return crawlFrontier.visitedSize();
    }
@@ -324,7 +351,7 @@ public class CrawlerRetreiver implements AutoCloseable {
    );
    private Optional<String> guessFeedUrl(CrawlDelayTimer timer) throws InterruptedException {
-        var oldDomainStateRecord = domainStateDb.get(domain);
+        var oldDomainStateRecord = domainStateDb.getSummary(domain);
        // If we are already aware of an old feed URL, then we can just revalidate it
        if (oldDomainStateRecord.isPresent()) {
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
@@ -31,7 +31,7 @@ public class CrawlerRevisitor {
    }
    /** Performs a re-crawl of old documents, comparing etags and last-modified */
-    public int recrawl(CrawlDataReference oldCrawlData,
+    public RecrawlMetadata recrawl(CrawlDataReference oldCrawlData,
                       SimpleRobotRules robotsRules,
                       CrawlDelayTimer delayTimer)
    throws InterruptedException {
@@ -39,6 +39,7 @@ public class CrawlerRevisitor {
        int retained = 0;
        int errors = 0;
        int skipped = 0;
        int size = 0;
        for (CrawledDocument doc : oldCrawlData) {
            if (errors > 20) {
@@ -82,6 +83,7 @@ public class CrawlerRevisitor {
                continue;
            }
            size++;
            double skipProb;
@@ -154,6 +156,8 @@ public class CrawlerRevisitor {
            }
        }
-        return recrawled;
+        return new RecrawlMetadata(size, errors, skipped);
    }
    /**
     * Counters reported by {@code recrawl} once it has finished iterating the old crawl data.
     *
     * @param size    number of old documents that got past the early skip checks in the recrawl
     *                loop -- NOTE(review): the checks themselves are not all visible here; confirm
     * @param errors  number of errors counted during the recrawl
     * @param skipped value of the recrawl's {@code skipped} counter -- NOTE(review): presumably
     *                documents that were not re-fetched; confirm against the loop body
     */
    public record RecrawlMetadata(int size, int errors, int skipped) {}
 }
--- a/code/processes/crawling-process/test/nu/marginalia/crawl/DomainStateDbTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/DomainStateDbTest.java
@@ -8,6 +8,7 @@ import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
 import java.time.Duration;
 import java.time.Instant;
 import static org.junit.jupiter.api.Assertions.*;
@@ -47,8 +48,8 @@ class DomainStateDbTest {
            db.save(allFields);
            db.save(minFields);
-            assertEquals(allFields, db.get("all.marginalia.nu").orElseThrow());
+            assertEquals(allFields, db.getSummary("all.marginalia.nu").orElseThrow());
-            assertEquals(minFields, db.get("min.marginalia.nu").orElseThrow());
+            assertEquals(minFields, db.getSummary("min.marginalia.nu").orElseThrow());
            var updatedAllFields = new DomainStateDb.SummaryRecord(
                    "all.marginalia.nu",
@@ -59,7 +60,19 @@ class DomainStateDbTest {
            );
            db.save(updatedAllFields);
-            assertEquals(updatedAllFields, db.get("all.marginalia.nu").orElseThrow());
+            assertEquals(updatedAllFields, db.getSummary("all.marginalia.nu").orElseThrow());
        }
    }
    /** Checks that a CrawlMeta record read back via getMeta equals the one that was saved. */
    @Test
    public void testMetadata() throws SQLException {
        try (var stateDb = new DomainStateDb(tempFile)) {
            final var expected = new DomainStateDb.CrawlMeta(
                    "example.com",
                    Instant.ofEpochMilli(12345),
                    Duration.ofMillis(30),
                    Duration.ofMillis(300),
                    1,
                    2,
                    3);

            stateDb.save(expected);

            final var loaded = stateDb.getMeta("example.com");

            assertTrue(loaded.isPresent());
            assertEquals(expected, loaded.get());
        }
    }
Author	SHA1	Message	Date
Viktor Lofgren	ecb0e57a1a	(crawler) Make the use of virtual threads in the crawler configurable via system properties	2025-03-27 21:26:05 +01:00
Viktor Lofgren	8c61f61b46	(crawler) Add crawling metadata to domainstate db	2025-03-27 16:38:37 +01:00
Viktor Lofgren	662a18c933	Revert "(crawler) Further rearrange crawl order" This reverts commit `1c2426a052`. The change does not appear necessary to avoid problems.	2025-03-27 11:25:08 +01:00