mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits


9 Commits

Author SHA1 Message Date
Viktor Lofgren
294ab19177 (status) Use old-search for status service instead of marginalia-search.com 2025-07-06 15:40:53 +02:00
Viktor Lofgren
6f1659ecb2 (control) Add GUI for NSFW Filter Update trigger 2025-06-25 16:03:27 +02:00
Viktor Lofgren
982dcb28f0 (live-crawler) Use Apache HttpClient + code cleanup 2025-06-24 13:04:19 +02:00
Viktor Lofgren
fc686d8b2e (live-crawler) Fix startup race condition
The fix makes sure we wait for the feeds API to be available before fetching from it, so that the process doesn't crash on a cold system reboot.
2025-06-24 11:42:41 +02:00
Viktor Lofgren
69ef0f334a (rss) Make feed fetcher use Apache's HttpClient 2025-06-23 18:49:55 +02:00
Viktor Lofgren
446746f3bd (control) Fix so that sideload actions show up in Mixed profile nodes 2025-06-23 18:08:09 +02:00
Viktor Lofgren
24ab8398bb (ndp) Use LinkGraphClient to populate NDP table 2025-06-23 16:44:38 +02:00
Viktor Lofgren
d2ceeff4cf (ndp) Add toggle for excluding nodes from assignment via NDP 2025-06-23 15:38:02 +02:00
Viktor Lofgren
cf64214b1c (ndp) Update documentation 2025-06-23 15:18:35 +02:00
34 changed files with 849 additions and 240 deletions

View File

@@ -45,7 +45,7 @@ public class NodeConfigurationService {
public List<NodeConfiguration> getAll() {
try (var conn = dataSource.getConnection();
var qs = conn.prepareStatement("""
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, NODE_PROFILE, DISABLED
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, AUTO_ASSIGN_DOMAINS, KEEP_WARCS, NODE_PROFILE, DISABLED
FROM NODE_CONFIGURATION
""")) {
var rs = qs.executeQuery();
@@ -59,6 +59,7 @@ public class NodeConfigurationService {
rs.getBoolean("ACCEPT_QUERIES"),
rs.getBoolean("AUTO_CLEAN"),
rs.getBoolean("PRECESSION"),
rs.getBoolean("AUTO_ASSIGN_DOMAINS"),
rs.getBoolean("KEEP_WARCS"),
NodeProfile.valueOf(rs.getString("NODE_PROFILE")),
rs.getBoolean("DISABLED")
@@ -75,7 +76,7 @@ public class NodeConfigurationService {
public NodeConfiguration get(int nodeId) throws SQLException {
try (var conn = dataSource.getConnection();
var qs = conn.prepareStatement("""
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, NODE_PROFILE, DISABLED
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, AUTO_ASSIGN_DOMAINS, KEEP_WARCS, NODE_PROFILE, DISABLED
FROM NODE_CONFIGURATION
WHERE ID=?
""")) {
@@ -88,6 +89,7 @@ public class NodeConfigurationService {
rs.getBoolean("ACCEPT_QUERIES"),
rs.getBoolean("AUTO_CLEAN"),
rs.getBoolean("PRECESSION"),
rs.getBoolean("AUTO_ASSIGN_DOMAINS"),
rs.getBoolean("KEEP_WARCS"),
NodeProfile.valueOf(rs.getString("NODE_PROFILE")),
rs.getBoolean("DISABLED")
@@ -102,7 +104,7 @@ public class NodeConfigurationService {
try (var conn = dataSource.getConnection();
var us = conn.prepareStatement("""
UPDATE NODE_CONFIGURATION
SET DESCRIPTION=?, ACCEPT_QUERIES=?, AUTO_CLEAN=?, PRECESSION=?, KEEP_WARCS=?, DISABLED=?, NODE_PROFILE=?
SET DESCRIPTION=?, ACCEPT_QUERIES=?, AUTO_CLEAN=?, PRECESSION=?, AUTO_ASSIGN_DOMAINS=?, KEEP_WARCS=?, DISABLED=?, NODE_PROFILE=?
WHERE ID=?
"""))
{
@@ -110,10 +112,11 @@ public class NodeConfigurationService {
us.setBoolean(2, config.acceptQueries());
us.setBoolean(3, config.autoClean());
us.setBoolean(4, config.includeInPrecession());
us.setBoolean(5, config.keepWarcs());
us.setBoolean(6, config.disabled());
us.setString(7, config.profile().name());
us.setInt(8, config.node());
us.setBoolean(5, config.autoAssignDomains());
us.setBoolean(6, config.keepWarcs());
us.setBoolean(7, config.disabled());
us.setString(8, config.profile().name());
us.setInt(9, config.node());
if (us.executeUpdate() <= 0)
throw new IllegalStateException("Failed to update configuration");

View File

@@ -5,6 +5,7 @@ public record NodeConfiguration(int node,
boolean acceptQueries,
boolean autoClean,
boolean includeInPrecession,
boolean autoAssignDomains,
boolean keepWarcs,
NodeProfile profile,
boolean disabled

View File

@@ -20,9 +20,7 @@ public enum NodeProfile {
}
public boolean permitBatchCrawl() {
return isBatchCrawl() ||isMixed();
return isBatchCrawl() || isMixed();
}
public boolean permitSideload() {
return isMixed() || isSideload();
}
public boolean permitSideload() { return isSideload() || isMixed(); }
}

View File

@@ -2,6 +2,7 @@ package nu.marginalia.nodecfg;
import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.nodecfg.model.NodeConfiguration;
import nu.marginalia.nodecfg.model.NodeProfile;
import nu.marginalia.test.TestMigrationLoader;
import org.junit.jupiter.api.BeforeAll;
@@ -62,6 +63,63 @@ public class NodeConfigurationServiceTest {
assertEquals(2, list.size());
assertEquals(a, list.get(0));
assertEquals(b, list.get(1));
}
// Test all the fields that are only exposed via save()
@Test
public void testSaveChanges() throws SQLException {
var original = nodeConfigurationService.create(1, "Test", false, false, NodeProfile.MIXED);
assertEquals(1, original.node());
assertEquals("Test", original.description());
assertFalse(original.acceptQueries());
var autoAssign = new NodeConfiguration(
original.node(),
"Foo",
true,
original.autoClean(),
original.includeInPrecession(),
!original.autoAssignDomains(),
original.keepWarcs(),
original.profile(),
original.disabled()
);
nodeConfigurationService.save(autoAssign);
autoAssign = nodeConfigurationService.get(original.node());
assertNotEquals(original.autoAssignDomains(), autoAssign.autoAssignDomains());
var autoClean = new NodeConfiguration(
original.node(),
"Foo",
true,
!original.autoClean(),
original.includeInPrecession(),
original.autoAssignDomains(),
original.keepWarcs(),
original.profile(),
original.disabled()
);
nodeConfigurationService.save(autoClean);
autoClean = nodeConfigurationService.get(original.node());
assertNotEquals(original.autoClean(), autoClean.autoClean());
var disabled = new NodeConfiguration(
original.node(),
"Foo",
true,
autoClean.autoClean(),
autoClean.includeInPrecession(),
autoClean.autoAssignDomains(),
autoClean.keepWarcs(),
autoClean.profile(),
!autoClean.disabled()
);
nodeConfigurationService.save(disabled);
disabled = nodeConfigurationService.get(original.node());
assertNotEquals(autoClean.disabled(), disabled.disabled());
}
}

View File

@@ -0,0 +1,3 @@
-- Migration script to add AUTO_ASSIGN_DOMAINS column to NODE_CONFIGURATION table
ALTER TABLE NODE_CONFIGURATION ADD COLUMN AUTO_ASSIGN_DOMAINS BOOLEAN NOT NULL DEFAULT TRUE;
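Once this migration has run, the new flag flows through the accessors added above. A minimal usage sketch (assuming the service is injected via Guice, as elsewhere in this codebase):

    // AUTO_ASSIGN_DOMAINS defaults to TRUE, so existing nodes keep receiving domains
    NodeConfiguration cfg = nodeConfigurationService.get(1);
    if (cfg.autoAssignDomains()) {
        // this node is eligible to receive new domains from the NDP process
    }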

View File

@@ -9,6 +9,7 @@ import nu.marginalia.executor.storage.FileStorageFile;
import nu.marginalia.executor.upload.UploadDirContents;
import nu.marginalia.executor.upload.UploadDirItem;
import nu.marginalia.functions.execution.api.*;
import nu.marginalia.mq.persistence.MqPersistence;
import nu.marginalia.service.ServiceId;
import nu.marginalia.service.client.GrpcChannelPoolFactory;
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
@@ -25,27 +26,37 @@ import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.List;
import static nu.marginalia.functions.execution.api.ExecutorApiGrpc.ExecutorApiBlockingStub;
@Singleton
public class ExecutorClient {
private final MqPersistence persistence;
private final GrpcMultiNodeChannelPool<ExecutorApiBlockingStub> channelPool;
private static final Logger logger = LoggerFactory.getLogger(ExecutorClient.class);
private final ServiceRegistryIf registry;
@Inject
public ExecutorClient(ServiceRegistryIf registry,
MqPersistence persistence,
GrpcChannelPoolFactory grpcChannelPoolFactory)
{
this.registry = registry;
this.persistence = persistence;
this.channelPool = grpcChannelPoolFactory
.createMulti(
ServiceKey.forGrpcApi(ExecutorApiGrpc.class, ServicePartition.multi()),
ExecutorApiGrpc::newBlockingStub);
}
private long createTrackingTokenMsg(String task, int node, Duration ttl) throws Exception {
return persistence.sendNewMessage("task-tracking[" + node + "]", "export-client", null, task, "", ttl);
}
public void startFsm(int node, String actorName) {
channelPool.call(ExecutorApiBlockingStub::startFsm)
.forNode(node)
@@ -96,6 +107,16 @@ public class ExecutorClient {
.build());
}
public long updateNsfwFilters() throws Exception {
long msgId = createTrackingTokenMsg("nsfw-filters", 1, Duration.ofHours(6));
channelPool.call(ExecutorApiBlockingStub::updateNsfwFilters)
.forNode(1)
.run(RpcUpdateNsfwFilters.newBuilder().setMsgId(msgId).build());
return msgId;
}
public ActorRunStates getActorStates(int node) {
try {
var rs = channelPool.call(ExecutorApiBlockingStub::getActorStates)

View File

@@ -18,6 +18,8 @@ service ExecutorApi {
rpc calculateAdjacencies(Empty) returns (Empty) {}
rpc restoreBackup(RpcFileStorageId) returns (Empty) {}
rpc updateNsfwFilters(RpcUpdateNsfwFilters) returns (Empty) {}
rpc restartExecutorService(Empty) returns (Empty) {}
}
@@ -66,6 +68,9 @@ message RpcExportRequest {
int64 fileStorageId = 1;
int64 msgId = 2;
}
message RpcUpdateNsfwFilters {
int64 msgId = 1;
}
message RpcFileStorageIdWithDomainName {
int64 fileStorageId = 1;
string targetDomainName = 2;

View File

@@ -6,7 +6,7 @@ import java.util.Set;
public enum ExecutorActor {
PREC_EXPORT_ALL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
SYNC_NSFW_LISTS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
UPDATE_NSFW_LISTS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD, NodeProfile.REALTIME),
CRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
RECRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),

View File

@@ -113,7 +113,7 @@ public class ExecutorActorControlService {
register(ExecutorActor.UPDATE_RSS, updateRssActor);
register(ExecutorActor.MIGRATE_CRAWL_DATA, migrateCrawlDataActor);
register(ExecutorActor.SYNC_NSFW_LISTS, updateNsfwFiltersActor);
register(ExecutorActor.UPDATE_NSFW_LISTS, updateNsfwFiltersActor);
if (serviceConfiguration.node() == 1) {
register(ExecutorActor.PREC_EXPORT_ALL, exportAllPrecessionActor);

View File

@@ -25,6 +25,10 @@ import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
// Unlike other monitor actors, the ping monitor will not merely wait for a request
// to be sent, but send one itself, hence we can't extend AbstractProcessSpawnerActor
// but have to reimplement a lot of the same logic ourselves.
@Singleton
public class PingMonitorActor extends RecordActorPrototype {
@@ -53,7 +57,6 @@ public class PingMonitorActor extends RecordActorPrototype {
return switch (self) {
case Initial i -> {
PingRequest request = new PingRequest();
persistence.sendNewMessage(inboxName, null, null,
"PingRequest",
gson.toJson(request),

View File

@@ -44,7 +44,6 @@ public class LiveCrawlActor extends RecordActorPrototype {
@Override
public ActorStep transition(ActorStep self) throws Exception {
logger.info("{}", self);
return switch (self) {
case Initial() -> {
yield new Monitor("-");

View File

@@ -5,6 +5,8 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.mq.MqMessageState;
import nu.marginalia.mq.persistence.MqPersistence;
import nu.marginalia.nsfw.NsfwDomainFilter;
import nu.marginalia.service.module.ServiceConfiguration;
@@ -12,23 +14,26 @@ import nu.marginalia.service.module.ServiceConfiguration;
public class UpdateNsfwFiltersActor extends RecordActorPrototype {
private final ServiceConfiguration serviceConfiguration;
private final NsfwDomainFilter nsfwDomainFilter;
private final MqPersistence persistence;
public record Initial() implements ActorStep {}
public record Run() implements ActorStep {}
public record Initial(long respondMsgId) implements ActorStep {}
public record Run(long respondMsgId) implements ActorStep {}
@Override
public ActorStep transition(ActorStep self) throws Exception {
return switch(self) {
case Initial() -> {
case Initial(long respondMsgId) -> {
if (serviceConfiguration.node() != 1) {
persistence.updateMessageState(respondMsgId, MqMessageState.ERR);
yield new Error("This actor can only run on node 1");
}
else {
yield new Run();
yield new Run(respondMsgId);
}
}
case Run() -> {
case Run(long respondMsgId) -> {
nsfwDomainFilter.fetchLists();
persistence.updateMessageState(respondMsgId, MqMessageState.OK);
yield new End();
}
default -> new Error();
@@ -43,11 +48,13 @@ public class UpdateNsfwFiltersActor extends RecordActorPrototype {
@Inject
public UpdateNsfwFiltersActor(Gson gson,
ServiceConfiguration serviceConfiguration,
NsfwDomainFilter nsfwDomainFilter)
NsfwDomainFilter nsfwDomainFilter,
MqPersistence persistence)
{
super(gson);
this.serviceConfiguration = serviceConfiguration;
this.nsfwDomainFilter = nsfwDomainFilter;
this.persistence = persistence;
}
}

View File

@@ -10,6 +10,7 @@ import nu.marginalia.actor.state.ActorStateInstance;
import nu.marginalia.actor.task.DownloadSampleActor;
import nu.marginalia.actor.task.RestoreBackupActor;
import nu.marginalia.actor.task.TriggerAdjacencyCalculationActor;
import nu.marginalia.actor.task.UpdateNsfwFiltersActor;
import nu.marginalia.functions.execution.api.*;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.service.server.DiscoverableService;
@@ -263,4 +264,19 @@ public class ExecutorGrpcService
System.exit(0);
}
@Override
public void updateNsfwFilters(RpcUpdateNsfwFilters request, StreamObserver<Empty> responseObserver) {
logger.info("Got request {}", request);
try {
actorControlService.startFrom(ExecutorActor.UPDATE_NSFW_LISTS,
new UpdateNsfwFiltersActor.Initial(request.getMsgId()));
responseObserver.onNext(Empty.getDefaultInstance());
responseObserver.onCompleted();
}
catch (Exception e) {
logger.error("Failed to update nsfw filters", e);
responseObserver.onError(e);
}
}
}
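Taken together with the ExecutorClient changes above, the message id acts as a tracking token across the gRPC boundary. A condensed sketch of the round trip; only sendNewMessage and updateMessageState appear in this changeset, the rest is the glue shown in the diffs:

    // Control side: create a tracking message in task-tracking[1] and trigger the update
    long msgId = executorClient.updateNsfwFilters();

    // Executor side (handler above): start the actor, threading the id through its steps
    actorControlService.startFrom(ExecutorActor.UPDATE_NSFW_LISTS,
            new UpdateNsfwFiltersActor.Initial(msgId));

    // Actor side: resolve the tracking message on completion
    persistence.updateMessageState(msgId, MqMessageState.OK); // or ERR if not on node 1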

View File

@@ -11,6 +11,7 @@ import nu.marginalia.service.discovery.property.ServicePartition;
import nu.marginalia.service.module.ServiceConfiguration;
import javax.annotation.CheckReturnValue;
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.List;
@@ -59,6 +60,11 @@ public class FeedsClient {
.forEachRemaining(rsp -> consumer.accept(rsp.getDomain(), new ArrayList<>(rsp.getUrlList())));
}
public boolean waitReady(Duration duration) throws InterruptedException {
return channelPool.awaitChannel(duration);
}
/** Get the hash of the feed data, for identifying when the data has been updated */
public String getFeedDataHash() {
return channelPool.call(FeedApiGrpc.FeedApiBlockingStub::getFeedDataHash)
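waitReady() is what the race-condition fix in commit fc686d8b2e hinges on: callers can now block until a channel to the feeds service is actually available, rather than crash on a cold boot. A minimal usage sketch, mirroring LiveCrawlerMain further down:

    // Block for up to an hour while the feeds service registers itself
    if (!feedsClient.waitReady(Duration.ofHours(1))) {
        throw new RuntimeException("Feeds client never became ready, cannot proceed");
    }
    // Only now is it safe to fetch
    feedsClient.getUpdatedDomains(cutoff, urlsPerDomain::put);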

View File

@@ -35,6 +35,7 @@ dependencies {
implementation libs.bundles.slf4j
implementation libs.commons.lang3
implementation libs.commons.io
implementation libs.httpclient
implementation libs.wiremock
implementation libs.prometheus

View File

@@ -20,19 +20,36 @@ import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;
import nu.marginalia.storage.model.FileStorageType;
import nu.marginalia.util.SimpleBlockingThreadPool;
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.config.ConnectionConfig;
import org.apache.hc.client5.http.config.RequestConfig;
import org.apache.hc.client5.http.cookie.StandardCookieSpec;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManagerBuilder;
import org.apache.hc.core5.http.Header;
import org.apache.hc.core5.http.HeaderElement;
import org.apache.hc.core5.http.HeaderElements;
import org.apache.hc.core5.http.HttpResponse;
import org.apache.hc.core5.http.io.SocketConfig;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
import org.apache.hc.core5.http.message.MessageSupport;
import org.apache.hc.core5.http.protocol.HttpContext;
import org.apache.hc.core5.util.TimeValue;
import org.apache.hc.core5.util.Timeout;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.sql.SQLException;
import java.time.*;
import java.time.Instant;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.util.concurrent.ExecutorService;
@@ -55,6 +72,8 @@ public class FeedFetcherService {
private final DomainCoordinator domainCoordinator;
private final HttpClient httpClient;
private volatile boolean updating;
@Inject
@@ -71,6 +90,83 @@ public class FeedFetcherService {
this.serviceHeartbeat = serviceHeartbeat;
this.executorClient = executorClient;
this.domainCoordinator = domainCoordinator;
final ConnectionConfig connectionConfig = ConnectionConfig.custom()
.setSocketTimeout(15, TimeUnit.SECONDS)
.setConnectTimeout(15, TimeUnit.SECONDS)
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
.build();
var connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
.setMaxConnPerRoute(2)
.setMaxConnTotal(50)
.setDefaultConnectionConfig(connectionConfig)
.build();
connectionManager.setDefaultSocketConfig(SocketConfig.custom()
.setSoLinger(TimeValue.ofSeconds(-1))
.setSoTimeout(Timeout.ofSeconds(10))
.build()
);
Thread.ofPlatform().daemon(true).start(() -> {
try {
for (;;) {
TimeUnit.SECONDS.sleep(15);
logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
}
}
catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
});
final RequestConfig defaultRequestConfig = RequestConfig.custom()
.setCookieSpec(StandardCookieSpec.IGNORE)
.setResponseTimeout(10, TimeUnit.SECONDS)
.setConnectionRequestTimeout(5, TimeUnit.MINUTES)
.build();
httpClient = HttpClients.custom()
.setDefaultRequestConfig(defaultRequestConfig)
.setConnectionManager(connectionManager)
.setUserAgent(WmsaHome.getUserAgent().uaIdentifier())
.setKeepAliveStrategy(new ConnectionKeepAliveStrategy() {
// Default keep-alive duration is 3 minutes, but this is too long for us,
// as we are either going to re-use it fairly quickly or close it for a long time.
//
// So we set it to 30 seconds or clamp the server-provided value to a minimum of 10 seconds.
private static final TimeValue defaultValue = TimeValue.ofSeconds(30);
@Override
public TimeValue getKeepAliveDuration(HttpResponse response, HttpContext context) {
final Iterator<HeaderElement> it = MessageSupport.iterate(response, HeaderElements.KEEP_ALIVE);
while (it.hasNext()) {
final HeaderElement he = it.next();
final String param = he.getName();
final String value = he.getValue();
if (value == null)
continue;
if (!"timeout".equalsIgnoreCase(param))
continue;
try {
long timeout = Long.parseLong(value);
timeout = Math.clamp(timeout, 30, defaultValue.toSeconds());
return TimeValue.ofSeconds(timeout);
} catch (final NumberFormatException ignore) {
break;
}
}
return defaultValue;
}
})
.build();
}
public enum UpdateMode {
@@ -86,13 +182,7 @@ public class FeedFetcherService {
try (FeedDbWriter writer = feedDb.createWriter();
HttpClient client = HttpClient.newBuilder()
.connectTimeout(Duration.ofSeconds(15))
.executor(Executors.newCachedThreadPool())
.followRedirects(HttpClient.Redirect.NORMAL)
.version(HttpClient.Version.HTTP_2)
.build();
ExecutorService fetchExecutor = Executors.newCachedThreadPool();
ExecutorService fetchExecutor = Executors.newVirtualThreadPerTaskExecutor();
FeedJournal feedJournal = FeedJournal.create();
var heartbeat = serviceHeartbeat.createServiceAdHocTaskHeartbeat("Update Rss Feeds")
) {
@@ -137,7 +227,8 @@ public class FeedFetcherService {
FetchResult feedData;
try (DomainLock domainLock = domainCoordinator.lockDomain(new EdgeDomain(feed.domain()))) {
feedData = fetchFeedData(feed, client, fetchExecutor, ifModifiedSinceDate, ifNoneMatchTag);
feedData = fetchFeedData(feed, fetchExecutor, ifModifiedSinceDate, ifNoneMatchTag);
TimeUnit.SECONDS.sleep(1); // Sleep before we yield the lock to avoid hammering the server from multiple processes
} catch (Exception ex) {
feedData = new FetchResult.TransientError();
}
@@ -216,7 +307,6 @@ public class FeedFetcherService {
}
private FetchResult fetchFeedData(FeedDefinition feed,
HttpClient client,
ExecutorService executorService,
@Nullable String ifModifiedSinceDate,
@Nullable String ifNoneMatchTag)
@@ -224,59 +314,63 @@ public class FeedFetcherService {
try {
URI uri = new URI(feed.feedUrl());
HttpRequest.Builder requestBuilder = HttpRequest.newBuilder()
.GET()
.uri(uri)
.header("User-Agent", WmsaHome.getUserAgent().uaIdentifier())
.header("Accept-Encoding", "gzip")
.header("Accept", "text/*, */*;q=0.9")
.timeout(Duration.ofSeconds(15))
;
var requestBuilder = ClassicRequestBuilder.get(uri)
.setHeader("User-Agent", WmsaHome.getUserAgent().uaIdentifier())
.setHeader("Accept-Encoding", "gzip")
.setHeader("Accept", "text/*, */*;q=0.9");
// Set the If-Modified-Since or If-None-Match headers if we have them.
// Due to certain idiosyncrasies in server implementations, we avoid setting
// both at the same time, as that may turn a 304 into a 200.
if (ifNoneMatchTag != null) {
requestBuilder.header("If-None-Match", ifNoneMatchTag);
requestBuilder.addHeader("If-None-Match", ifNoneMatchTag);
} else if (ifModifiedSinceDate != null) {
requestBuilder.header("If-Modified-Since", ifModifiedSinceDate);
requestBuilder.addHeader("If-Modified-Since", ifModifiedSinceDate);
}
return httpClient.execute(requestBuilder.build(), rsp -> {
try {
logger.info("Code: {}, URL: {}", rsp.getCode(), uri);
HttpRequest getRequest = requestBuilder.build();
switch (rsp.getCode()) {
case 200 -> {
if (rsp.getEntity() == null) {
return new FetchResult.TransientError(); // No content to read, treat as transient error
}
byte[] responseData = EntityUtils.toByteArray(rsp.getEntity());
for (int i = 0; i < 3; i++) {
// Decode the response body based on the Content-Type header
Header contentTypeHeader = rsp.getFirstHeader("Content-Type");
if (contentTypeHeader == null) {
return new FetchResult.TransientError();
}
String contentType = contentTypeHeader.getValue();
String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), responseData);
/* Note we need to use an executor to time-limit the send() method in HttpClient, as
* its support for timeouts only applies to the time until response starts to be received,
* and does not catch the case when the server starts to send data but then hangs.
*/
HttpResponse<byte[]> rs = executorService.submit(
() -> client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray()))
.get(15, TimeUnit.SECONDS);
// Grab the ETag header if it exists
Header etagHeader = rsp.getFirstHeader("ETag");
String newEtagValue = etagHeader == null ? null : etagHeader.getValue();
if (rs.statusCode() == 429) { // Too Many Requests
int retryAfter = Integer.parseInt(rs.headers().firstValue("Retry-After").orElse("2"));
Thread.sleep(Duration.ofSeconds(Math.clamp(retryAfter, 1, 5)));
continue;
}
String newEtagValue = rs.headers().firstValue("ETag").orElse("");
return switch (rs.statusCode()) {
case 200 -> {
byte[] responseData = getResponseData(rs);
String contentType = rs.headers().firstValue("Content-Type").orElse("");
String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), responseData);
yield new FetchResult.Success(bodyText, newEtagValue);
return new FetchResult.Success(bodyText, newEtagValue);
}
case 304 -> {
return new FetchResult.NotModified(); // via If-Modified-Since semantics
}
case 404 -> {
return new FetchResult.PermanentError(); // never try again
}
default -> {
return new FetchResult.TransientError(); // we try again later
}
}
case 304 -> new FetchResult.NotModified(); // via If-Modified-Since semantics
case 404 -> new FetchResult.PermanentError(); // never try again
default -> new FetchResult.TransientError(); // we try again later
};
}
}
catch (Exception ex) {
return new FetchResult.PermanentError(); // treat as permanent error
}
finally {
EntityUtils.consumeQuietly(rsp.getEntity());
}
});
}
catch (Exception ex) {
logger.debug("Error fetching feed", ex);
@@ -285,19 +379,6 @@ public class FeedFetcherService {
return new FetchResult.TransientError();
}
private byte[] getResponseData(HttpResponse<byte[]> response) throws IOException {
String encoding = response.headers().firstValue("Content-Encoding").orElse("");
if ("gzip".equals(encoding)) {
try (var stream = new GZIPInputStream(new ByteArrayInputStream(response.body()))) {
return stream.readAllBytes();
}
}
else {
return response.body();
}
}
public sealed interface FetchResult {
record Success(String value, String etag) implements FetchResult {}
record NotModified() implements FetchResult {}

View File

@@ -5,6 +5,8 @@ import com.google.inject.Guice;
import com.google.inject.name.Names;
import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.coordination.DomainCoordinator;
import nu.marginalia.coordination.LocalDomainCoordinator;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.rss.db.FeedDb;
import nu.marginalia.rss.model.FeedItems;
@@ -82,6 +84,7 @@ class FeedFetcherServiceTest extends AbstractModule {
}
public void configure() {
bind(DomainCoordinator.class).to(LocalDomainCoordinator.class);
bind(HikariDataSource.class).toInstance(dataSource);
bind(ServiceRegistryIf.class).toInstance(Mockito.mock(ServiceRegistryIf.class));
bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration(ServiceId.Executor, 1, "", "", 0, UUID.randomUUID()));

View File

@@ -50,6 +50,7 @@ dependencies {
implementation libs.notnull
implementation libs.guava
implementation libs.httpclient
implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava'
}

View File

@@ -15,6 +15,7 @@ import nu.marginalia.coordination.DomainCoordinator;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.db.DomainBlacklist;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.livecrawler.io.HttpClientProvider;
import nu.marginalia.loading.LoaderInputData;
import nu.marginalia.loading.documents.DocumentLoaderService;
import nu.marginalia.loading.documents.KeywordLoaderService;
@@ -32,12 +33,15 @@ import nu.marginalia.service.module.ServiceDiscoveryModule;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageBaseType;
import org.apache.commons.io.FileUtils;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.core5.io.CloseMode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.Security;
import java.time.Duration;
import java.time.Instant;
import java.time.temporal.ChronoUnit;
import java.util.HashMap;
@@ -74,7 +78,9 @@ public class LiveCrawlerMain extends ProcessMainClass {
DomainProcessor domainProcessor,
FileStorageService fileStorageService,
KeywordLoaderService keywordLoaderService,
DocumentLoaderService documentLoaderService, DomainCoordinator domainCoordinator, HikariDataSource dataSource)
DocumentLoaderService documentLoaderService,
DomainCoordinator domainCoordinator,
HikariDataSource dataSource)
throws Exception
{
super(messageQueueFactory, config, gson, LIVE_CRAWLER_INBOX);
@@ -148,7 +154,10 @@ public class LiveCrawlerMain extends ProcessMainClass {
}
private void run() throws Exception {
Path basePath = fileStorageService.getStorageBase(FileStorageBaseType.STORAGE).asPath().resolve("live-crawl-data");
Path basePath = fileStorageService
.getStorageBase(FileStorageBaseType.STORAGE)
.asPath()
.resolve("live-crawl-data");
if (!Files.isDirectory(basePath)) {
Files.createDirectories(basePath);
@@ -163,21 +172,38 @@ public class LiveCrawlerMain extends ProcessMainClass {
{
final Instant cutoff = Instant.now().minus(60, ChronoUnit.DAYS);
/* ------------------------------------------------ */
/* Fetch the latest domains from the feeds database */
/* ------------------------------------------------ */
processHeartbeat.progress(LiveCrawlState.FETCH_LINKS);
Map<String, List<String>> urlsPerDomain = new HashMap<>(10_000);
if (!feedsClient.waitReady(Duration.ofHours(1))) {
throw new RuntimeException("Feeds client never became ready, cannot proceed with live crawling");
}
feedsClient.getUpdatedDomains(cutoff, urlsPerDomain::put);
logger.info("Fetched data for {} domains", urlsPerDomain.size());
/* ------------------------------------- */
/* Prune the database from old entries */
/* ------------------------------------- */
processHeartbeat.progress(LiveCrawlState.PRUNE_DB);
// Remove data that is too old
dataSet.prune(cutoff);
/* ------------------------------------- */
/* Fetch the links for each domain */
/* ------------------------------------- */
processHeartbeat.progress(LiveCrawlState.CRAWLING);
try (SimpleLinkScraper fetcher = new SimpleLinkScraper(dataSet, domainCoordinator, domainQueries, domainBlacklist);
CloseableHttpClient client = HttpClientProvider.createClient();
try (SimpleLinkScraper fetcher = new SimpleLinkScraper(dataSet, domainCoordinator, domainQueries, client, domainBlacklist);
var hb = heartbeat.createAdHocTaskHeartbeat("Live Crawling"))
{
for (Map.Entry<String, List<String>> entry : hb.wrap("Fetching", urlsPerDomain.entrySet())) {
@@ -190,18 +216,29 @@ public class LiveCrawlerMain extends ProcessMainClass {
fetcher.scheduleRetrieval(domain, urls);
}
}
finally {
client.close(CloseMode.GRACEFUL);
}
Path tempPath = dataSet.createWorkDir();
try {
/* ------------------------------------- */
/* Process the fetched links */
/* ------------------------------------- */
processHeartbeat.progress(LiveCrawlState.PROCESSING);
try (var hb = heartbeat.createAdHocTaskHeartbeat("Processing");
var writer = new ConverterBatchWriter(tempPath, 0)
) {
// Offset the documents' ordinals toward the upper range, to avoid ID collisions with the
// main indexes (the maximum permissible value for a doc ordinal is 67_108_863, so this
// leaves us with a lot of headroom still)
// We need unique document ids that do not collide with the document ids from the main index,
// so we offset the documents' ordinals toward the upper range.
//
// The maximum permissible value for a doc ordinal is 67_108_863,
// so this leaves us with a lot of headroom still!
// Expected document count here is on the order of 10 :^)
writer.setOrdinalOffset(67_000_000);
for (SerializableCrawlDataStream stream : hb.wrap("Processing", dataSet.getDataStreams())) {
@@ -209,10 +246,15 @@ public class LiveCrawlerMain extends ProcessMainClass {
}
}
/* ---------------------------------------------- */
/* Load the processed data into the link database */
/* and construct an index journal for the docs */
/* ---------------------------------------------- */
processHeartbeat.progress(LiveCrawlState.LOADING);
LoaderInputData lid = new LoaderInputData(tempPath, 1);
DomainIdRegistry domainIdRegistry = new DbDomainIdRegistry(dataSource);
keywordLoaderService.loadKeywords(domainIdRegistry, heartbeat, lid);
@@ -224,9 +266,16 @@ public class LiveCrawlerMain extends ProcessMainClass {
FileUtils.deleteDirectory(tempPath.toFile());
}
// Construct the index
/* ------------------------------------- */
/* Finish up */
/* ------------------------------------- */
processHeartbeat.progress(LiveCrawlState.DONE);
// After we return from here, the LiveCrawlActor will trigger an index construction
// job. Unlike all the stuff we did in this process, it's identical to the real job
// so we don't need to do anything special from this process
}
}

View File

@@ -7,7 +7,6 @@ import nu.marginalia.contenttype.ContentType;
import nu.marginalia.contenttype.DocumentBodyToString;
import nu.marginalia.coordination.DomainCoordinator;
import nu.marginalia.coordination.DomainLock;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.db.DomainBlacklist;
@@ -15,24 +14,21 @@ import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.util.SimpleBlockingThreadPool;
import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.core5.http.ClassicHttpRequest;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.http.HttpClient;
import java.net.http.HttpHeaders;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
import java.util.zip.GZIPInputStream;
/** A simple link scraper that fetches URLs and stores them in a database,
* with no concept of a crawl frontier, WARC output, or other advanced features
@@ -45,20 +41,21 @@ public class SimpleLinkScraper implements AutoCloseable {
private final LiveCrawlDataSet dataSet;
private final DbDomainQueries domainQueries;
private final DomainBlacklist domainBlacklist;
private final Duration connectTimeout = Duration.ofSeconds(10);
private final Duration readTimeout = Duration.ofSeconds(10);
private final DomainCoordinator domainCoordinator;
private final static int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 10 * 1024 * 1024);
private final HttpClient httpClient;
public SimpleLinkScraper(LiveCrawlDataSet dataSet,
DomainCoordinator domainCoordinator,
DbDomainQueries domainQueries,
HttpClient httpClient,
DomainBlacklist domainBlacklist) {
this.dataSet = dataSet;
this.domainCoordinator = domainCoordinator;
this.domainQueries = domainQueries;
this.domainBlacklist = domainBlacklist;
this.httpClient = httpClient;
}
public void scheduleRetrieval(EdgeDomain domain, List<String> urls) {
@@ -75,17 +72,19 @@ public class SimpleLinkScraper implements AutoCloseable {
EdgeUrl rootUrl = domain.toRootUrlHttps();
List<EdgeUrl> relevantUrls = new ArrayList<>();
List<EdgeUrl> relevantUrls = new ArrayList<>(Math.max(1, urls.size()));
// Resolve absolute URLs
for (var url : urls) {
Optional<EdgeUrl> optParsedUrl = lp.parseLink(rootUrl, url);
if (optParsedUrl.isEmpty()) {
if (optParsedUrl.isEmpty())
continue;
}
if (dataSet.hasUrl(optParsedUrl.get())) {
continue;
}
relevantUrls.add(optParsedUrl.get());
EdgeUrl absoluteUrl = optParsedUrl.get();
if (!dataSet.hasUrl(absoluteUrl))
relevantUrls.add(absoluteUrl);
}
if (relevantUrls.isEmpty()) {
@@ -94,16 +93,10 @@ public class SimpleLinkScraper implements AutoCloseable {
int fetched = 0;
try (HttpClient client = HttpClient
.newBuilder()
.connectTimeout(connectTimeout)
.followRedirects(HttpClient.Redirect.NEVER)
.version(HttpClient.Version.HTTP_2)
.build();
// throttle concurrent access per domain; IDE will complain it's not used, but it holds a semaphore -- do not remove:
try (// throttle concurrent access per domain; IDE will complain it's not used, but it holds a semaphore -- do not remove:
DomainLock lock = domainCoordinator.lockDomain(domain)
) {
SimpleRobotRules rules = fetchRobotsRules(rootUrl, client);
SimpleRobotRules rules = fetchRobotsRules(rootUrl);
if (rules == null) { // I/O error fetching robots.txt
// If we can't fetch the robots.txt,
@@ -116,18 +109,19 @@ public class SimpleLinkScraper implements AutoCloseable {
CrawlDelayTimer timer = new CrawlDelayTimer(rules.getCrawlDelay());
for (var parsedUrl : relevantUrls) {
if (!rules.isAllowed(parsedUrl.toString())) {
maybeFlagAsBad(parsedUrl);
continue;
}
switch (fetchUrl(domainId, parsedUrl, timer, client)) {
switch (fetchUrl(domainId, parsedUrl, timer)) {
case FetchResult.Success(int id, EdgeUrl docUrl, String body, String headers) -> {
dataSet.saveDocument(id, docUrl, body, headers, "");
fetched++;
}
case FetchResult.Error(EdgeUrl docUrl) -> maybeFlagAsBad(docUrl);
case FetchResult.Error(EdgeUrl docUrl) -> {
maybeFlagAsBad(docUrl);
}
}
}
}
@@ -150,111 +144,107 @@ public class SimpleLinkScraper implements AutoCloseable {
}
@Nullable
private SimpleRobotRules fetchRobotsRules(EdgeUrl rootUrl, HttpClient client) throws IOException, InterruptedException, URISyntaxException {
var robotsRequest = HttpRequest.newBuilder(rootUrl.withPathAndParam("/robots.txt", null).asURI())
.GET()
.header("User-Agent", WmsaHome.getUserAgent().uaString())
.header("Accept-Encoding","gzip")
.timeout(readTimeout);
// Fetch the robots.txt
private SimpleRobotRules fetchRobotsRules(EdgeUrl rootUrl) throws URISyntaxException {
ClassicHttpRequest request = ClassicRequestBuilder.get(rootUrl.withPathAndParam("/robots.txt", null).asURI())
.setHeader("User-Agent", WmsaHome.getUserAgent().uaString())
.setHeader("Accept-Encoding", "gzip")
.build();
try {
SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
HttpResponse<byte[]> robotsTxt = client.send(robotsRequest.build(), HttpResponse.BodyHandlers.ofByteArray());
if (robotsTxt.statusCode() == 200) {
return parser.parseContent(rootUrl.toString(),
getResponseData(robotsTxt),
robotsTxt.headers().firstValue("Content-Type").orElse("text/plain"),
WmsaHome.getUserAgent().uaIdentifier());
return httpClient.execute(request, rsp -> {
if (rsp.getEntity() == null) {
return null;
}
try {
if (rsp.getCode() == 200) {
var contentTypeHeader = rsp.getFirstHeader("Content-Type");
if (contentTypeHeader == null) {
return null; // No content type header, can't parse
}
return new SimpleRobotRulesParser().parseContent(
rootUrl.toString(),
EntityUtils.toByteArray(rsp.getEntity()),
contentTypeHeader.getValue(),
WmsaHome.getUserAgent().uaIdentifier()
);
} else if (rsp.getCode() == 404) {
return new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL);
}
} finally {
EntityUtils.consumeQuietly(rsp.getEntity());
}
return null;
});
}
catch (IOException e) {
logger.error("Error fetching robots.txt for {}: {}", rootUrl, e.getMessage());
return null; // I/O error fetching robots.txt
}
finally {
try {
TimeUnit.SECONDS.sleep(1);
}
else if (robotsTxt.statusCode() == 404) {
return new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL);
catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new RuntimeException(e);
}
}
catch (IOException ex) {
logger.error("Error fetching robots.txt for {}: {} {}", rootUrl, ex.getClass().getSimpleName(), ex.getMessage());
}
return null;
}
/** Fetch a URL and store it in the database
*/
private FetchResult fetchUrl(int domainId, EdgeUrl parsedUrl, CrawlDelayTimer timer, HttpClient client) throws Exception {
private FetchResult fetchUrl(int domainId, EdgeUrl parsedUrl, CrawlDelayTimer timer) throws Exception {
timer.waitFetchDelay();
HttpRequest request = HttpRequest.newBuilder(parsedUrl.asURI())
.GET()
.header("User-Agent", WmsaHome.getUserAgent().uaString())
.header("Accept", "text/html")
.header("Accept-Encoding", "gzip")
.timeout(readTimeout)
ClassicHttpRequest request = ClassicRequestBuilder.get(parsedUrl.asURI())
.setHeader("User-Agent", WmsaHome.getUserAgent().uaString())
.setHeader("Accept", "text/html")
.setHeader("Accept-Encoding", "gzip")
.build();
try {
HttpResponse<byte[]> response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());
return httpClient.execute(request, rsp -> {
try {
if (rsp.getCode() == 200) {
String contentType = rsp.getFirstHeader("Content-Type").getValue();
if (!contentType.toLowerCase().startsWith("text/html")) {
return new FetchResult.Error(parsedUrl);
}
// Handle rate limiting by waiting and retrying once
if (response.statusCode() == 429) {
timer.waitRetryDelay(new HttpFetcherImpl.RateLimitException(
response.headers().firstValue("Retry-After").orElse("5")
));
response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());
}
byte[] body = EntityUtils.toByteArray(rsp.getEntity(), MAX_SIZE);
String contentType = response.headers().firstValue("Content-Type").orElse("").toLowerCase();
String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), body);
if (response.statusCode() == 200) {
if (!contentType.toLowerCase().startsWith("text/html")) {
return new FetchResult.Error(parsedUrl);
StringBuilder headersStr = new StringBuilder();
for (var header : rsp.getHeaders()) {
headersStr.append(header.getName()).append(": ").append(header.getValue()).append("\n");
}
return new FetchResult.Success(domainId, parsedUrl, bodyText, headersStr.toString());
}
} finally {
if (rsp.getEntity() != null) {
EntityUtils.consumeQuietly(rsp.getEntity());
}
}
byte[] body = getResponseData(response);
if (body.length > MAX_SIZE) {
return new FetchResult.Error(parsedUrl);
}
String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), body);
return new FetchResult.Success(domainId, parsedUrl, bodyText, headersToString(response.headers()));
}
return new FetchResult.Error(parsedUrl);
});
}
catch (IOException ex) {
// We don't want a full stack trace on every error, as it's quite common and very noisy
logger.error("Error fetching URL {}: {} {}", parsedUrl, ex.getClass().getSimpleName(), ex.getMessage());
catch (IOException e) {
logger.error("Error fetching {}: {}", parsedUrl, e.getMessage());
// If we can't fetch the URL, we return an error result
// so that the caller can decide what to do with it.
}
finally {
timer.waitFetchDelay();
}
return new FetchResult.Error(parsedUrl);
}
private byte[] getResponseData(HttpResponse<byte[]> response) throws IOException {
String encoding = response.headers().firstValue("Content-Encoding").orElse("");
if ("gzip".equals(encoding)) {
try (var stream = new GZIPInputStream(new ByteArrayInputStream(response.body()))) {
return stream.readAllBytes();
}
}
else {
return response.body();
}
}
sealed interface FetchResult {
record Success(int domainId, EdgeUrl url, String body, String headers) implements FetchResult {}
record Error(EdgeUrl url) implements FetchResult {}
}
private String headersToString(HttpHeaders headers) {
StringBuilder headersStr = new StringBuilder();
headers.map().forEach((k, v) -> {
headersStr.append(k).append(": ").append(v).append("\n");
});
return headersStr.toString();
}
@Override
public void close() throws Exception {
pool.shutDown();

View File

@@ -0,0 +1,126 @@
package nu.marginalia.livecrawler.io;
import com.google.inject.Provider;
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.config.ConnectionConfig;
import org.apache.hc.client5.http.config.RequestConfig;
import org.apache.hc.client5.http.cookie.StandardCookieSpec;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager;
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManagerBuilder;
import org.apache.hc.core5.http.HeaderElement;
import org.apache.hc.core5.http.HeaderElements;
import org.apache.hc.core5.http.HttpResponse;
import org.apache.hc.core5.http.io.SocketConfig;
import org.apache.hc.core5.http.message.MessageSupport;
import org.apache.hc.core5.http.protocol.HttpContext;
import org.apache.hc.core5.util.TimeValue;
import org.apache.hc.core5.util.Timeout;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.util.Iterator;
import java.util.concurrent.TimeUnit;
public class HttpClientProvider implements Provider<HttpClient> {
private static final HttpClient client;
private static PoolingHttpClientConnectionManager connectionManager;
private static final Logger logger = LoggerFactory.getLogger(HttpClientProvider.class);
static {
try {
client = createClient();
} catch (Exception e) {
throw new RuntimeException(e);
}
}
public static CloseableHttpClient createClient() throws NoSuchAlgorithmException, KeyManagementException {
final ConnectionConfig connectionConfig = ConnectionConfig.custom()
.setSocketTimeout(15, TimeUnit.SECONDS)
.setConnectTimeout(15, TimeUnit.SECONDS)
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
.build();
connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
.setMaxConnPerRoute(2)
.setMaxConnTotal(50)
.setDefaultConnectionConfig(connectionConfig)
.build();
connectionManager.setDefaultSocketConfig(SocketConfig.custom()
.setSoLinger(TimeValue.ofSeconds(-1))
.setSoTimeout(Timeout.ofSeconds(10))
.build()
);
Thread.ofPlatform().daemon(true).start(() -> {
try {
for (;;) {
TimeUnit.SECONDS.sleep(15);
logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
}
}
catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
});
final RequestConfig defaultRequestConfig = RequestConfig.custom()
.setCookieSpec(StandardCookieSpec.IGNORE)
.setResponseTimeout(10, TimeUnit.SECONDS)
.setConnectionRequestTimeout(5, TimeUnit.MINUTES)
.build();
return HttpClients.custom()
.setConnectionManager(connectionManager)
.setRetryStrategy(new RetryStrategy())
.setKeepAliveStrategy(new ConnectionKeepAliveStrategy() {
// Default keep-alive duration is 3 minutes, but this is too long for us,
// as we are either going to re-use it fairly quickly or close it for a long time.
//
// So we set it to 30 seconds or clamp the server-provided value to a minimum of 10 seconds.
private static final TimeValue defaultValue = TimeValue.ofSeconds(30);
@Override
public TimeValue getKeepAliveDuration(HttpResponse response, HttpContext context) {
final Iterator<HeaderElement> it = MessageSupport.iterate(response, HeaderElements.KEEP_ALIVE);
while (it.hasNext()) {
final HeaderElement he = it.next();
final String param = he.getName();
final String value = he.getValue();
if (value == null)
continue;
if (!"timeout".equalsIgnoreCase(param))
continue;
try {
long timeout = Long.parseLong(value);
timeout = Math.clamp(timeout, 30, defaultValue.toSeconds());
return TimeValue.ofSeconds(timeout);
} catch (final NumberFormatException ignore) {
break;
}
}
return defaultValue;
}
})
.disableRedirectHandling()
.setDefaultRequestConfig(defaultRequestConfig)
.build();
}
@Override
public HttpClient get() {
return client;
}
}
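The provider doubles as a plain factory: LiveCrawlerMain and the tests below call createClient() directly rather than going through Guice. A standalone usage sketch (the URL is illustrative; checked exceptions are assumed to propagate):

    CloseableHttpClient client = HttpClientProvider.createClient();
    try {
        // The response-handler form releases the connection back to the pool on return
        String body = client.execute(
                ClassicRequestBuilder.get("https://www.marginalia.nu/").build(),
                rsp -> EntityUtils.toString(rsp.getEntity()));
        System.out.println(body.length());
    } finally {
        client.close(CloseMode.GRACEFUL);
    }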

View File

@@ -0,0 +1,79 @@
package nu.marginalia.livecrawler.io;
import org.apache.hc.client5.http.HttpHostConnectException;
import org.apache.hc.client5.http.HttpRequestRetryStrategy;
import org.apache.hc.core5.http.HttpRequest;
import org.apache.hc.core5.http.HttpResponse;
import org.apache.hc.core5.http.protocol.HttpContext;
import org.apache.hc.core5.util.TimeValue;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.net.ssl.SSLException;
import java.io.IOException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.UnknownHostException;
public class RetryStrategy implements HttpRequestRetryStrategy {
private static final Logger logger = LoggerFactory.getLogger(RetryStrategy.class);
@Override
public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
return switch (exception) {
case SocketTimeoutException ste -> false;
case SSLException ssle -> false;
case UnknownHostException uhe -> false;
case HttpHostConnectException ex -> executionCount < 2;
case SocketException ex -> executionCount < 2;
default -> executionCount <= 3;
};
}
@Override
public boolean retryRequest(HttpResponse response, int executionCount, HttpContext context) {
return switch (response.getCode()) {
case 500, 503 -> executionCount <= 2;
case 429 -> executionCount <= 3;
default -> false;
};
}
@Override
public TimeValue getRetryInterval(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
return TimeValue.ofSeconds(1);
}
@Override
public TimeValue getRetryInterval(HttpResponse response, int executionCount, HttpContext context) {
int statusCode = response.getCode();
// Give 503 a bit more time
if (statusCode == 503) return TimeValue.ofSeconds(5);
if (statusCode == 429) {
// get the Retry-After header
var retryAfterHeader = response.getFirstHeader("Retry-After");
if (retryAfterHeader == null) {
return TimeValue.ofSeconds(3);
}
String retryAfter = retryAfterHeader.getValue();
if (retryAfter == null) {
return TimeValue.ofSeconds(2);
}
try {
int retryAfterTime = Integer.parseInt(retryAfter);
retryAfterTime = Math.clamp(retryAfterTime, 1, 5);
return TimeValue.ofSeconds(retryAfterTime);
} catch (NumberFormatException e) {
logger.warn("Invalid Retry-After header: {}", retryAfter);
}
}
return TimeValue.ofSeconds(2);
}
}
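Wired in via setRetryStrategy(new RetryStrategy()) in HttpClientProvider above, the policy applies to every request made through the shared client. A quick sketch exercising it in isolation (BasicHttpResponse is core5's plain response class; this snippet is not part of the changeset):

    var strategy = new RetryStrategy();
    var tooManyRequests = new BasicHttpResponse(429);
    tooManyRequests.addHeader("Retry-After", "3");

    // One failed attempt so far: a 429 is retried (up to three times)...
    assert strategy.retryRequest(tooManyRequests, 1, null);
    // ...after honouring Retry-After, clamped to the 1..5 second range
    assert strategy.getRetryInterval(tooManyRequests, 1, null).toSeconds() == 3;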

View File

@@ -3,10 +3,13 @@ package nu.marginalia.livecrawler;
import nu.marginalia.coordination.LocalDomainCoordinator;
import nu.marginalia.db.DomainBlacklistImpl;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.livecrawler.io.HttpClientProvider;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawldata.CrawledDocument;
import org.apache.commons.io.FileUtils;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.core5.io.CloseMode;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
@@ -16,29 +19,34 @@ import org.mockito.Mockito;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.sql.SQLException;
import java.util.List;
class SimpleLinkScraperTest {
private Path tempDir;
private LiveCrawlDataSet dataSet;
private CloseableHttpClient httpClient;
@BeforeEach
public void setUp() throws IOException, SQLException {
public void setUp() throws IOException, SQLException, NoSuchAlgorithmException, KeyManagementException {
tempDir = Files.createTempDirectory(getClass().getSimpleName());
dataSet = new LiveCrawlDataSet(tempDir);
httpClient = HttpClientProvider.createClient();
}
@AfterEach
public void tearDown() throws Exception {
dataSet.close();
httpClient.close(CloseMode.IMMEDIATE);
FileUtils.deleteDirectory(tempDir.toFile());
}
@Test
public void testRetrieveNow() throws Exception {
var scraper = new SimpleLinkScraper(dataSet, new LocalDomainCoordinator(), null, Mockito.mock(DomainBlacklistImpl.class));
var scraper = new SimpleLinkScraper(dataSet, new LocalDomainCoordinator(), null, httpClient, Mockito.mock(DomainBlacklistImpl.class));
int fetched = scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));
Assertions.assertEquals(1, fetched);
@@ -58,7 +66,7 @@ class SimpleLinkScraperTest {
@Test
public void testRetrieveNow_Redundant() throws Exception {
dataSet.saveDocument(1, new EdgeUrl("https://www.marginalia.nu/"), "<html>", "", "127.0.0.1");
var scraper = new SimpleLinkScraper(dataSet, new LocalDomainCoordinator(),null, Mockito.mock(DomainBlacklistImpl.class));
var scraper = new SimpleLinkScraper(dataSet, new LocalDomainCoordinator(),null, httpClient, Mockito.mock(DomainBlacklistImpl.class));
// If the requested URL is already in the dataSet, we retrieveNow should shortcircuit and not fetch anything
int fetched = scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));

View File

@@ -0,0 +1,12 @@
The new domain process (NDP) evaluates new domains for
inclusion in the search engine index.
It visits the root document of each candidate domain, ensures that it's reachable,
verifies that the response is valid HTML, and checks a few factors such as document
length and link count before deciding whether to assign the domain to a node.
New domains are assigned to the node with the fewest domains already assigned.
The NDP process is triggered with a target number of domains, and
will keep finding domains until that target is reached. If, for example, the goal
is 100 and 50 domains are already in the index, it will find 50 more.
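A rough sketch of the assignment rule, modelled on the DomainNodeAllocator change in the next diff (countDomains is a hypothetical stand-in for the allocator's internal bookkeeping):

    // Collect the nodes that may receive new domains...
    List<Integer> viableNodes = new ArrayList<>();
    for (var node : nodeConfigurationService.getAll()) {
        if (node.disabled())
            continue;
        if (!node.autoAssignDomains()) // the new opt-out toggle
            continue;
        if (node.profile().permitBatchCrawl())
            viableNodes.add(node.node());
    }
    // ...and pick the one with the fewest domains already assigned
    int target = viableNodes.stream()
            .min(Comparator.comparingInt(this::countDomains))
            .orElseThrow();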

View File

@@ -32,6 +32,8 @@ dependencies {
implementation project(':code:libraries:message-queue')
implementation project(':code:libraries:blocking-thread-pool')
implementation project(':code:functions:link-graph:api')
implementation project(':code:processes:process-mq-api')
implementation project(':code:processes:crawling-process:ft-content-type')
implementation project(':code:processes:crawling-process:ft-link-parser')

View File

@@ -91,6 +91,9 @@ public class DomainNodeAllocator {
for (var node : nodeConfigurationService.getAll()) {
if (node.disabled())
continue;
if (!node.autoAssignDomains())
continue;
if (node.profile().permitBatchCrawl())
viableNodes.add(node.node());
}

View File

@@ -2,11 +2,17 @@ package nu.marginalia.ndp;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import it.unimi.dsi.fastutil.ints.Int2IntMap;
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
import nu.marginalia.ndp.model.DomainToTest;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.Connection;
import java.sql.ResultSet;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
@@ -22,11 +28,15 @@ public class DomainTestingQueue {
private final ConcurrentHashMap<String, Boolean> takenDomains = new ConcurrentHashMap<>();
private final HikariDataSource dataSource;
private final AggregateLinkGraphClient linkGraphClient;
@Inject
public DomainTestingQueue(HikariDataSource dataSource) {
public DomainTestingQueue(HikariDataSource dataSource,
AggregateLinkGraphClient linkGraphClient
) {
this.dataSource = dataSource;
this.linkGraphClient = linkGraphClient;
Thread.ofPlatform()
.name("DomainTestingQueue::fetch()")
@@ -44,9 +54,10 @@ public class DomainTestingQueue {
SET STATE='ACCEPTED'
WHERE DOMAIN_ID=?
""");
var assigNodeStmt = conn.prepareStatement("""
var assignNodeStmt = conn.prepareStatement("""
UPDATE EC_DOMAIN SET NODE_AFFINITY=?
WHERE ID=?
AND EC_DOMAIN.NODE_AFFINITY < 0
""")
)
{
@@ -54,9 +65,9 @@ public class DomainTestingQueue {
flagOkStmt.setInt(1, domain.domainId());
flagOkStmt.executeUpdate();
assigNodeStmt.setInt(1, nodeId);
assigNodeStmt.setInt(2, domain.domainId());
assigNodeStmt.executeUpdate();
assignNodeStmt.setInt(1, nodeId);
assignNodeStmt.setInt(2, domain.domainId());
assignNodeStmt.executeUpdate();
conn.commit();
} catch (Exception e) {
throw new RuntimeException("Failed to accept domain in database", e);
@@ -106,9 +117,14 @@ public class DomainTestingQueue {
}
if (domains.isEmpty()) {
refreshQueue(conn);
if (!refreshQueue(conn)) {
throw new RuntimeException("No new domains found, aborting!");
}
}
}
catch (RuntimeException e) {
throw e; // Rethrow runtime exceptions to avoid wrapping them in another runtime exception
}
catch (Exception e) {
throw new RuntimeException("Failed to fetch domains from database", e);
}
@@ -125,25 +141,100 @@ public class DomainTestingQueue {
}
}
private void refreshQueue(Connection conn) {
private boolean refreshQueue(Connection conn) {
logger.info("Refreshing domain queue in database");
try (var stmt = conn.createStatement()) {
conn.setAutoCommit(false);
logger.info("Revitalizing rejected domains");
// Revitalize rejected domains
stmt.executeUpdate("""
UPDATE NDP_NEW_DOMAINS
SET STATE='NEW'
WHERE NDP_NEW_DOMAINS.STATE = 'REJECTED'
AND DATE_ADD(TS_CHANGE, INTERVAL CHECK_COUNT DAY) > NOW()
""");
conn.commit();
Int2IntMap domainIdToCount = new Int2IntOpenHashMap();
// Load known domain IDs from the database to avoid inserting duplicates from NDP_NEW_DOMAINS
// or domains that are already assigned to a node
{
IntOpenHashSet knownIds = new IntOpenHashSet();
try (var stmt = conn.createStatement()) {
ResultSet rs = stmt.executeQuery("SELECT DOMAIN_ID FROM NDP_NEW_DOMAINS");
rs.setFetchSize(10_000);
while (rs.next()) {
int domainId = rs.getInt("DOMAIN_ID");
knownIds.add(domainId);
}
rs = stmt.executeQuery("SELECT ID FROM EC_DOMAIN WHERE NODE_AFFINITY>=0");
rs.setFetchSize(10_000);
while (rs.next()) {
int domainId = rs.getInt("ID");
knownIds.add(domainId);
}
} catch (Exception e) {
throw new RuntimeException("Failed to load known domain IDs from database", e);
}
// Ensure the link graph is ready before proceeding. This is mainly necessary in a cold reboot
// of the entire system.
try {
logger.info("Waiting for link graph client to be ready...");
linkGraphClient.waitReady(Duration.ofHours(1));
logger.info("Link graph client is ready, fetching domain links...");
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
// Fetch all domain links from the link graph and count by how many sources each dest domain is linked from
var iter = linkGraphClient.getAllDomainLinks().iterator();
while (iter.advance()) {
int dest = iter.dest();
if (!knownIds.contains(dest)) {
domainIdToCount.mergeInt(dest, 1, (i, j) -> i + j);
}
}
}
boolean didInsert = false;
/* Insert new domains into NDP_NEW_DOMAINS table */
try (var insertStmt = conn.prepareStatement("""
INSERT IGNORE INTO NDP_NEW_DOMAINS (DOMAIN_ID, PRIORITY) VALUES (?, ?)
""")) {
conn.setAutoCommit(false);
int cnt = 0;
for (var entry : domainIdToCount.int2IntEntrySet()) {
int domainId = entry.getIntKey();
int count = entry.getIntValue();
insertStmt.setInt(1, domainId);
insertStmt.setInt(2, count);
insertStmt.addBatch();
if (++cnt >= 1000) {
cnt = 0;
insertStmt.executeBatch(); // Execute in batches to avoid memory issues
conn.commit();
didInsert = true;
}
}
if (cnt != 0) {
insertStmt.executeBatch(); // Execute any remaining batch
conn.commit();
didInsert = true;
}
logger.info("Queue refreshed successfully");
} catch (Exception e) {
throw new RuntimeException("Failed to refresh queue in database", e);
}
// Clean up NDP_NEW_DOMAINS table to remove any domains that are already in EC_DOMAIN
// This acts not only to clean up domains that we've flagged as ACCEPTED, but also to
// repair inconsistent states where domains might have incorrectly been added to NDP_NEW_DOMAINS
try (var stmt = conn.createStatement()) {
stmt.executeUpdate("DELETE FROM NDP_NEW_DOMAINS WHERE DOMAIN_ID IN (SELECT ID FROM EC_DOMAIN WHERE NODE_AFFINITY>=0)");
}
catch (Exception e) {
throw new RuntimeException("Failed to clean up NDP_NEW_DOMAINS", e);
}
return didInsert;
}
}


@@ -28,6 +28,7 @@ the data generated by the loader.
## 5. Other Processes
* Ping Process: The [ping-process](ping-process/) keeps track of the aliveness of websites, gathering fingerprint information about the security posture of the website, as well as DNS information.
* New Domain Process (NDP): The [new-domain-process](new-domain-process/) evaluates new domains for inclusion in the search engine index.
* Live-Crawling Process: The [live-crawling-process](live-crawling-process/) crawls websites in real time based on RSS feeds, updating a smaller index with the latest content.
## Overview


@@ -20,6 +20,6 @@ public class StatusModule extends AbstractModule {
bind(String.class)
.annotatedWith(Names.named("searchEngineTestQuery"))
.toInstance(System.getProperty("status-service.public-query",
"https://marginalia-search.com/search?query=plato&ref=marginalia-automatic-metrics"));
"https://old-search.marginalia.nu/search?query=plato&ref=marginalia-automatic-metrics"));
}
}


@@ -280,6 +280,7 @@ public class ControlNodeService {
"on".equalsIgnoreCase(request.queryParams("autoClean")),
"on".equalsIgnoreCase(request.queryParams("includeInPrecession")),
"on".equalsIgnoreCase(request.queryParams("keepWarcs")),
"on".equalsIgnoreCase(request.queryParams("autoAssignDomains")),
NodeProfile.valueOf(request.queryParams("profile")),
"on".equalsIgnoreCase(request.queryParams("disabled"))
);


@@ -74,6 +74,8 @@ public class ControlSysActionsService {
Spark.post("/actions/recrawl-all", this::recrawlAll, Redirects.redirectToOverview);
Spark.post("/actions/flush-api-caches", this::flushApiCaches, Redirects.redirectToOverview);
Spark.post("/actions/reload-blogs-list", this::reloadBlogsList, Redirects.redirectToOverview);
Spark.post("/actions/update-nsfw-filters", this::updateNsfwFilters, Redirects.redirectToOverview);
}
catch (Exception e) {
throw new RuntimeException(e);
@@ -132,6 +134,14 @@ public class ControlSysActionsService {
return "";
}
public Object updateNsfwFilters(Request request, Response response) throws Exception {
eventLog.logEvent("USER-ACTION", "UPDATE-NSFW-FILTERS");
executorClient.updateNsfwFilters();
return "";
}
public Object flushApiCaches(Request request, Response response) throws Exception {
eventLog.logEvent("USER-ACTION", "FLUSH-API-CACHES");
apiOutbox.sendNotice("FLUSH_CACHES", "");


@@ -66,13 +66,23 @@
</div>
</div>
<div class="form-check form-switch">
<input class="form-check-input" type="checkbox" role="switch" name="autoAssignDomains" {{#if config.autoAssignDomains}}checked{{/if}}>
<label class="form-check-label" for="autoClean">Auto-Assign Domains</label>
<div class="form-text">If true, the New Domain Process will assign new domains to this node and all other nodes with this setting enabled.
This is the default behavior, but can be overridden if you want one node with a specific manual domain assignment.
</div>
</div>
<!-- This is not currently used, but may be in the future
<div class="form-check form-switch">
<input class="form-check-input" type="checkbox" role="switch" name="includeInPrecession" {{#if config.includeInPrecession}}checked{{/if}}>
<label class="form-check-label" for="includeInPrecession">Include in crawling precession</label>
<div class="form-text">If true, this node will be included in the crawling precession.</div>
</div>
-->
<div class="form-check form-switch">
<input class="form-check-input" type="checkbox" role="switch" name="keepWarcs" {{#if config.keepWarcs}}checked{{/if}}>
<label class="form-check-label" for="includeInPrecession">Keep WARC files during crawling</label>


@@ -13,14 +13,23 @@
{{#unless node.profile.realtime}}
<li class="nav-item dropdown">
<a class="nav-link dropdown-toggle {{#if tab.actions}}active{{/if}}" data-bs-toggle="dropdown" href="#" role="button" aria-expanded="false">Actions</a>
{{#if node.profile.permitBatchCrawl}}
<ul class="dropdown-menu">
{{#if node.profile.permitBatchCrawl}}
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=new-crawl">New Crawl</a></li>
<li><hr class="dropdown-divider"></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=process">Process Crawl Data</a></li>
{{/if}}
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=load">Load Processed Data</a></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=repartition">Repartition Index</a></li>
<li><hr class="dropdown-divider"></li>
{{#if node.profile.permitSideload}}
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=sideload-encyclopedia">Sideload Encyclopedia</a></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=sideload-stackexchange">Sideload Stackexchange</a></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=sideload-warc">Sideload WARC Files</a></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=sideload-dirtree">Sideload Dirtree</a></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=sideload-reddit">Sideload Reddit</a></li>
<li><hr class="dropdown-divider"></li>
{{/if}}
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=download-sample-data">Download Sample Crawl Data</a></li>
<li><hr class="dropdown-divider"></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=export-db-data">Export Database Data</a></li>
@@ -30,19 +39,6 @@
<li><hr class="dropdown-divider"></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=restore-backup">Restore Index Backup</a></li>
</ul>
{{/if}}
{{#if node.profile.permitSideload}}
<ul class="dropdown-menu">
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=sideload-encyclopedia">Sideload Encyclopedia</a></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=sideload-stackexchange">Sideload Stackexchange</a></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=sideload-warc">Sideload WARC Files</a></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=sideload-dirtree">Sideload Dirtree</a></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=sideload-reddit">Sideload Reddit</a></li>
<li><hr class="dropdown-divider"></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=load">Load Processed Data</a></li>
<li><a class="dropdown-item" href="/nodes/{{node.id}}/actions?view=restore-backup">Restore Index Backup</a></li>
</ul>
{{/if}}
</li>
{{/unless}}
<li class="nav-item">


@@ -53,6 +53,31 @@
</div>
</div>
<div class="accordion-item">
<h2 class="accordion-header">
<button class="accordion-button collapsed"
type="button"
data-bs-toggle="collapse"
data-bs-target="#collapseNsfwFilters"
aria-expanded="false"
aria-controls="collapseNsfwFilters">
Update NSFW Filter Definitions
</button>
</h2>
<div id="collapseNsfwFilters" class="accordion-collapse collapse p-3" data-bs-parent="#accordionActions">
<div class="mb-3">
This will fetch the latest NSFW filter definitions.
</div>
<form method="post" action="actions/update-nsfw-filters">
<button
class="btn btn-primary me-md-2"
onclick="return confirm('Confirm update NSFW filters');"
type="submit">
Update NSFW Filters</button>
</form>
</div>
</div>
<div class="accordion-item">
<h2 class="accordion-header">
<button class="accordion-button collapsed"