mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 17:32:39 +02:00

Compare commits


51 Commits

Author SHA1 Message Date
Viktor Lofgren
185bf28fca (crawler) Correct issue leading to parquet files not being correctly preconverted
Path.endsWith("str") != String.endsWith(".str")
2025-03-10 13:48:12 +01:00
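A minimal sketch of the distinction the commit message refers to (the file name below is made up): Path.endsWith() compares whole path name elements, while String.endsWith() is a plain suffix check, so extension checks need to go through toString().

import java.nio.file.Path;

class PathSuffixDemo {
    public static void main(String[] args) {
        Path file = Path.of("crawl-data", "example.parquet");

        // false: ".parquet" is not a trailing path element of this path
        System.out.println(file.endsWith(".parquet"));

        // true: plain string suffix comparison on the textual form
        System.out.println(file.toString().endsWith(".parquet"));
    }
}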
Viktor Lofgren
78cc25584a (crawler) Add error logging when entering bad path for historical crawl data 2025-03-10 13:38:40 +01:00
Viktor Lofgren
62ba30bacf (common) Log info about metrics server 2025-03-10 13:12:39 +01:00
Viktor Lofgren
3bb84eb206 (common) Log info about metrics server 2025-03-10 13:03:48 +01:00
Viktor Lofgren
be7d13ccce (crawler) Correct task execution logic in crawler
The old behavior would flag domains as pending too soon, leading to them being omitted from execution if they were not immediately available to run.
2025-03-09 13:47:51 +01:00
Viktor Lofgren
8c088a7c0b (crawler) Remove custom thread factory
This was causing issues and wasn't providing much benefit.
2025-03-09 11:50:52 +01:00
Viktor Lofgren
ea9a642b9b (crawler) More effective task scheduling in the crawler
This should hopefully allow more threads to be busy
2025-03-09 11:44:59 +01:00
Viktor Lofgren
27f528af6a (search) Fix "Remove Javascript" toggle
A bug was introduced at some point when the special keyword for filtering on javascript was changed from js:true/js:false to special:scripts.

Solves issue #155
2025-02-28 12:03:04 +01:00
Viktor Lofgren
20ca41ec95 (processed model) Use String columns instead of Txt columns for SlopDocumentRecord
It's very likely TxtStringColumn is the culprit of the bug seen in https://github.com/MarginaliaSearch/MarginaliaSearch/issues/154 where the wrong URL was shown for a search result.
2025-02-24 11:41:51 +01:00
Viktor Lofgren
7671f0d9e4 (search) Display message when no search results are found 2025-02-24 11:15:55 +01:00
Viktor Lofgren
44d6bc71b7 (assistant) Migrate to Jooby framework 2025-02-15 13:28:12 +01:00
Viktor Lofgren
9d302e2973 (assistant) Migrate to Jooby framework 2025-02-15 13:26:04 +01:00
Viktor Lofgren
f553701224 (assistant) Migrate to Jooby framework 2025-02-15 13:21:48 +01:00
Viktor Lofgren
f076d05595 (deps) Upgrade slf4j to latest 2025-02-15 12:50:16 +01:00
Viktor Lofgren
b513809710 (*) Stopgap fix for metrics server initialization errors bringing down services 2025-02-14 17:09:48 +01:00
Viktor Lofgren
7519b28e21 (search) Correct exception from misbehaving bots feeding invalid urls 2025-02-14 17:05:24 +01:00
Viktor Lofgren
3eac4dd57f (search) Correct exception in error handler when page is missing 2025-02-14 17:00:21 +01:00
Viktor Lofgren
4c2810720a (search) Add redirect handler for full URLs in the /site endpoint 2025-02-14 16:31:11 +01:00
Viktor Lofgren
8480ba8daa (live-capture) Code cleanup 2025-02-04 14:05:36 +01:00
Viktor Lofgren
fbba392491 (live-capture) Send a UA-string from the browserless fetcher as well
The change also introduces a somewhat convoluted WireMock test to intercept and verify that these headers are in fact sent.
2025-02-04 13:36:49 +01:00
Viktor Lofgren
530eb35949 (update-rss) Do not fail the feed fetcher control actor if it takes a long time to complete. 2025-02-03 11:35:32 +01:00
Viktor Lofgren
c2dd2175a2 (search) Add new query expansion rule contracting WORD NUM pairs into WORD-NUM and WORDNUM 2025-02-01 13:13:30 +01:00
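A rough sketch of the expansion rule this commit describes: an alphabetic word followed by a number also yields WORD-NUM and WORDNUM variants. The helper below is hypothetical; the real rule lives in QueryExpansion and operates on a span graph (see the diff further down).

import java.util.List;

class WordNumContraction {
    // e.g. ("trs", "80") -> [trs-80, trs80]
    static List<String> variants(String word, String num) {
        if (word.chars().allMatch(Character::isLetter) && num.chars().allMatch(Character::isDigit)) {
            return List.of(word + "-" + num, word + num);
        }
        return List.of();
    }

    public static void main(String[] args) {
        System.out.println(variants("glove", "80"));  // [glove-80, glove80]
    }
}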
Viktor Lofgren
b8581b0f56 (crawler) Safe sanitization of headers during warc->slop conversion
The warc->slop converter was rejecting some items because they had headers that were representable in the Warc code's MessageHeader map implementation, but illegal in the HttpHeaders' implementation.

Fixing this by manually filtering these out. Ostensibly the constructor has a filtering predicate, but it annoyingly runs too late and fails to prevent the problem.
2025-01-31 12:47:42 +01:00
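A minimal sketch, not the converter's actual code, of the pre-filtering approach: drop header names that don't match the RFC 7230 token grammar before handing the map to HttpHeaders.of(), rather than relying on the constructor's filtering predicate.

import java.net.http.HttpHeaders;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

class HeaderSanitizer {
    // RFC 7230 token characters permitted in header field names
    private static final Pattern TOKEN = Pattern.compile("[!#$%&'*+.^_`|~0-9A-Za-z-]+");

    static HttpHeaders sanitize(Map<String, List<String>> raw) {
        Map<String, List<String>> clean = raw.entrySet().stream()
                .filter(e -> TOKEN.matcher(e.getKey()).matches())
                .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));

        // The predicate is now a no-op; the real filtering happened above.
        return HttpHeaders.of(clean, (name, value) -> true);
    }
}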
Viktor Lofgren
2ea34767d8 (crawler) Use the response URL when resolving relative links
The crawler was incorrectly using the request URL as the base URL when resolving relative links.  This caused problems when encountering redirects.

For example, if we fetch /log, which redirects to /log/, and find links to foo/ and bar/, these would resolve to /foo and /bar rather than /log/foo and /log/bar.
2025-01-31 12:40:13 +01:00
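An illustration of the base-URL problem described above, using java.net.URI (which follows the standard RFC 3986 resolution rules) and a made-up host.

import java.net.URI;

class BaseUrlDemo {
    public static void main(String[] args) {
        URI requestUrl  = URI.create("https://example.com/log");   // URL we requested
        URI responseUrl = URI.create("https://example.com/log/");  // URL after the redirect

        System.out.println(requestUrl.resolve("foo/"));   // https://example.com/foo/      (wrong base)
        System.out.println(responseUrl.resolve("foo/"));  // https://example.com/log/foo/  (correct base)
    }
}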
Viktor Lofgren
e9af838231 (actor) Fix migration actor final steps 2025-01-30 11:48:21 +01:00
Viktor Lofgren
ae0cad47c4 (actor) Utility method for getting a json prototype for actor states
If we can hook this into the control gui somehow, it'll make for a nice QOL upgrade when manually interacting with the actors.
2025-01-29 15:20:25 +01:00
Viktor Lofgren
5fbc8ef998 (misc) Tidying 2025-01-29 15:17:04 +01:00
Viktor Lofgren
32c6dd9e6a (actor) Delete old data in the migration actor 2025-01-29 14:51:46 +01:00
Viktor Lofgren
6ece6a6cfb (actor) Improve resilience for the migration actor 2025-01-29 14:43:09 +01:00
Viktor Lofgren
39cd1c18f8 Automatically run npm install tailwindcss@3 via setup.sh, as the new default version of the package is incompatible with the project 2025-01-29 12:21:08 +01:00
Viktor
eb65daaa88 Merge pull request #151 from Lionstiger/master
fix small grammar error in footerLegal.jte
2025-01-28 21:49:50 +01:00
Viktor
0bebdb6e33 Merge branch 'master' into master 2025-01-28 21:49:36 +01:00
Viktor Lofgren
1e50e392c6 (actor) Improve logging and error handling for data migration actor 2025-01-28 15:34:36 +01:00
Viktor Lofgren
fb673de370 (crawler) Change the header 'User-agent' to 'User-Agent' 2025-01-28 15:34:16 +01:00
Viktor Lofgren
eee73ab16c (crawler) Be more lenient when performing a domain probe 2025-01-28 15:24:30 +01:00
Viktor Lofgren
5354e034bf (search) Minor grammar fix 2025-01-27 18:36:31 +01:00
Magnus Wulf
72384ad6ca fix small grammar error 2025-01-27 15:04:57 +01:00
Viktor Lofgren
a2b076f9be (converter) Add progress tracking for big domains in converter 2025-01-26 18:03:59 +01:00
Viktor Lofgren
c8b0a32c0f (crawler) Reduce long retention of CrawlDataReference objects and their associated SerializableCrawlDataStreams 2025-01-26 15:40:17 +01:00
Viktor Lofgren
f0d74aa3bb (converter) Fix close() ordering to prevent converter crash 2025-01-26 14:47:36 +01:00
Viktor Lofgren
74a1f100f4 (converter) Refactor to remove CrawledDomainReader and move its functionality into SerializableCrawlDataStream 2025-01-26 14:46:50 +01:00
Viktor Lofgren
eb049658e4 (converter) Add truncation at the parser step to prevent the converter from spending too much time on excessively large documents
Refactor to do this without introducing additional copies
2025-01-26 14:28:53 +01:00
Viktor Lofgren
db138b2a6f (converter) Add truncation at the parser step to prevent the converter from spending too much time on excessively large documents 2025-01-26 14:25:57 +01:00
Viktor Lofgren
1673fc284c (converter) Reduce lock contention in converter by separating the processing of full and simple-track domains 2025-01-26 13:21:46 +01:00
Viktor Lofgren
503ea57d5b (converter) Reduce lock contention in converter by separating the processing of full and simple-track domains 2025-01-26 13:18:14 +01:00
Viktor Lofgren
18ca926c7f (converter) Truncate excessively long strings in SentenceExtractor; malformed data was effectively DoS-ing the converter 2025-01-26 12:52:54 +01:00
Viktor Lofgren
db99242db2 (converter) Adding some logging around the simple processing track to investigate an issue with the converter stalling 2025-01-26 12:02:00 +01:00
Viktor Lofgren
2b9d2985ba (doc) Update readme with up-to-date install instructions. 2025-01-24 18:51:41 +01:00
Viktor Lofgren
eeb6ecd711 (search) Make it clearer that the affiliate marker applies to the result, and not the search engine's relation to the result. 2025-01-24 18:50:00 +01:00
Viktor Lofgren
1f58aeadbf (build) Upgrade JIB 2025-01-24 18:49:28 +01:00
Viktor Lofgren
3d68be64da (crawler) Add default CT when it's missing for icons 2025-01-22 13:55:47 +01:00
73 changed files with 799 additions and 396 deletions

View File

@@ -5,7 +5,7 @@ plugins {
// This is a workaround for a bug in the Jib plugin that causes it to stall randomly
// https://github.com/GoogleContainerTools/jib/issues/3347
id 'com.google.cloud.tools.jib' version '3.4.3' apply(false)
id 'com.google.cloud.tools.jib' version '3.4.4' apply(false)
}
group 'marginalia'

View File

@@ -24,58 +24,4 @@ public class LanguageModels {
this.fasttextLanguageModel = fasttextLanguageModel;
this.segments = segments;
}
public static LanguageModelsBuilder builder() {
return new LanguageModelsBuilder();
}
public static class LanguageModelsBuilder {
private Path termFrequencies;
private Path openNLPSentenceDetectionData;
private Path posRules;
private Path posDict;
private Path fasttextLanguageModel;
private Path segments;
LanguageModelsBuilder() {
}
public LanguageModelsBuilder termFrequencies(Path termFrequencies) {
this.termFrequencies = termFrequencies;
return this;
}
public LanguageModelsBuilder openNLPSentenceDetectionData(Path openNLPSentenceDetectionData) {
this.openNLPSentenceDetectionData = openNLPSentenceDetectionData;
return this;
}
public LanguageModelsBuilder posRules(Path posRules) {
this.posRules = posRules;
return this;
}
public LanguageModelsBuilder posDict(Path posDict) {
this.posDict = posDict;
return this;
}
public LanguageModelsBuilder fasttextLanguageModel(Path fasttextLanguageModel) {
this.fasttextLanguageModel = fasttextLanguageModel;
return this;
}
public LanguageModelsBuilder segments(Path segments) {
this.segments = segments;
return this;
}
public LanguageModels build() {
return new LanguageModels(this.termFrequencies, this.openNLPSentenceDetectionData, this.posRules, this.posDict, this.fasttextLanguageModel, this.segments);
}
public String toString() {
return "LanguageModels.LanguageModelsBuilder(termFrequencies=" + this.termFrequencies + ", openNLPSentenceDetectionData=" + this.openNLPSentenceDetectionData + ", posRules=" + this.posRules + ", posDict=" + this.posDict + ", fasttextLanguageModel=" + this.fasttextLanguageModel + ", segments=" + this.segments + ")";
}
}
}

View File

@@ -10,7 +10,9 @@ import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.LocalDateTime;
import java.util.*;
import java.util.HashSet;
import java.util.Optional;
import java.util.Set;
import java.util.function.Function;
/** WorkLog is a journal of work done by a process,
@@ -61,6 +63,12 @@ public class WorkLog implements AutoCloseable, Closeable {
return new WorkLoadIterable<>(logFile, mapper);
}
public static int countEntries(Path crawlerLog) throws IOException{
try (var linesStream = Files.lines(crawlerLog)) {
return (int) linesStream.filter(WorkLogEntry::isJobId).count();
}
}
// Use synchro over concurrent set to avoid competing writes
// - correct is better than fast here, it's sketchy enough to use
// a PrintWriter

View File

@@ -6,6 +6,7 @@ import nu.marginalia.service.ServiceId;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.InetAddress;
import java.net.NetworkInterface;
import java.util.Enumeration;
@@ -115,11 +116,12 @@ public class ServiceConfigurationModule extends AbstractModule {
}
}
public static String getLocalNetworkIP() throws Exception {
public static String getLocalNetworkIP() throws IOException {
Enumeration<NetworkInterface> nets = NetworkInterface.getNetworkInterfaces();
while (nets.hasMoreElements()) {
NetworkInterface netif = nets.nextElement();
logger.info("Considering network interface {}: Up? {}, Loopback? {}", netif.getDisplayName(), netif.isUp(), netif.isLoopback());
if (!netif.isUp() || netif.isLoopback()) {
continue;
}
@@ -127,6 +129,7 @@ public class ServiceConfigurationModule extends AbstractModule {
Enumeration<InetAddress> inetAddresses = netif.getInetAddresses();
while (inetAddresses.hasMoreElements()) {
InetAddress addr = inetAddresses.nextElement();
logger.info("Considering address {}: SiteLocal? {}, Loopback? {}", addr.getHostAddress(), addr.isSiteLocalAddress(), addr.isLoopbackAddress());
if (addr.isSiteLocalAddress() && !addr.isLoopbackAddress()) {
return addr.getHostAddress();
}

View File

@@ -15,6 +15,7 @@ import org.slf4j.LoggerFactory;
import org.slf4j.Marker;
import org.slf4j.MarkerFactory;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
@@ -106,9 +107,12 @@ public class JoobyService {
config.externalAddress());
// FIXME: This won't work outside of docker, may need to submit a PR to jooby to allow classpaths here
if (Files.exists(Path.of("/app/resources/jte")) || Files.exists(Path.of("/app/classes/jte-precompiled"))) {
jooby.install(new JteModule(Path.of("/app/resources/jte"), Path.of("/app/classes/jte-precompiled")));
}
if (Files.exists(Path.of("/app/resources/static"))) {
jooby.assets("/*", Paths.get("/app/resources/static"));
}
var options = new ServerOptions();
options.setHost(config.bindAddress());
options.setPort(restEndpoint.port());

View File

@@ -6,17 +6,22 @@ import nu.marginalia.service.module.ServiceConfiguration;
import org.eclipse.jetty.server.Server;
import org.eclipse.jetty.servlet.ServletContextHandler;
import org.eclipse.jetty.servlet.ServletHolder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.InetSocketAddress;
public class MetricsServer {
private static final Logger logger = LoggerFactory.getLogger(MetricsServer.class);
@Inject
public MetricsServer(ServiceConfiguration configuration) throws Exception {
public MetricsServer(ServiceConfiguration configuration) {
// If less than zero, we forego setting up a metrics server
if (configuration.metricsPort() < 0)
return;
try {
Server server = new Server(new InetSocketAddress(configuration.bindAddress(), configuration.metricsPort()));
ServletContextHandler context = new ServletContextHandler();
@@ -25,6 +30,12 @@ public class MetricsServer {
context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");
logger.info("MetricsServer listening on {}:{}", configuration.bindAddress(), configuration.metricsPort());
server.start();
}
catch (Exception|NoSuchMethodError ex) {
logger.error("Failed to set up metrics server", ex);
}
}
}

View File

@@ -14,6 +14,8 @@ import nu.marginalia.mq.persistence.MqPersistence;
import nu.marginalia.nodecfg.NodeConfigurationService;
import nu.marginalia.nodecfg.model.NodeProfile;
import nu.marginalia.service.module.ServiceConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.time.Duration;
import java.time.LocalDateTime;
@@ -29,6 +31,7 @@ public class UpdateRssActor extends RecordActorPrototype {
private final NodeConfigurationService nodeConfigurationService;
private final MqPersistence persistence;
private static final Logger logger = LoggerFactory.getLogger(UpdateRssActor.class);
@Inject
public UpdateRssActor(Gson gson,
@@ -101,8 +104,8 @@ public class UpdateRssActor extends RecordActorPrototype {
case UpdateRefresh(int count, long msgId) -> {
MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
if (msg == null) {
// Retry the update
yield new Error("Failed to update feeds: message not found");
logger.warn("UpdateRefresh is taking a very long time");
yield new UpdateRefresh(count, msgId);
} else if (msg.state() != MqMessageState.OK) {
// Retry the update
yield new Error("Failed to update feeds: " + msg.state());
@@ -119,8 +122,8 @@ public class UpdateRssActor extends RecordActorPrototype {
case UpdateClean(long msgId) -> {
MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
if (msg == null) {
// Retry the update
yield new Error("Failed to update feeds: message not found");
logger.warn("UpdateClean is taking a very long time");
yield new UpdateClean(msgId);
} else if (msg.state() != MqMessageState.OK) {
// Retry the update
yield new Error("Failed to update feeds: " + msg.state());

View File

@@ -8,6 +8,7 @@ import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.io.CrawlerOutputFile;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.process.log.WorkLogEntry;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.slop.SlopCrawlDataRecord;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;
@@ -18,6 +19,7 @@ import org.slf4j.LoggerFactory;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;
@@ -26,14 +28,15 @@ import java.util.function.Function;
public class MigrateCrawlDataActor extends RecordActorPrototype {
private final FileStorageService fileStorageService;
private final ServiceHeartbeat serviceHeartbeat;
private static final Logger logger = LoggerFactory.getLogger(MigrateCrawlDataActor.class);
@Inject
public MigrateCrawlDataActor(Gson gson, FileStorageService fileStorageService) {
public MigrateCrawlDataActor(Gson gson, FileStorageService fileStorageService, ServiceHeartbeat serviceHeartbeat) {
super(gson);
this.fileStorageService = fileStorageService;
this.serviceHeartbeat = serviceHeartbeat;
}
public record Run(long fileStorageId) implements ActorStep {}
@@ -49,33 +52,50 @@ public class MigrateCrawlDataActor extends RecordActorPrototype {
Path crawlerLog = root.resolve("crawler.log");
Path newCrawlerLog = Files.createTempFile(root, "crawler", ".migrate.log");
try (WorkLog workLog = new WorkLog(newCrawlerLog)) {
int totalEntries = WorkLog.countEntries(crawlerLog);
try (WorkLog workLog = new WorkLog(newCrawlerLog);
var heartbeat = serviceHeartbeat.createServiceAdHocTaskHeartbeat("Migrating")
) {
int entryIdx = 0;
for (Map.Entry<WorkLogEntry, Path> item : WorkLog.iterableMap(crawlerLog, new CrawlDataLocator(root))) {
var entry = item.getKey();
var path = item.getValue();
final WorkLogEntry entry = item.getKey();
final Path inputPath = item.getValue();
logger.info("Converting {}", entry.id());
Path outputPath = inputPath;
heartbeat.progress("Migrating" + inputPath.getFileName(), entryIdx++, totalEntries);
if (path.toFile().getName().endsWith(".parquet")) {
if (inputPath.toString().endsWith(".parquet")) {
String domain = entry.id();
String id = Integer.toHexString(domain.hashCode());
Path outputFile = CrawlerOutputFile.createSlopPath(root, id, domain);
outputPath = CrawlerOutputFile.createSlopPath(root, id, domain);
SlopCrawlDataRecord.convertFromParquet(path, outputFile);
if (Files.exists(inputPath)) {
try {
SlopCrawlDataRecord.convertFromParquet(inputPath, outputPath);
Files.deleteIfExists(inputPath);
} catch (Exception ex) {
outputPath = inputPath; // don't update the work log on error
logger.error("Failed to convert " + inputPath, ex);
}
}
else if (!Files.exists(inputPath) && !Files.exists(outputPath)) {
// if the input file is missing, and the output file is missing, we just write the log
// record identical to the old one
outputPath = inputPath;
}
}
workLog.setJobToFinished(entry.id(), outputFile.toString(), entry.cnt());
}
else {
workLog.setJobToFinished(entry.id(), path.toString(), entry.cnt());
}
// Write a log entry for the (possibly) converted file
workLog.setJobToFinished(entry.id(), outputPath.toString(), entry.cnt());
}
}
Path oldCrawlerLog = Files.createTempFile(root, "crawler-", ".migrate.old.log");
Files.move(crawlerLog, oldCrawlerLog);
Files.move(crawlerLog, oldCrawlerLog, StandardCopyOption.REPLACE_EXISTING);
Files.move(newCrawlerLog, crawlerLog);
yield new End();

View File

@@ -34,6 +34,7 @@ dependencies {
implementation libs.bundles.slf4j
implementation libs.commons.lang3
implementation libs.commons.io
implementation libs.wiremock
implementation libs.prometheus
implementation libs.guava

View File

@@ -1,6 +1,7 @@
package nu.marginalia.livecapture;
import com.google.gson.Gson;
import nu.marginalia.WmsaHome;
import nu.marginalia.model.gson.GsonFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -12,6 +13,7 @@ import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.time.Duration;
import java.util.Map;
import java.util.Optional;
/** Client for local browserless.io API */
public class BrowserlessClient implements AutoCloseable {
@@ -27,13 +29,16 @@ public class BrowserlessClient implements AutoCloseable {
private final URI browserlessURI;
private final Gson gson = GsonFactory.get();
private final String userAgent = WmsaHome.getUserAgent().uaString();
public BrowserlessClient(URI browserlessURI) {
this.browserlessURI = browserlessURI;
}
public String content(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
public Optional<String> content(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
Map<String, Object> requestData = Map.of(
"url", url,
"userAgent", userAgent,
"gotoOptions", gotoOptions
);
@@ -49,10 +54,10 @@ public class BrowserlessClient implements AutoCloseable {
if (rsp.statusCode() >= 300) {
logger.info("Failed to fetch content for {}, status {}", url, rsp.statusCode());
return null;
return Optional.empty();
}
return rsp.body();
return Optional.of(rsp.body());
}
public byte[] screenshot(String url, GotoOptions gotoOptions, ScreenshotOptions screenshotOptions)
@@ -60,6 +65,7 @@ public class BrowserlessClient implements AutoCloseable {
Map<String, Object> requestData = Map.of(
"url", url,
"userAgent", userAgent,
"options", screenshotOptions,
"gotoOptions", gotoOptions
);
@@ -84,7 +90,7 @@ public class BrowserlessClient implements AutoCloseable {
}
@Override
public void close() throws Exception {
public void close() {
httpClient.shutdownNow();
}

View File

@@ -1,5 +1,9 @@
package nu.marginalia.livecapture;
import com.github.tomakehurst.wiremock.WireMockServer;
import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
import nu.marginalia.WmsaHome;
import nu.marginalia.service.module.ServiceConfigurationModule;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag;
@@ -8,34 +12,86 @@ import org.testcontainers.containers.GenericContainer;
import org.testcontainers.junit.jupiter.Testcontainers;
import org.testcontainers.utility.DockerImageName;
import java.io.IOException;
import java.net.URI;
import java.util.Map;
import static com.github.tomakehurst.wiremock.client.WireMock.*;
@Testcontainers
@Tag("slow")
public class BrowserlessClientTest {
static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome"))
.withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
.withNetworkMode("bridge")
.withExposedPorts(3000);
static WireMockServer wireMockServer =
new WireMockServer(WireMockConfiguration.wireMockConfig()
.port(18089));
static String localIp;
static URI browserlessURI;
@BeforeAll
public static void setup() {
public static void setup() throws IOException {
container.start();
browserlessURI = URI.create(String.format("http://%s:%d/",
container.getHost(),
container.getMappedPort(3000))
);
wireMockServer.start();
wireMockServer.stubFor(get("/").willReturn(aResponse().withStatus(200).withBody("Ok")));
localIp = ServiceConfigurationModule.getLocalNetworkIP();
}
@Tag("flaky")
@Test
public void testInspectContentUA__Flaky() throws Exception {
try (var client = new BrowserlessClient(browserlessURI)) {
client.content("http://" + localIp + ":18089/",
BrowserlessClient.GotoOptions.defaultValues()
);
}
wireMockServer.verify(getRequestedFor(urlEqualTo("/")).withHeader("User-Agent", equalTo(WmsaHome.getUserAgent().uaString())));
}
@Tag("flaky")
@Test
public void testInspectScreenshotUA__Flaky() throws Exception {
try (var client = new BrowserlessClient(browserlessURI)) {
client.screenshot("http://" + localIp + ":18089/",
BrowserlessClient.GotoOptions.defaultValues(),
BrowserlessClient.ScreenshotOptions.defaultValues()
);
}
wireMockServer.verify(getRequestedFor(urlEqualTo("/")).withHeader("User-Agent", equalTo(WmsaHome.getUserAgent().uaString())));
}
@Test
public void testContent() throws Exception {
try (var client = new BrowserlessClient(URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000)))) {
var content = client.content("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues());
Assertions.assertNotNull(content, "Content should not be null");
try (var client = new BrowserlessClient(browserlessURI)) {
var content = client.content("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues()).orElseThrow();
Assertions.assertFalse(content.isBlank(), "Content should not be empty");
}
}
@Test
public void testScreenshot() throws Exception {
try (var client = new BrowserlessClient(URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000)))) {
var screenshot = client.screenshot("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues(), BrowserlessClient.ScreenshotOptions.defaultValues());
try (var client = new BrowserlessClient(browserlessURI)) {
var screenshot = client.screenshot("https://www.marginalia.nu/",
BrowserlessClient.GotoOptions.defaultValues(),
BrowserlessClient.ScreenshotOptions.defaultValues());
Assertions.assertNotNull(screenshot, "Screenshot should not be null");
}
}

View File

@@ -134,6 +134,10 @@ public class QueryExpansion {
if (scoreCombo > scoreA + scoreB || scoreCombo > 1000) {
graph.addVariantForSpan(prev, qw, joinedWord);
}
else if (StringUtils.isAlpha(prev.word()) && StringUtils.isNumeric(qw.word())) { // join e.g. trs 80 to trs80 and trs-80
graph.addVariantForSpan(prev, qw, prev.word() + qw.word());
graph.addVariantForSpan(prev, qw, prev.word() + "-" + qw.word());
}
}
prev = qw;

View File

@@ -213,6 +213,18 @@ public class QueryFactoryTest {
System.out.println(subquery);
}
@Test
public void testContractionWordNum() {
var subquery = parseAndGetSpecs("glove 80");
Assertions.assertTrue(subquery.query.compiledQuery.contains(" glove "));
Assertions.assertTrue(subquery.query.compiledQuery.contains(" 80 "));
Assertions.assertTrue(subquery.query.compiledQuery.contains(" glove-80 "));
Assertions.assertTrue(subquery.query.compiledQuery.contains(" glove80 "));
}
@Test
public void testCplusPlus() {
var subquery = parseAndGetSpecs("std::vector::push_back vector");

View File

@@ -155,8 +155,15 @@ public class SentenceExtractor {
public List<DocumentSentence> extractSentencesFromString(String text, EnumSet<HtmlTag> htmlTags) {
String[] sentences;
// Normalize spaces
// Safety net against malformed data DOS attacks,
// found 5+ MB <p>-tags in the wild that just break
// the sentence extractor causing it to stall forever.
if (text.length() > 50_000) {
// 50k chars can hold a small novel, let alone single html tags
text = text.substring(0, 50_000);
}
// Normalize spaces
text = normalizeSpaces(text);
// Split into sentences

View File

@@ -5,9 +5,7 @@ import nu.marginalia.actor.state.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.*;
public abstract class RecordActorPrototype implements ActorPrototype {
@@ -118,7 +116,7 @@ public abstract class RecordActorPrototype implements ActorPrototype {
}
private String functionName(Class<? extends ActorStep> functionClass) {
return functionClass.getSimpleName().toUpperCase();
return ActorStep.functionName(functionClass);
}
private ActorStep constructState(String message) throws ReflectiveOperationException {
@@ -145,4 +143,43 @@ public abstract class RecordActorPrototype implements ActorPrototype {
}
}
/** Get a list of JSON prototypes for each actor step declared by this actor */
@SuppressWarnings("unchecked")
public Map<String, String> getMessagePrototypes() {
Map<String, String> messagePrototypes = new HashMap<>();
for (var clazz : getClass().getDeclaredClasses()) {
if (!clazz.isRecord() || !ActorStep.class.isAssignableFrom(clazz))
continue;
StringJoiner sj = new StringJoiner(",\n\t", "{\n\t", "\n}");
renderToJsonPrototype(sj, (Class<? extends Record>) clazz);
messagePrototypes.put(ActorStep.functionName((Class<? extends ActorStep>) clazz), sj.toString());
}
return messagePrototypes;
}
@SuppressWarnings("unchecked")
private void renderToJsonPrototype(StringJoiner sj, Class<? extends Record> recordType) {
for (var field : recordType.getDeclaredFields()) {
String typeName = field.getType().getSimpleName();
if ("List".equals(typeName)) {
sj.add(String.format("\"%s\": [ ]", field.getName()));
}
else if (field.getType().isRecord()) {
var innerSj = new StringJoiner(",", "{", "}");
renderToJsonPrototype(innerSj, (Class<? extends Record>) field.getType());
sj.add(String.format("\"%s\": %s", field.getName(), sj));
}
else {
sj.add(String.format("\"%s\": \"%s\"", field.getName(), typeName));
}
}
}
}

View File

@@ -1,3 +1,7 @@
package nu.marginalia.actor.state;
public interface ActorStep {}
public interface ActorStep {
static String functionName(Class<? extends ActorStep> type) {
return type.getSimpleName().toUpperCase();
}
}

View File

@@ -12,6 +12,7 @@ import nu.marginalia.converting.sideload.SideloadSourceFactory;
import nu.marginalia.converting.writer.ConverterBatchWritableIf;
import nu.marginalia.converting.writer.ConverterBatchWriter;
import nu.marginalia.converting.writer.ConverterWriter;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.mq.MessageQueueFactory;
import nu.marginalia.mqapi.converting.ConvertRequest;
import nu.marginalia.process.ProcessConfiguration;
@@ -34,6 +35,7 @@ import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Optional;
@@ -49,6 +51,7 @@ public class ConverterMain extends ProcessMainClass {
private final ProcessHeartbeat heartbeat;
private final FileStorageService fileStorageService;
private final SideloadSourceFactory sideloadSourceFactory;
private static final int SIDELOAD_THRESHOLD = Integer.getInteger("converter.sideloadThreshold", 10_000);
public static void main(String... args) throws Exception {
@@ -199,12 +202,26 @@ public class ConverterMain extends ProcessMainClass {
processedDomains.set(batchingWorkLog.size());
heartbeat.setProgress(processedDomains.get() / (double) totalDomains);
for (var domain : WorkLog.iterableMap(crawlDir.getLogFile(),
logger.info("Processing small items");
// We separate the large and small domains to reduce the number of critical sections,
// as the large domains have a separate processing track that doesn't store everything
// in memory
final List<Path> bigTasks = new ArrayList<>();
// First process the small items
for (var dataPath : WorkLog.iterableMap(crawlDir.getLogFile(),
new CrawlDataLocator(crawlDir.getDir(), batchingWorkLog)))
{
if (SerializableCrawlDataStream.getSizeHint(dataPath) >= SIDELOAD_THRESHOLD) {
bigTasks.add(dataPath);
continue;
}
pool.submit(() -> {
try {
ConverterBatchWritableIf writable = processor.createWritable(domain);
try (var dataStream = SerializableCrawlDataStream.openDataStream(dataPath)) {
ConverterBatchWritableIf writable = processor.fullProcessing(dataStream) ;
converterWriter.accept(writable);
}
catch (Exception ex) {
@@ -223,6 +240,35 @@ public class ConverterMain extends ProcessMainClass {
do {
System.out.println("Waiting for pool to terminate... " + pool.getActiveCount() + " remaining");
} while (!pool.awaitTermination(60, TimeUnit.SECONDS));
logger.info("Processing large items");
try (var hb = heartbeat.createAdHocTaskHeartbeat("Large Domains")) {
int bigTaskIdx = 0;
// Next the big items domain-by-domain
for (var dataPath : bigTasks) {
hb.progress(dataPath.toFile().getName(), bigTaskIdx++, bigTasks.size());
try {
// SerializableCrawlDataStream is autocloseable, we can't try-with-resources because then it will be
// closed before it's consumed by the converterWriter. Instead, the converterWriter guarantees it
// will close it after it's consumed.
var stream = SerializableCrawlDataStream.openDataStream(dataPath);
ConverterBatchWritableIf writable = processor.simpleProcessing(stream, SerializableCrawlDataStream.getSizeHint(dataPath));
converterWriter.accept(writable);
}
catch (Exception ex) {
logger.info("Error in processing", ex);
}
finally {
heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains);
}
}
}
logger.info("Processing complete");
}
}

View File

@@ -14,7 +14,6 @@ import nu.marginalia.converting.writer.ConverterBatchWritableIf;
import nu.marginalia.converting.writer.ConverterBatchWriter;
import nu.marginalia.geoip.GeoIpDictionary;
import nu.marginalia.geoip.sources.AsnTable;
import nu.marginalia.io.CrawledDomainReader;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.crawl.DomainIndexingState;
@@ -28,13 +27,11 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.*;
import java.util.regex.Pattern;
public class DomainProcessor {
private static final int SIDELOAD_THRESHOLD = Integer.getInteger("converter.sideloadThreshold", 10_000);
private final DocumentProcessor documentProcessor;
private final SiteWords siteWords;
private final AnchorTagsSource anchorTagsSource;
@@ -56,21 +53,6 @@ public class DomainProcessor {
geoIpDictionary.waitReady();
}
public ConverterBatchWritableIf createWritable(Path path) throws IOException {
var dataStream = CrawledDomainReader.createDataStream(path);
final int sizeHint = dataStream.sizeHint();
if (sizeHint > SIDELOAD_THRESHOLD) {
// If the file is too big, we run a processing mode that doesn't
// require loading the entire dataset into RAM
return simpleProcessing(dataStream, sizeHint);
}
return fullProcessing(dataStream);
}
public SimpleProcessing simpleProcessing(SerializableCrawlDataStream dataStream, int sizeHint, Collection<String> extraKeywords) {
try {
return new SimpleProcessing(dataStream, sizeHint, extraKeywords);
@@ -159,6 +141,7 @@ public class DomainProcessor {
private final Set<String> processedUrls = new HashSet<>();
private final DomainLinks externalDomainLinks;
private final LshDocumentDeduplicator deduplicator = new LshDocumentDeduplicator();
private static final ProcessingIterator.Factory iteratorFactory = ProcessingIterator.factory(8,
Integer.getInteger("java.util.concurrent.ForkJoinPool.common.parallelism", Runtime.getRuntime().availableProcessors())
);
@@ -194,6 +177,7 @@ public class DomainProcessor {
@Override
public Iterator<ProcessedDocument> getDocumentsStream() {
return iteratorFactory.create((taskConsumer) -> {
while (dataStream.hasNext())
{
if (!(dataStream.next() instanceof CrawledDocument doc))

View File

@@ -116,7 +116,7 @@ public class AdblockSimulator {
// Refrain from cleaning up this code, it's very hot code and needs to be fast.
// This version is about 100x faster than the a "clean" first stab implementation.
// This version is about 100x faster than a "clean" first stab implementation.
class RuleVisitor implements NodeFilter {
public boolean sawAds;

View File

@@ -23,7 +23,7 @@ public class DocumentGeneratorExtractor {
var tags = doc.select("meta[name=generator]");
if (tags.size() == 0) {
if (tags.isEmpty()) {
// Some sites have a comment in the head instead of a meta tag
return fingerprintServerTech(doc, responseHeaders);
}

View File

@@ -127,7 +127,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
}
fullHtml.append("</div></body></html>");
var doc = sideloaderProcessing
return sideloaderProcessing
.processDocument(fullUrl,
fullHtml.toString(),
List.of("encyclopedia", "wiki"),
@@ -137,8 +137,6 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
anchorTextKeywords.getAnchorTextKeywords(domainLinks, new EdgeUrl(fullUrl)),
LocalDate.now().getYear(),
10_000_000);
return doc;
}
private String normalizeUtf8(String url) {

View File

@@ -39,6 +39,9 @@ public class ConverterWriter implements AutoCloseable {
workerThread.start();
}
/** Queue and eventually write the domain into the converter journal
* The domain object will be closed after it's processed.
* */
public void accept(@Nullable ConverterBatchWritableIf domain) {
if (null == domain)
return;
@@ -72,15 +75,15 @@ public class ConverterWriter implements AutoCloseable {
if (workLog.isItemCommitted(id) || workLog.isItemInCurrentBatch(id)) {
logger.warn("Skipping already logged item {}", id);
}
else {
currentWriter.write(data);
workLog.logItem(id);
data.close();
continue;
}
currentWriter.write(data);
workLog.logItem(id);
switcher.tick();
data.close();
}
}
catch (Exception ex) {

View File

@@ -11,7 +11,6 @@ import nu.marginalia.slop.column.primitive.IntColumn;
import nu.marginalia.slop.column.primitive.LongColumn;
import nu.marginalia.slop.column.string.EnumColumn;
import nu.marginalia.slop.column.string.StringColumn;
import nu.marginalia.slop.column.string.TxtStringColumn;
import nu.marginalia.slop.desc.StorageType;
import org.jetbrains.annotations.Nullable;
@@ -182,8 +181,8 @@ public record SlopDocumentRecord(
}
// Basic information
private static final TxtStringColumn domainsColumn = new TxtStringColumn("domain", StandardCharsets.UTF_8, StorageType.GZIP);
private static final TxtStringColumn urlsColumn = new TxtStringColumn("url", StandardCharsets.UTF_8, StorageType.GZIP);
private static final StringColumn domainsColumn = new StringColumn("domain", StandardCharsets.UTF_8, StorageType.GZIP);
private static final StringColumn urlsColumn = new StringColumn("url", StandardCharsets.UTF_8, StorageType.GZIP);
private static final VarintColumn ordinalsColumn = new VarintColumn("ordinal", StorageType.PLAIN);
private static final EnumColumn statesColumn = new EnumColumn("state", StandardCharsets.US_ASCII, StorageType.PLAIN);
private static final StringColumn stateReasonsColumn = new StringColumn("stateReason", StandardCharsets.US_ASCII, StorageType.GZIP);
@@ -211,7 +210,7 @@ public record SlopDocumentRecord(
private static final VarintCodedSequenceArrayColumn spansColumn = new VarintCodedSequenceArrayColumn("spans", StorageType.ZSTD);
public static class KeywordsProjectionReader extends SlopTable {
private final TxtStringColumn.Reader domainsReader;
private final StringColumn.Reader domainsReader;
private final VarintColumn.Reader ordinalsReader;
private final IntColumn.Reader htmlFeaturesReader;
private final LongColumn.Reader domainMetadataReader;
@@ -275,8 +274,8 @@ public record SlopDocumentRecord(
}
public static class MetadataReader extends SlopTable {
private final TxtStringColumn.Reader domainsReader;
private final TxtStringColumn.Reader urlsReader;
private final StringColumn.Reader domainsReader;
private final StringColumn.Reader urlsReader;
private final VarintColumn.Reader ordinalsReader;
private final StringColumn.Reader titlesReader;
private final StringColumn.Reader descriptionsReader;
@@ -332,8 +331,8 @@ public record SlopDocumentRecord(
}
public static class Writer extends SlopTable {
private final TxtStringColumn.Writer domainsWriter;
private final TxtStringColumn.Writer urlsWriter;
private final StringColumn.Writer domainsWriter;
private final StringColumn.Writer urlsWriter;
private final VarintColumn.Writer ordinalsWriter;
private final EnumColumn.Writer statesWriter;
private final StringColumn.Writer stateReasonsWriter;

View File

@@ -26,7 +26,7 @@ public class DocumentBodyToString {
return new String(data, charset);
}
public static Document getParsedData(ContentType type, byte[] data, String url) throws IOException {
public static Document getParsedData(ContentType type, byte[] data, int maxLength, String url) throws IOException {
final Charset charset;
if (type.charset() == null || type.charset().isBlank()) {
@@ -35,7 +35,7 @@ public class DocumentBodyToString {
charset = charsetMap.computeIfAbsent(type, DocumentBodyToString::computeCharset);
}
ByteArrayInputStream bais = new ByteArrayInputStream(data);
ByteArrayInputStream bais = new ByteArrayInputStream(data, 0, Math.min(data.length, maxLength));
return Jsoup.parse(bais, charset.name(), url);
}

View File

@@ -19,7 +19,6 @@ import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.warc.WarcArchiverFactory;
import nu.marginalia.crawl.warc.WarcArchiverIf;
import nu.marginalia.db.DomainBlacklist;
import nu.marginalia.io.CrawledDomainReader;
import nu.marginalia.io.CrawlerOutputFile;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.mq.MessageQueueFactory;
@@ -42,10 +41,7 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.security.Security;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
@@ -249,22 +245,47 @@ public class CrawlerMain extends ProcessMainClass {
// (this happens when the process is restarted after a crash or a shutdown)
tasksDone.set(workLog.countFinishedJobs());
// List of deferred tasks used to ensure beneficial scheduling of domains with regard to DomainLocks,
// merely shuffling the domains tends to lead to a lot of threads being blocked waiting for a semaphore,
// this will more aggressively attempt to schedule the jobs to avoid blocking
List<CrawlTask> deferredTasks = new LinkedList<>();
// Create crawl tasks and submit them to the pool for execution
for (CrawlSpecRecord crawlSpec : crawlSpecRecords) {
if (workLog.isJobFinished(crawlSpec.domain()))
continue;
var task = new CrawlTask(
// Add to the end of the deferral list
deferredTasks.addLast(new CrawlTask(
crawlSpec,
anchorTagsSource,
outputDir,
warcArchiver,
domainStateDb,
workLog);
workLog));
if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) == null) {
pool.submitQuietly(task);
// Start every task we currently can from the deferral list
deferredTasks.removeIf(task -> {
if (task.canRun()) {
if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) != null) {
return true; // task has already run, duplicate in crawl specs
}
// This blocks the caller when the pool is full
pool.submitQuietly(task);
return true;
}
return false;
});
}
// Schedule any lingering tasks for immediate execution
for (var task : deferredTasks) {
if (pendingCrawlTasks.putIfAbsent(task.domain, task) != null)
continue;
pool.submitQuietly(task);
}
logger.info("Shutting down the pool, waiting for tasks to complete...");
@@ -347,6 +368,12 @@ public class CrawlerMain extends ProcessMainClass {
this.id = Integer.toHexString(domain.hashCode());
}
/** Best effort indicator whether we could start this now without getting stuck in
* DomainLocks purgatory */
public boolean canRun() {
return domainLocks.canLock(new EdgeDomain(domain));
}
@Override
public void run() throws Exception {
@@ -417,13 +444,13 @@ public class CrawlerMain extends ProcessMainClass {
try {
Path slopPath = CrawlerOutputFile.getSlopPath(outputDir, id, domain);
if (Files.exists(slopPath)) {
return new CrawlDataReference(CrawledDomainReader.createDataStream(slopPath));
return new CrawlDataReference(slopPath);
}
Path parquetPath = CrawlerOutputFile.getParquetPath(outputDir, id, domain);
if (Files.exists(parquetPath)) {
slopPath = migrateParquetData(parquetPath, domain, outputDir);
return new CrawlDataReference(CrawledDomainReader.createDataStream(slopPath));
return new CrawlDataReference(slopPath);
}
} catch (IOException e) {
@@ -495,7 +522,7 @@ public class CrawlerMain extends ProcessMainClass {
//
// This must be synchronized as chewing through parquet files in parallel leads to enormous memory overhead
private synchronized Path migrateParquetData(Path inputPath, String domain, Path crawlDataRoot) throws IOException {
if (!inputPath.endsWith(".parquet")) {
if (!inputPath.toString().endsWith(".parquet")) {
return inputPath;
}

View File

@@ -9,6 +9,7 @@ import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.time.Instant;
import java.util.Objects;
import java.util.Optional;
/** Supplemental sqlite database for storing the summary of a crawl.
@@ -99,7 +100,7 @@ public class DomainStateDb implements AutoCloseable {
VALUES(?, ?, ?)
""")) {
stmt.setString(1, domain);
stmt.setString(2, faviconRecord.contentType);
stmt.setString(2, Objects.requireNonNullElse(faviconRecord.contentType, "application/octet-stream"));
stmt.setBytes(3, faviconRecord.imageData);
stmt.executeUpdate();
}

View File

@@ -45,6 +45,7 @@ public class HttpFetcherImpl implements HttpFetcher {
private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
private final Duration requestTimeout = Duration.ofSeconds(10);
private final Duration probeTimeout = Duration.ofSeconds(30);
@Override
public void setAllowAllContentTypes(boolean allowAllContentTypes) {
@@ -107,12 +108,13 @@ public class HttpFetcherImpl implements HttpFetcher {
.HEAD()
.uri(url.asURI())
.header("User-agent", userAgentString)
.timeout(requestTimeout)
.timeout(probeTimeout)
.build();
} catch (URISyntaxException e) {
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
}
for (int tries = 0;; tries++) {
try {
var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
EdgeUrl rspUri = new EdgeUrl(rsp.uri());
@@ -121,10 +123,13 @@ public class HttpFetcherImpl implements HttpFetcher {
return new DomainProbeResult.Redirect(rspUri.domain);
}
return new DomainProbeResult.Ok(rspUri);
}
catch (Exception ex) {
} catch (Exception ex) {
if (tries > 3) {
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
}
// else try again ...
}
}
}
/** Perform a HEAD request to fetch the content type of a URL.
@@ -143,7 +148,7 @@ public class HttpFetcherImpl implements HttpFetcher {
var headBuilder = HttpRequest.newBuilder()
.HEAD()
.uri(url.asURI())
.header("User-agent", userAgentString)
.header("User-Agent", userAgentString)
.header("Accept-Encoding", "gzip")
.timeout(requestTimeout)
;
@@ -215,7 +220,7 @@ public class HttpFetcherImpl implements HttpFetcher {
var getBuilder = HttpRequest.newBuilder()
.GET()
.uri(url.asURI())
.header("User-agent", userAgentString)
.header("User-Agent", userAgentString)
.header("Accept-Encoding", "gzip")
.header("Accept-Language", "en,*;q=0.5")
.header("Accept", "text/html, application/xhtml+xml, text/*;q=0.8")
@@ -246,6 +251,7 @@ public class HttpFetcherImpl implements HttpFetcher {
return new SitemapRetriever();
}
/** Recursively fetch sitemaps */
@Override
public List<EdgeUrl> fetchSitemapUrls(String root, CrawlDelayTimer delayTimer) {
try {
@@ -265,7 +271,7 @@ public class HttpFetcherImpl implements HttpFetcher {
while (!sitemapQueue.isEmpty() && ret.size() < 20_000 && ++fetchedSitemaps < 10) {
var head = sitemapQueue.removeFirst();
switch (fetchSitemap(head)) {
switch (fetchSingleSitemap(head)) {
case SitemapResult.SitemapUrls(List<String> urls) -> {
for (var url : urls) {
@@ -301,13 +307,13 @@ public class HttpFetcherImpl implements HttpFetcher {
}
private SitemapResult fetchSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
HttpRequest getRequest = HttpRequest.newBuilder()
.GET()
.uri(sitemapUrl.asURI())
.header("Accept-Encoding", "gzip")
.header("Accept", "text/*, */*;q=0.9")
.header("User-agent", userAgentString)
.header("User-Agent", userAgentString)
.timeout(requestTimeout)
.build();
@@ -386,7 +392,7 @@ public class HttpFetcherImpl implements HttpFetcher {
.uri(url.asURI())
.header("Accept-Encoding", "gzip")
.header("Accept", "text/*, */*;q=0.9")
.header("User-agent", userAgentString)
.header("User-Agent", userAgentString)
.timeout(requestTimeout);
HttpFetchResult result = recorder.fetch(client, getRequest.build());

View File

@@ -44,6 +44,14 @@ public class DomainLocks {
return new Semaphore(2);
}
public boolean canLock(EdgeDomain domain) {
Semaphore sem = locks.get(domain.topDomain.toLowerCase());
if (null == sem)
return true;
else
return sem.availablePermits() > 0;
}
public static class DomainLock implements AutoCloseable {
private final String domainName;
private final Semaphore semaphore;

View File

@@ -4,6 +4,7 @@ import nu.marginalia.ContentTypes;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.lsh.EasyLSH;
import nu.marginalia.model.crawldata.CrawledDocument;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -11,51 +12,73 @@ import javax.annotation.Nullable;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.Objects;
import java.util.Optional;
/** A reference to a domain that has been crawled before. */
public class CrawlDataReference implements AutoCloseable {
public class CrawlDataReference implements AutoCloseable, Iterable<CrawledDocument> {
private boolean closed = false;
@Nullable
private final Path path;
@Nullable
private SerializableCrawlDataStream data = null;
private final SerializableCrawlDataStream data;
private static final Logger logger = LoggerFactory.getLogger(CrawlDataReference.class);
public CrawlDataReference(SerializableCrawlDataStream data) {
this.data = data;
public CrawlDataReference(@Nullable Path path) {
this.path = path;
}
public CrawlDataReference() {
this(SerializableCrawlDataStream.empty());
this(null);
}
/** Delete the associated data from disk, if it exists */
public void delete() throws IOException {
Path filePath = data.path();
if (filePath != null) {
Files.deleteIfExists(filePath);
if (path != null) {
Files.deleteIfExists(path);
}
}
/** Get the next document from the crawl data,
* returning null when there are no more documents
* available
*/
@Nullable
public CrawledDocument nextDocument() {
public @NotNull Iterator<CrawledDocument> iterator() {
requireStream();
// Guaranteed by requireStream, but helps java
Objects.requireNonNull(data);
return data.map(next -> {
if (next instanceof CrawledDocument doc && ContentTypes.isAccepted(doc.contentType)) {
return Optional.of(doc);
}
else {
return Optional.empty();
}
});
}
/** After calling this method, data is guaranteed to be non-null */
private void requireStream() {
if (closed) {
throw new IllegalStateException("Use after close()");
}
if (data == null) {
try {
while (data.hasNext()) {
if (data.next() instanceof CrawledDocument doc) {
if (!ContentTypes.isAccepted(doc.contentType))
continue;
return doc;
if (path != null) {
data = SerializableCrawlDataStream.openDataStream(path);
return;
}
}
}
catch (IOException ex) {
logger.error("Failed to read next document", ex);
catch (Exception ex) {
logger.error("Failed to open stream", ex);
}
return null;
data = SerializableCrawlDataStream.empty();
}
}
public static boolean isContentBodySame(byte[] one, byte[] other) {
@@ -98,7 +121,12 @@ public class CrawlDataReference implements AutoCloseable {
}
@Override
public void close() throws Exception {
public void close() throws IOException {
if (!closed) {
if (data != null) {
data.close();
}
closed = true;
}
}
}

View File

@@ -89,47 +89,23 @@ public class CrawlerRetreiver implements AutoCloseable {
}
public int crawlDomain(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
try {
try (oldCrawlData) {
// Do an initial domain probe to determine the root URL
EdgeUrl rootUrl;
var probeResult = probeRootUrl();
switch (probeResult) {
return switch (probeResult) {
case HttpFetcher.DomainProbeResult.Ok(EdgeUrl probedUrl) -> {
rootUrl = probedUrl; // Good track
}
case HttpFetcher.DomainProbeResult.Redirect(EdgeDomain domain1) -> {
domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, "Redirect", domain1.toString()));
return 1;
}
case HttpFetcher.DomainProbeResult.Error(CrawlerDomainStatus status, String desc) -> {
domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, status.toString(), desc));
return 1;
}
}
// Sleep after the initial probe, we don't have access to the robots.txt yet
// so we don't know the crawl delay
TimeUnit.SECONDS.sleep(1);
return crawlDomain(oldCrawlData, rootUrl, domainLinks);
}
catch (Exception ex) {
logger.error("Error crawling domain {}", domain, ex);
return 0;
}
}
private int crawlDomain(CrawlDataReference oldCrawlData,
EdgeUrl rootUrl,
DomainLinks domainLinks) throws InterruptedException {
final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(rootUrl.domain, warcRecorder);
final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(probedUrl.domain, warcRecorder);
final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
delayTimer.waitFetchDelay(0); // initial delay after robots.txt
DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(rootUrl, delayTimer);
DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(probedUrl, delayTimer);
domainStateDb.save(summaryRecord);
// Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
@@ -138,10 +114,36 @@ public class CrawlerRetreiver implements AutoCloseable {
crawlFrontier.increaseDepth(1.5, 2500);
}
oldCrawlData.close(); // proactively close the crawl data reference here to not hold onto expensive resources
yield crawlDomain(probedUrl, robotsRules, delayTimer, domainLinks);
}
case HttpFetcher.DomainProbeResult.Redirect(EdgeDomain domain1) -> {
domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, "Redirect", domain1.toString()));
yield 1;
}
case HttpFetcher.DomainProbeResult.Error(CrawlerDomainStatus status, String desc) -> {
domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, status.toString(), desc));
yield 1;
}
};
}
catch (Exception ex) {
logger.error("Error crawling domain {}", domain, ex);
return 0;
}
}
private int crawlDomain(EdgeUrl rootUrl,
SimpleRobotRules robotsRules,
CrawlDelayTimer delayTimer,
DomainLinks domainLinks) {
// Add external links to the crawl frontier
crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));
// Fetch sitemaps
for (var sitemap : robotsRules.getSitemaps()) {
crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
@@ -379,8 +381,10 @@ public class CrawlerRetreiver implements AutoCloseable {
if (docOpt.isPresent()) {
var doc = docOpt.get();
crawlFrontier.enqueueLinksFromDocument(top, doc);
crawlFrontier.addVisited(new EdgeUrl(ok.uri()));
var responseUrl = new EdgeUrl(ok.uri());
crawlFrontier.enqueueLinksFromDocument(responseUrl, doc);
crawlFrontier.addVisited(responseUrl);
}
}
else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) {

View File

@@ -40,18 +40,12 @@ public class CrawlerRevisitor {
int errors = 0;
int skipped = 0;
for (;;) {
for (CrawledDocument doc : oldCrawlData) {
if (errors > 20) {
// If we've had too many errors, we'll stop trying to recrawl
break;
}
CrawledDocument doc = oldCrawlData.nextDocument();
if (doc == null)
break;
// This Shouldn't Happen (TM)
var urlMaybe = EdgeUrl.parse(doc.url);
if (urlMaybe.isEmpty())
continue;

View File

@@ -1,41 +0,0 @@
package nu.marginalia.io;
import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream;
import nu.marginalia.io.crawldata.format.SlopSerializableCrawlDataStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
public class CrawledDomainReader {
private static final Logger logger = LoggerFactory.getLogger(CrawledDomainReader.class);
/** An iterator-like access to domain data This must be closed otherwise it will leak off-heap memory! */
public static SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException
{
String fileName = fullPath.getFileName().toString();
if (fileName.endsWith(".parquet")) {
try {
return new ParquetSerializableCrawlDataStream(fullPath);
} catch (Exception ex) {
logger.error("Error reading domain data from " + fullPath, ex);
return SerializableCrawlDataStream.empty();
}
}
if (fileName.endsWith(".slop.zip")) {
try {
return new SlopSerializableCrawlDataStream(fullPath);
} catch (Exception ex) {
logger.error("Error reading domain data from " + fullPath, ex);
return SerializableCrawlDataStream.empty();
}
}
logger.error("Unknown file type: {}", fullPath);
return SerializableCrawlDataStream.empty();
}
}

View File

@@ -1,5 +1,7 @@
package nu.marginalia.io;
import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream;
import nu.marginalia.io.crawldata.format.SlopSerializableCrawlDataStream;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.crawldata.CrawledDomain;
import nu.marginalia.model.crawldata.SerializableCrawlData;
@@ -18,7 +20,6 @@ import java.util.function.Function;
/** Closable iterator exceptional over serialized crawl data
* The data may appear in any order, and the iterator must be closed.
*
* @see CrawledDomainReader
* */
public interface SerializableCrawlDataStream extends AutoCloseable {
Logger logger = LoggerFactory.getLogger(SerializableCrawlDataStream.class);
@@ -27,13 +28,60 @@ public interface SerializableCrawlDataStream extends AutoCloseable {
/** Return a size hint for the stream. 0 is returned if the hint is not available,
* or if the file seems too small to bother with */
default int sizeHint() { return 0; }
default int getSizeHint() { return 0; }
boolean hasNext() throws IOException;
@Nullable
default Path path() { return null; }
void close() throws IOException;
/** An iterator-like access to domain data. This must be closed, otherwise it will leak off-heap memory! */
static SerializableCrawlDataStream openDataStream(Path fullPath) throws IOException
{
String fileName = fullPath.getFileName().toString();
if (fileName.endsWith(".slop.zip")) {
try {
return new SlopSerializableCrawlDataStream(fullPath);
} catch (Exception ex) {
logger.error("Error reading domain data from " + fullPath, ex);
return SerializableCrawlDataStream.empty();
}
}
else if (fileName.endsWith(".parquet")) {
logger.error("Opening deprecated parquet-style crawl data stream", new Exception());
try {
return new ParquetSerializableCrawlDataStream(fullPath);
} catch (Exception ex) {
logger.error("Error reading domain data from " + fullPath, ex);
return SerializableCrawlDataStream.empty();
}
}
logger.error("Unknown file type: {}", fullPath);
return SerializableCrawlDataStream.empty();
}
/** Get an indication of the size of the stream. This is used to determine whether to
* load the stream into memory or not. 0 is returned if the hint is not available,
* or if the file seems too small to bother with */
static int getSizeHint(Path fullPath) {
String fileName = fullPath.getFileName().toString();
if (fileName.endsWith(".parquet")) {
return ParquetSerializableCrawlDataStream.sizeHint(fullPath);
}
else if (fileName.endsWith(".slop.zip")) {
return SlopSerializableCrawlDataStream.sizeHint(fullPath);
}
else {
return 0;
}
}
default <T> Iterator<T> map(Function<SerializableCrawlData, Optional<T>> mapper) {
return new Iterator<>() {
T next = null;
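Taken together, these new static helpers on SerializableCrawlDataStream take over the role of the deleted CrawledDomainReader. A minimal usage sketch, assuming a hypothetical crawl data path (the same pattern appears in the updated tests further down):

import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.model.crawldata.CrawledDocument;

import java.nio.file.Path;

class OpenDataStreamExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical input path; .slop.zip is the current format, .parquet is deprecated but still readable
        Path crawlData = Path.of("/data/crawl/example.com.slop.zip");

        // Cheap size estimate that does not require opening the stream
        System.out.println("size hint: " + SerializableCrawlDataStream.getSizeHint(crawlData));

        // The stream must be closed, or it will leak off-heap memory
        try (var stream = SerializableCrawlDataStream.openDataStream(crawlData)) {
            while (stream.hasNext()) {
                if (stream.next() instanceof CrawledDocument doc) {
                    System.out.println(doc.url);
                }
            }
        }
    }
}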

View File

@@ -40,7 +40,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
return path;
}
public int sizeHint() {
public static int sizeHint(Path path) {
// Only calculate size hint for large files
// (the reason we calculate them in the first place is to assess whether it is large
// because it has many documents, or because it is a small number of large documents)

View File

@@ -52,7 +52,7 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
return path;
}
public int sizeHint() {
public static int sizeHint(Path path) {
// Only calculate size hint for large files
// (the reason we calculate them in the first place is to assess whether it is large
// because it has many documents, or because it is a small number of large documents)

View File

@@ -12,8 +12,7 @@ import java.io.InputStream;
import java.net.InetAddress;
import java.net.URI;
import java.net.http.HttpHeaders;
import java.util.Arrays;
import java.util.Optional;
import java.util.*;
/* FIXME: This interface has a very unfortunate name that is not very descriptive.
*/
@@ -65,7 +64,21 @@ public sealed interface HttpFetchResult {
) implements HttpFetchResult {
public ResultOk(URI uri, int status, MessageHeaders headers, String ipAddress, byte[] bytes, int bytesStart, int length) {
this(uri, status, HttpHeaders.of(headers.map(), (k,v) -> true), ipAddress, bytes, bytesStart, length);
this(uri, status, convertHeaders(headers), ipAddress, bytes, bytesStart, length);
}
private static HttpHeaders convertHeaders(MessageHeaders messageHeaders) {
Map<String, List<String>> inputMap = messageHeaders.map();
Map<String, List<String>> filteredMap = new HashMap<>(Math.max(4, inputMap.size()));
inputMap.forEach((k, v) -> {
if (k.isBlank()) return;
if (!Character.isAlphabetic(k.charAt(0))) return;
filteredMap.put(k, v);
});
return HttpHeaders.of(filteredMap, (k,v) -> true);
}
public boolean isOk() {
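For context on why convertHeaders filters at all: java.net.http.HttpHeaders.of validates field names and throws IllegalArgumentException for names that are not legal HTTP tokens, whereas the raw warc-side header map may carry blank names or HTTP/2-style pseudo-headers. A small sketch of the same filtering rule, with ":status" used purely as an illustrative bad name:

import java.net.http.HttpHeaders;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

class HeaderSanitizationSketch {
    public static void main(String[] args) {
        Map<String, List<String>> raw = Map.of(
                ":status", List.of("200"),            // pseudo-header; HttpHeaders.of would reject the name
                "Content-Type", List.of("text/html")  // ordinary header, kept
        );

        // Same rule as convertHeaders(): drop blank names and names not starting with a letter
        Map<String, List<String>> filtered = new HashMap<>();
        raw.forEach((k, v) -> {
            if (k.isBlank() || !Character.isAlphabetic(k.charAt(0))) return;
            filtered.put(k, v);
        });

        System.out.println(HttpHeaders.of(filtered, (k, v) -> true).map());
    }
}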

View File

@@ -59,9 +59,12 @@ public final class CrawledDocument implements SerializableCrawlData {
}
public Document parseBody() throws IOException {
// Prevent stalls from parsing excessively large documents
return DocumentBodyToString.getParsedData(
ContentType.parse(contentType),
documentBodyBytes,
200_000,
url);
}
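The new third argument bounds how much of the body is handed to the HTML parser. The real DocumentBodyToString.getParsedData also takes the declared content type and charset into account; a rough sketch of just the capping idea (not the actual helper) would be:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.nio.charset.StandardCharsets;

class BoundedParseSketch {
    static Document parseCapped(byte[] body, int maxLength, String baseUrl) {
        // Feed at most maxLength bytes to the parser to bound work on huge documents
        int effectiveLength = Math.min(body.length, maxLength);
        String html = new String(body, 0, effectiveLength, StandardCharsets.UTF_8);
        return Jsoup.parse(html, baseUrl);
    }
}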

View File

@@ -108,8 +108,10 @@ public record SlopCrawlDataRecord(String domain,
public static void convertFromParquet(Path parquetInput, Path slopOutput) throws IOException {
Path tempDir = Files.createTempDirectory(slopOutput.getParent(), "conversion");
try (var writer = new Writer(tempDir)) {
CrawledDocumentParquetRecordFileReader.stream(parquetInput).forEach(
try (var writer = new Writer(tempDir);
var stream = CrawledDocumentParquetRecordFileReader.stream(parquetInput))
{
stream.forEach(
parquetRecord -> {
try {
writer.write(new SlopCrawlDataRecord(parquetRecord));
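The point of widening the try-with-resources is that the Stream returned by the parquet reader is file-backed and holds resources of its own, so it needs closing just like the writer. The same pattern, with Files.lines standing in for the parquet reader:

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.stream.Stream;

class StreamClosingSketch {
    static void convert(Path input) throws IOException {
        // Everything declared in the header, including the Stream, is closed even if the body throws
        try (Stream<String> lines = Files.lines(input)) {
            lines.forEach(System.out::println);
        }
    }
}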

View File

@@ -10,7 +10,6 @@ import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.*;
import nu.marginalia.io.CrawledDomainReader;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
@@ -227,7 +226,7 @@ class CrawlerRetreiverTest {
convertToParquet(tempFileWarc1, tempFileParquet1);
try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
try (var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1)) {
while (stream.hasNext()) {
if (stream.next() instanceof CrawledDocument doc) {
data.add(doc);
@@ -280,7 +279,7 @@ class CrawlerRetreiverTest {
convertToParquet(tempFileWarc1, tempFileParquet1);
try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
try (var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1)) {
while (stream.hasNext()) {
if (stream.next() instanceof CrawledDocument doc) {
data.add(doc);
@@ -329,7 +328,7 @@ class CrawlerRetreiverTest {
doCrawl(tempFileWarc1, specs);
convertToParquet(tempFileWarc1, tempFileParquet1);
try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
try (var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1)) {
while (stream.hasNext()) {
if (stream.next() instanceof CrawledDocument doc) {
data.add(doc);
@@ -376,7 +375,7 @@ class CrawlerRetreiverTest {
doCrawl(tempFileWarc1, specs);
convertToParquet(tempFileWarc1, tempFileParquet1);
doCrawlWithReferenceStream(specs,
CrawledDomainReader.createDataStream(tempFileParquet1)
new CrawlDataReference(tempFileParquet1)
);
convertToParquet(tempFileWarc2, tempFileParquet2);
@@ -397,7 +396,7 @@ class CrawlerRetreiverTest {
});
}
try (var ds = CrawledDomainReader.createDataStream(tempFileParquet2)) {
try (var ds = SerializableCrawlDataStream.openDataStream(tempFileParquet2)) {
while (ds.hasNext()) {
var doc = ds.next();
if (doc instanceof CrawledDomain dr) {
@@ -439,7 +438,7 @@ class CrawlerRetreiverTest {
convertToParquet(tempFileWarc1, tempFileParquet1);
try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
try (var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1)) {
while (stream.hasNext()) {
var doc = stream.next();
data.computeIfAbsent(doc.getClass(), c -> new ArrayList<>()).add(doc);
@@ -448,11 +447,9 @@ class CrawlerRetreiverTest {
throw new RuntimeException(e);
}
var stream = CrawledDomainReader.createDataStream(tempFileParquet1);
System.out.println("---");
doCrawlWithReferenceStream(specs, stream);
doCrawlWithReferenceStream(specs, new CrawlDataReference(tempFileParquet1));
var revisitCrawlFrontier = new DomainCrawlFrontier(
new EdgeDomain("www.marginalia.nu"),
@@ -488,7 +485,7 @@ class CrawlerRetreiverTest {
});
}
try (var ds = CrawledDomainReader.createDataStream(tempFileParquet2)) {
try (var ds = SerializableCrawlDataStream.openDataStream(tempFileParquet2)) {
while (ds.hasNext()) {
var doc = ds.next();
if (doc instanceof CrawledDomain dr) {
@@ -509,12 +506,11 @@ class CrawlerRetreiverTest {
}
}
private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, CrawlDataReference reference) {
try (var recorder = new WarcRecorder(tempFileWarc2, new Cookies());
var db = new DomainStateDb(tempFileDb)
) {
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(),
new CrawlDataReference(stream));
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(), reference);
}
catch (IOException | SQLException ex) {
Assertions.fail(ex);

View File

@@ -3,7 +3,6 @@ package nu.marginalia.extractor;
import com.google.inject.Inject;
import gnu.trove.set.hash.TLongHashSet;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.io.CrawledDomainReader;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
@@ -59,7 +58,7 @@ public class AtagExporter implements ExporterIf {
}
Path crawlDataPath = inputDir.resolve(item.relPath());
try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
try (var stream = SerializableCrawlDataStream.openDataStream(crawlDataPath)) {
exportLinks(tagWriter, stream);
}
catch (Exception ex) {

View File

@@ -1,7 +1,6 @@
package nu.marginalia.extractor;
import com.google.inject.Inject;
import nu.marginalia.io.CrawledDomainReader;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.link_parser.FeedExtractor;
import nu.marginalia.link_parser.LinkParser;
@@ -56,7 +55,7 @@ public class FeedExporter implements ExporterIf {
}
Path crawlDataPath = inputDir.resolve(item.relPath());
try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
try (var stream = SerializableCrawlDataStream.openDataStream(crawlDataPath)) {
exportFeeds(tagWriter, stream);
}
catch (Exception ex) {
@@ -75,7 +74,7 @@ public class FeedExporter implements ExporterIf {
private boolean exportFeeds(FeedCsvWriter exporter, SerializableCrawlDataStream stream) throws IOException, URISyntaxException {
FeedExtractor feedExtractor = new FeedExtractor(new LinkParser());
int size = stream.sizeHint();
int size = stream.getSizeHint();
while (stream.hasNext()) {
if (!(stream.next() instanceof CrawledDocument doc))

View File

@@ -5,7 +5,7 @@ import gnu.trove.map.hash.TLongIntHashMap;
import gnu.trove.set.hash.TLongHashSet;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.io.CrawledDomainReader;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.language.filter.LanguageFilter;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.sentence.SentenceExtractor;
@@ -103,7 +103,7 @@ public class TermFrequencyExporter implements ExporterIf {
{
TLongHashSet words = new TLongHashSet(1000);
try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
try (var stream = SerializableCrawlDataStream.openDataStream(crawlDataPath)) {
while (stream.hasNext()) {
if (Thread.interrupted())
return;

View File

@@ -228,7 +228,7 @@ public class LiveCrawlDataSet implements AutoCloseable {
}
@Override
public boolean hasNext() throws IOException {
public boolean hasNext() {
if (dataStack == null) {
query();
}
@@ -236,7 +236,7 @@ public class LiveCrawlDataSet implements AutoCloseable {
}
@Override
public void close() throws Exception {
public void close() {
dataStack.clear();
}
}

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.3'
id 'com.google.cloud.tools.jib' version '3.4.4'
}
java {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.3'
id 'com.google.cloud.tools.jib' version '3.4.4'
}
application {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.3'
id 'com.google.cloud.tools.jib' version '3.4.4'
}
application {

View File

@@ -5,7 +5,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.3'
id 'com.google.cloud.tools.jib' version '3.4.4'
}
application {

View File

@@ -7,8 +7,7 @@ import java.util.Arrays;
public enum SearchJsParameter {
DEFAULT("default"),
DENY_JS("no-js", "js:true"),
REQUIRE_JS("yes-js", "js:false");
DENY_JS("no-js", "special:scripts");
public final String value;
public final String[] implictExcludeSearchTerms;
@@ -20,7 +19,6 @@ public enum SearchJsParameter {
public static SearchJsParameter parse(@Nullable String value) {
if (DENY_JS.value.equals(value)) return DENY_JS;
if (REQUIRE_JS.value.equals(value)) return REQUIRE_JS;
return DEFAULT;
}

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'gg.jte.gradle' version '3.1.15'
id 'com.google.cloud.tools.jib' version '3.4.3'
id 'com.google.cloud.tools.jib' version '3.4.4'
}
application {
@@ -104,6 +104,8 @@ task compileTailwind {
doLast {
exec {
// If you're getting a build error like 'npm error could not determine executable to run'
// pointing you here, you need to run `npm install -D tailwindcss`
workingDir projectDir
if (System.getProperty('os.name').toLowerCase().contains('windows')) {
commandLine 'cmd', '/c', 'npx', 'tailwindcss',

View File

@@ -3,8 +3,10 @@ package nu.marginalia.search;
import com.google.inject.Inject;
import io.jooby.Context;
import io.jooby.Jooby;
import io.jooby.StatusCode;
import io.prometheus.client.Counter;
import io.prometheus.client.Histogram;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.search.svc.*;
import nu.marginalia.service.discovery.property.ServicePartition;
import nu.marginalia.service.server.BaseServiceParams;
@@ -16,6 +18,7 @@ import java.util.List;
public class SearchService extends JoobyService {
private final WebsiteUrl websiteUrl;
private final SearchSiteSubscriptionService siteSubscriptionService;
private static final Logger logger = LoggerFactory.getLogger(SearchService.class);
@@ -33,6 +36,7 @@ public class SearchService extends JoobyService {
@Inject
public SearchService(BaseServiceParams params,
WebsiteUrl websiteUrl,
SearchFrontPageService frontPageService,
SearchAddToCrawlQueueService addToCrawlQueueService,
SearchSiteSubscriptionService siteSubscriptionService,
@@ -51,6 +55,7 @@ public class SearchService extends JoobyService {
new SearchAddToCrawlQueueService_(addToCrawlQueueService),
new SearchBrowseService_(searchBrowseService)
));
this.websiteUrl = websiteUrl;
this.siteSubscriptionService = siteSubscriptionService;
}
@@ -62,6 +67,10 @@ public class SearchService extends JoobyService {
final String startTimeAttribute = "start-time";
jooby.get("/export-opml", siteSubscriptionService::exportOpml);
jooby.get("/site/https://*", this::handleSiteUrlRedirect);
jooby.get("/site/http://*", this::handleSiteUrlRedirect);
jooby.before((Context ctx) -> {
ctx.setAttribute(startTimeAttribute, System.nanoTime());
});
@@ -80,5 +89,19 @@ public class SearchService extends JoobyService {
});
}
/** Redirect handler for the case when the user passes
* a URL like /site/https://example.com/. In this
* scenario we want to extract the domain name and redirect
* to /site/example.com/
*/
private Context handleSiteUrlRedirect(Context ctx) {
var pv = ctx.path("*").value();
int trailSlash = pv.indexOf('/');
if (trailSlash > 0) {
pv = pv.substring(0, trailSlash);
}
ctx.sendRedirect(StatusCode.TEMPORARY_REDIRECT, websiteUrl.withPath("site/" + pv));
return ctx;
}
}
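A small standalone sketch of the trimming, with example.com used purely for illustration: a request to /site/https://www.example.com/some/page reaches the handler with the wildcard value "www.example.com/some/page", and only the host part is kept for the redirect target.

class SiteRedirectSketch {
    // Mirrors the substring logic in handleSiteUrlRedirect
    static String redirectPath(String wildcardValue) {
        int trailSlash = wildcardValue.indexOf('/');
        if (trailSlash > 0) {
            wildcardValue = wildcardValue.substring(0, trailSlash);
        }
        return "site/" + wildcardValue;
    }

    public static void main(String[] args) {
        System.out.println(redirectPath("www.example.com/some/page")); // site/www.example.com
        System.out.println(redirectPath("www.example.com"));           // site/www.example.com
    }
}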

View File

@@ -7,9 +7,7 @@ import java.util.Arrays;
public enum SearchJsParameter {
DEFAULT("default"),
DENY_JS("no-js", "js:true"),
REQUIRE_JS("yes-js", "js:false");
DENY_JS("no-js", "special:scripts");
public final String value;
public final String[] implictExcludeSearchTerms;
@@ -20,7 +18,6 @@ public enum SearchJsParameter {
public static SearchJsParameter parse(@Nullable String value) {
if (DENY_JS.value.equals(value)) return DENY_JS;
if (REQUIRE_JS.value.equals(value)) return REQUIRE_JS;
return DEFAULT;
}

View File

@@ -86,8 +86,10 @@ public record SearchParameters(WebsiteUrl url,
public String renderUrl() {
StringBuilder pathBuilder = new StringBuilder("/search?");
pathBuilder.append("query=").append(URLEncoder.encode(query, StandardCharsets.UTF_8));
if (query != null) {
pathBuilder.append("query=").append(URLEncoder.encode(query, StandardCharsets.UTF_8));
}
if (profile != SearchProfile.NO_FILTER) {
pathBuilder.append("&profile=").append(URLEncoder.encode(profile.filterId, StandardCharsets.UTF_8));
}

View File

@@ -67,6 +67,10 @@ public class DecoratedSearchResults {
return focusDomainId >= 0;
}
public boolean isEmpty() {
return results.isEmpty();
}
public SearchFilters getFilters() {
return filters;
}

View File

@@ -56,7 +56,9 @@ public class SearchQueryService {
}
catch (Exception ex) {
logger.error("Error", ex);
return errorPageService.serveError(SearchParameters.defaultsForQuery(websiteUrl, query, page));
return errorPageService.serveError(
SearchParameters.defaultsForQuery(websiteUrl, query, Objects.requireNonNullElse(page, 1))
);
}
}

View File

@@ -9,7 +9,7 @@
<span>
Access logs containing IP-addresses are retained for up to 24 hours,
anonymized logs with source addresses removed are sometimes kept longer
for to help diagnosing bugs.
to help diagnose bugs.
</span>
</div>
<div class="flex space-y-4 flex-col">

View File

@@ -44,6 +44,11 @@
<div class="grow"></div>
<a href="${results.getParams().renderUrlWithoutSiteFocus()}" class="fa fa-remove"></a>
</div>
@elseif (results.isEmpty())
<div class="border dark:border-gray-600 rounded flex space-x-4 bg-white dark:bg-gray-800 text-gray-600 dark:text-gray-100 text-sm p-4 items-center">
No search results found. Try different search terms, or spelling variations. The search engine currently
only supports queries in the English language.
</div>
@endif
<div class="space-y-4 sm:space-y-6">

View File

@@ -86,7 +86,7 @@
@endif
@if(result.getFirst().isTracking())
<span class="px-1 bg-yellow-100 text-yellow-700 dark:border dark:border-yellow-600 dark:text-yellow-400 dark:bg-black rounded" title="Uses tracking scripts">Track</span>
<span class="px-1 bg-yellow-100 text-yellow-700 dark:border dark:border-yellow-600 dark:text-yellow-400 dark:bg-black rounded" title="Uses tracking scripts">Tracking</span>
@endif
@if(result.getFirst().isScripts())
@@ -94,11 +94,11 @@
@endif
@if(result.getFirst().isAds())
<span class="px-1 bg-red-100 text-red-700 dark:border dark:border-red-600 dark:text-red-400 dark:bg-black rounded" title="Contains adtech">Ads</span>
<span class="px-1 bg-red-100 text-red-700 dark:border dark:border-red-600 dark:text-red-400 dark:bg-black rounded" title="Contains adtech">Has Ads</span>
@endif
@if(result.getFirst().isAffiliate())
<span class="px-1 bg-red-100 text-red-700 dark:border dark:border-red-600 dark:text-red-400 dark:bg-black rounded" title="Contains Affiliate Link">Affiliate</span>
<span class="px-1 bg-red-100 text-red-700 dark:border dark:border-red-600 dark:text-red-400 dark:bg-black rounded" title="Contains Affiliate Link">Has Affiliate</span>
@endif
</span>

View File

@@ -53,7 +53,7 @@
@endif
@if(details.isTracking())
<span class="px-1 bg-yellow-100 text-yellow-700 dark:border dark:border-yellow-600 dark:text-yellow-400 dark:bg-black rounded" title="Uses tracking scripts">Track</span>
<span class="px-1 bg-yellow-100 text-yellow-700 dark:border dark:border-yellow-600 dark:text-yellow-400 dark:bg-black rounded" title="Uses tracking scripts">Tracking</span>
@endif
@if(details.isScripts())
@@ -65,7 +65,7 @@
@endif
@if(details.isAffiliate())
<span class="px-1 bg-red-100 text-red-700 dark:border dark:border-red-600 dark:text-red-400 dark:bg-black rounded" title="Contains Affiliate Link">Affiliate</span>
<span class="px-1 bg-red-100 text-red-700 dark:border dark:border-red-600 dark:text-red-400 dark:bg-black rounded" title="Contains Affiliate Link">Has Affiliate</span>
@endif
</div>

View File

@@ -2,7 +2,7 @@ plugins {
id 'java'
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.3'
id 'com.google.cloud.tools.jib' version '3.4.4'
}
java {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.3'
id 'com.google.cloud.tools.jib' version '3.4.4'
}
application {
@@ -23,7 +23,12 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
apply from: "$rootProject.projectDir/docker.gradle"
dependencies {
implementation project(':third-party:symspell')
implementation project(':code:common:db')
implementation project(':code:common:model')
implementation project(':code:common:service')
implementation project(':code:common:config')
implementation project(':code:functions:live-capture')
implementation project(':code:functions:live-capture:api')
@@ -32,20 +37,16 @@ dependencies {
implementation project(':code:functions:domain-info')
implementation project(':code:functions:domain-info:api')
implementation project(':code:common:config')
implementation project(':code:common:service')
implementation project(':code:common:model')
implementation project(':code:common:db')
implementation project(':code:features-search:screenshots')
implementation project(':code:libraries:geo-ip')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation libs.bundles.slf4j
implementation project(':third-party:symspell')
implementation libs.bundles.slf4j
implementation libs.prometheus
implementation libs.commons.io
implementation libs.guava
libs.bundles.grpc.get().each {
implementation dependencies.create(it) {
@@ -59,9 +60,7 @@ dependencies {
implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava'
}
implementation dependencies.create(libs.spark.get()) {
exclude group: 'org.eclipse.jetty'
}
implementation libs.bundles.jooby
implementation libs.bundles.jetty
implementation libs.opencsv
implementation libs.trove

View File

@@ -3,6 +3,8 @@ package nu.marginalia.assistant;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import io.jooby.ExecutionMode;
import io.jooby.Jooby;
import nu.marginalia.livecapture.LivecaptureModule;
import nu.marginalia.service.MainClass;
import nu.marginalia.service.ServiceId;
@@ -38,8 +40,17 @@ public class AssistantMain extends MainClass {
var configuration = injector.getInstance(ServiceConfiguration.class);
orchestrateBoot(registry, configuration);
injector.getInstance(AssistantMain.class);
var main = injector.getInstance(AssistantMain.class);
injector.getInstance(Initialization.class).setReady();
Jooby.runApp(new String[] { "application.env=prod" }, ExecutionMode.WORKER, () -> new Jooby() {
{
main.start(this);
}
});
}
public void start(Jooby jooby) {
service.startJooby(jooby);
}
}

View File

@@ -2,27 +2,27 @@ package nu.marginalia.assistant;
import com.google.gson.Gson;
import com.google.inject.Inject;
import io.jooby.Context;
import io.jooby.Jooby;
import nu.marginalia.assistant.suggest.Suggestions;
import nu.marginalia.functions.domains.DomainInfoGrpcService;
import nu.marginalia.functions.math.MathGrpcService;
import nu.marginalia.livecapture.LiveCaptureGrpcService;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.rss.svc.FeedsGrpcService;
import nu.marginalia.screenshot.ScreenshotService;
import nu.marginalia.service.discovery.property.ServicePartition;
import nu.marginalia.service.server.BaseServiceParams;
import nu.marginalia.service.server.SparkService;
import nu.marginalia.service.server.JoobyService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import spark.Spark;
import java.util.List;
public class AssistantService extends SparkService {
public class AssistantService extends JoobyService {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final Gson gson = GsonFactory.get();
@org.jetbrains.annotations.NotNull
private final ScreenshotService screenshotService;
private final Suggestions suggestions;
@Inject
@@ -39,30 +39,30 @@ public class AssistantService extends SparkService {
List.of(domainInfoGrpcService,
mathGrpcService,
liveCaptureGrpcService,
feedsGrpcService));
feedsGrpcService),
List.of());
this.screenshotService = screenshotService;
this.suggestions = suggestions;
Spark.staticFiles.expireTime(600);
Spark.get("/screenshot/:id", screenshotService::serveScreenshotRequest);
Spark.get("/suggest/", this::getSuggestions, this::convertToJson);
Spark.awaitInitialization();
}
private Object getSuggestions(Request request, Response response) {
response.type("application/json");
var param = request.queryParams("partial");
if (param == null) {
public void startJooby(Jooby jooby) {
super.startJooby(jooby);
jooby.get("/suggest/", this::getSuggestions);
jooby.get("/screenshot/{id}", screenshotService::serveScreenshotRequest);
}
private String getSuggestions(Context context) {
context.setResponseType("application/json");
var param = context.query("partial");
if (param.isMissing()) {
logger.warn("Bad parameter, partial is null");
Spark.halt(500);
context.setResponseCode(500);
return "{}";
}
return suggestions.getSuggestions(10, param);
}
private String convertToJson(Object o) {
return gson.toJson(o);
return gson.toJson(suggestions.getSuggestions(10, param.value()));
}
}

View File

@@ -0,0 +1,118 @@
package nu.marginalia.assistant;
import com.google.common.base.Strings;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import io.jooby.Context;
import nu.marginalia.db.DbDomainQueries;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.sql.SQLException;
public class ScreenshotService {
private final DbDomainQueries domainQueries;
private final HikariDataSource dataSource;
private final Logger logger = LoggerFactory.getLogger(getClass());
@Inject
public ScreenshotService(DbDomainQueries dbDomainQueries, HikariDataSource dataSource) {
this.domainQueries = dbDomainQueries;
this.dataSource = dataSource;
}
public boolean hasScreenshot(int domainId) {
try (var conn = dataSource.getConnection();
var ps = conn.prepareStatement("""
SELECT TRUE
FROM DATA_DOMAIN_SCREENSHOT
INNER JOIN EC_DOMAIN ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME
WHERE EC_DOMAIN.ID=?
""")) {
ps.setInt(1, domainId);
var rs = ps.executeQuery();
if (rs.next()) {
return rs.getBoolean(1);
}
}
catch (SQLException ex) {
logger.warn("SQL error", ex);
}
return false;
}
public Object serveScreenshotRequest(Context context) {
if (Strings.isNullOrEmpty(context.path("id").value(""))) {
context.setResponseCode(404);
return "";
}
int id = context.path("id").intValue();
try (var conn = dataSource.getConnection();
var ps = conn.prepareStatement("""
SELECT CONTENT_TYPE, DATA
FROM DATA_DOMAIN_SCREENSHOT
INNER JOIN EC_DOMAIN ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME
WHERE EC_DOMAIN.ID=?
""")) {
ps.setInt(1, id);
var rsp = ps.executeQuery();
if (rsp.next()) {
context.setResponseType(rsp.getString(1));
context.setResponseCode(200);
context.setResponseHeader("Cache-control", "public,max-age=3600");
try (var rs = context.responseStream()) {
IOUtils.copy(rsp.getBlob(2).getBinaryStream(), rs);
}
return "";
}
}
catch (IOException ex) {
logger.warn("IO error", ex);
}
catch (SQLException ex) {
logger.warn("SQL error", ex);
}
context.setResponseType("image/svg+xml");
var name = domainQueries.getDomain(id).map(Object::toString)
.orElse("[Screenshot Not Yet Captured]");
return """
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<svg
xmlns="http://www.w3.org/2000/svg"
width="640px"
height="480px"
viewBox="0 0 640 480"
version="1.1">
<g>
<rect
style="fill:#808080"
id="rect288"
width="595.41992"
height="430.01825"
x="23.034981"
y="27.850344" />
<text
xml:space="preserve"
style="font-size:100px;fill:#909090;font-family:sans-serif;"
x="20"
y="120">Placeholder</text>
<text
xml:space="preserve"
style="font-size:32px;fill:#000000;font-family:monospace;"
x="320" y="240" dominant-baseline="middle" text-anchor="middle">%s</text>
</g>
</svg>
""".formatted(name);
}
}

View File

@@ -2,7 +2,7 @@ plugins {
id 'java'
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.3'
id 'com.google.cloud.tools.jib' version '3.4.4'
}
java {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.3'
id 'com.google.cloud.tools.jib' version '3.4.4'
}
application {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.3'
id 'com.google.cloud.tools.jib' version '3.4.4'
}
application {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.3'
id 'com.google.cloud.tools.jib' version '3.4.4'
}
application {

View File

@@ -3,7 +3,7 @@ package nu.marginalia.tools;
import com.google.inject.Guice;
import com.google.inject.Injector;
import nu.marginalia.converting.ConverterModule;
import nu.marginalia.io.CrawledDomainReader;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.service.module.DatabaseModule;
@@ -40,7 +40,7 @@ public class ExperimentRunnerMain {
Path basePath = Path.of(args[0]);
for (var item : WorkLog.iterable(basePath.resolve("crawler.log"))) {
Path crawlDataPath = basePath.resolve(item.relPath());
try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
try (var stream = SerializableCrawlDataStream.openDataStream(crawlDataPath)) {
experiment.process(stream);
}
catch (Exception ex) {

View File

@@ -26,7 +26,7 @@ import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.searchset.SearchSetAny;
import nu.marginalia.io.CrawledDomainReader;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.linkdb.docs.DocumentDbWriter;
import nu.marginalia.loading.LoaderIndexJournalWriter;
@@ -152,7 +152,7 @@ public class IntegrationTest {
/** PROCESS CRAWL DATA */
var processedDomain = domainProcessor.fullProcessing(CrawledDomainReader.createDataStream(crawlDataParquet));
var processedDomain = domainProcessor.fullProcessing(SerializableCrawlDataStream.openDataStream(crawlDataParquet));
System.out.println(processedDomain);

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.3'
id 'com.google.cloud.tools.jib' version '3.4.4'
}
java {

View File

@@ -16,8 +16,6 @@ platforms, but for lack of suitable hardware, this can not be guaranteed.
The civilized way of installing this is to use [SDKMAN](https://sdkman.io/);
graalce is a good distribution choice but it doesn't matter too much.
**Tailwindcss** - Install NPM and run `npm install -D tailwindcss`
## Quick Set up
[https://docs.marginalia.nu/](https://docs.marginalia.nu/) has a more comprehensive guide for the install

View File

@@ -74,3 +74,7 @@ download_model model/tfreq-new-algo3.bin https://huggingface.co/MarginaliaNu/Mar
download_model model/lid.176.ftz https://huggingface.co/MarginaliaNu/MarginaliaModelData/resolve/c9339e4224f1dfad7f628809c32687e748198ae3/lid.176.ftz?download=true 340156704bb8c8e50c4abf35a7ec2569
popd
pushd $(dirname $0)/..
npm install -D tailwindcss@3
popd

View File

@@ -160,12 +160,12 @@ dependencyResolutionManagement {
library('prometheus-server', 'io.prometheus', 'simpleclient_httpserver').version('0.16.0')
library('prometheus-hotspot', 'io.prometheus', 'simpleclient_hotspot').version('0.16.0')
library('slf4j.api', 'org.slf4j', 'slf4j-api').version('1.7.36')
library('slf4j.api', 'org.slf4j', 'slf4j-api').version('2.0.3')
library('slf4j.jdk14', 'org.slf4j', 'slf4j-jdk14').version('2.0.3')
library('log4j.api', 'org.apache.logging.log4j', 'log4j-api').version('2.17.2')
library('log4j.core', 'org.apache.logging.log4j', 'log4j-core').version('2.17.2')
library('log4j.slf4j', 'org.apache.logging.log4j', 'log4j-slf4j-impl').version('2.17.2')
library('log4j.api', 'org.apache.logging.log4j', 'log4j-api').version('2.24.3')
library('log4j.core', 'org.apache.logging.log4j', 'log4j-core').version('2.24.3')
library('log4j.slf4j', 'org.apache.logging.log4j', 'log4j-slf4j2-impl').version('2.24.3')
library('notnull','org.jetbrains','annotations').version('24.0.0')
@@ -234,11 +234,12 @@ dependencyResolutionManagement {
library('jetty-util','org.eclipse.jetty','jetty-util').version('9.4.54.v20240208')
library('jetty-servlet','org.eclipse.jetty','jetty-servlet').version('9.4.54.v20240208')
library('slop', 'nu.marginalia', 'slop').version('0.0.9-org-5-SNAPSHOT')
library('slop', 'nu.marginalia', 'slop').version('0.0.10-SNAPSHOT')
library('jooby-netty','io.jooby','jooby-netty').version(joobyVersion)
library('jooby-jte','io.jooby','jooby-jte').version(joobyVersion)
library('jooby-apt','io.jooby','jooby-apt').version(joobyVersion)
library('wiremock', 'org.wiremock','wiremock').version('3.11.0')
library('jte','gg.jte','jte').version('3.1.15')
bundle('jetty', ['jetty-server', 'jetty-util', 'jetty-servlet'])