Merge pull request #192 from MarginaliaSearch/improve-suggestions

Improve typeahead suggestions
(assistant) Improve typeahead suggestions
2025-10-06 07:32:38 +02:00 · 2025-04-23 20:17:49 +02:00 · 2025-04-23 20:13:53 +02:00 · 2025-04-22 16:03:42 +02:00 · 2025-04-22 15:52:41 +02:00 · 2025-04-22 15:51:03 +02:00
34 changed files with 1120 additions and 467 deletions
--- a/code/execution/java/nu/marginalia/actor/task/DownloadSampleActor.java
+++ b/code/execution/java/nu/marginalia/actor/task/DownloadSampleActor.java
@@ -8,6 +8,7 @@ import nu.marginalia.actor.state.ActorResumeBehavior;
 import nu.marginalia.actor.state.ActorStep;
 import nu.marginalia.actor.state.Resume;
 import nu.marginalia.service.control.ServiceEventLog;
+import nu.marginalia.service.control.ServiceHeartbeat;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorage;
 import nu.marginalia.storage.model.FileStorageId;
@@ -19,6 +20,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.io.*;
+import java.net.HttpURLConnection;
 import java.net.MalformedURLException;
 import java.net.URI;
 import java.net.URL;
@@ -32,6 +34,7 @@ public class DownloadSampleActor extends RecordActorPrototype {

    private final FileStorageService storageService;
    private final ServiceEventLog eventLog;
+    private final ServiceHeartbeat heartbeat;
    private final Logger logger = LoggerFactory.getLogger(getClass());

    @Resume(behavior = ActorResumeBehavior.ERROR)
@@ -66,15 +69,39 @@ public class DownloadSampleActor extends RecordActorPrototype {

                Files.deleteIfExists(Path.of(tarFileName));

-                try (var is = new BufferedInputStream(new URI(downloadURI).toURL().openStream());
-                     var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
-                    is.transferTo(os);
+                HttpURLConnection urlConnection = (HttpURLConnection) new URI(downloadURI).toURL().openConnection();
+
+                try (var hb = heartbeat.createServiceAdHocTaskHeartbeat("Downloading sample")) {
+                    long size = urlConnection.getContentLengthLong();
+                    byte[] buffer = new byte[8192];
+
+                    try (var is = new BufferedInputStream(urlConnection.getInputStream());
+                         var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
+                        long copiedSize = 0;
+
+                        while (copiedSize < size) {
+                            int read = is.read(buffer);
+
+                            if (read < 0) // We've been promised a file of length 'size'
+                                throw new IOException("Unexpected end of stream");
+
+                            os.write(buffer, 0, read);
+                            copiedSize += read;
+
+                            // Update progress bar
+                            hb.progress(String.format("%d MB", copiedSize / 1024 / 1024), (int) (copiedSize / 1024), (int) (size / 1024));
+                        }
+                    }
+
                }
                catch (Exception ex) {
                    eventLog.logEvent(DownloadSampleActor.class, "Error downloading sample");
                    logger.error("Error downloading sample", ex);
                    yield new Error();
                }
+                finally {
+                    urlConnection.disconnect();
+                }

                eventLog.logEvent(DownloadSampleActor.class, "Download complete");
                yield new Extract(fileStorageId, tarFileName);
@@ -170,11 +197,12 @@ public class DownloadSampleActor extends RecordActorPrototype {
    @Inject
    public DownloadSampleActor(Gson gson,
                               FileStorageService storageService,
-                               ServiceEventLog eventLog)
+                               ServiceEventLog eventLog, ServiceHeartbeat heartbeat)
    {
        super(gson);
        this.storageService = storageService;
        this.eventLog = eventLog;
+        this.heartbeat = heartbeat;
    }

 }
--- a/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
+++ b/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
@@ -20,7 +20,6 @@ import nu.marginalia.model.crawldata.CrawledDocument;
 import nu.marginalia.model.crawldata.CrawledDomain;
 import nu.marginalia.model.crawldata.SerializableCrawlData;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
-import org.apache.hc.client5.http.cookie.BasicCookieStore;
 import org.junit.jupiter.api.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -247,7 +246,7 @@ public class CrawlingThenConvertingIntegrationTest {
    private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws Exception {
        List<SerializableCrawlData> data = new ArrayList<>();

-        try (var recorder = new WarcRecorder(fileName, new BasicCookieStore());
+        try (var recorder = new WarcRecorder(fileName);
             var db = new DomainStateDb(dbTempFile))
        {
            new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, db, recorder).crawlDomain();
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
@@ -43,6 +43,7 @@ import java.nio.file.StandardCopyOption;
 import java.security.Security;
 import java.util.*;
 import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;

@@ -66,6 +67,8 @@ public class CrawlerMain extends ProcessMainClass {

    private final Map<String, CrawlTask> pendingCrawlTasks = new ConcurrentHashMap<>();

+    private final LinkedBlockingQueue<CrawlTask> retryQueue = new LinkedBlockingQueue<>();
+
    private final AtomicInteger tasksDone = new AtomicInteger(0);
    private final HttpFetcherImpl fetcher;

@@ -277,12 +280,29 @@ public class CrawlerMain extends ProcessMainClass {
            }

             // Schedule viable tasks for execution until list is empty
-            while (!taskList.isEmpty()) {
-                taskList.removeIf(this::trySubmitDeferredTask);
+            for (int emptyRuns = 0;emptyRuns < 300;) {
+                boolean hasTasks = !taskList.isEmpty();

-                // Add a small pause here to avoid busy looping toward the end of the execution cycle when
-                // we might have no new viable tasks to run for hours on end
-                TimeUnit.MILLISECONDS.sleep(50);
+                // The order of these checks  very important to avoid a race condition
+                // where we miss a task that is put into the retry queue
+                boolean hasRunningTasks = pool.getActiveCount() > 0;
+                boolean hasRetryTasks = !retryQueue.isEmpty();
+
+                if (hasTasks || hasRetryTasks || hasRunningTasks) {
+                    retryQueue.drainTo(taskList);
+
+                    // Try to submit any tasks that are in the retry queue (this will block if the pool is full)
+                    taskList.removeIf(this::trySubmitDeferredTask);
+
+                    // Add a small pause here to avoid busy looping toward the end of the execution cycle when
+                    // we might have no new viable tasks to run for hours on end
+                    TimeUnit.MILLISECONDS.sleep(5);
+                } else {
+                    // We have no tasks to run, and no tasks in the retry queue
+                    // but we wait a bit to see if any new tasks come in via the retry queue
+                    emptyRuns++;
+                    TimeUnit.SECONDS.sleep(1);
+                }
            }

            logger.info("Shutting down the pool, waiting for tasks to complete...");
@@ -414,7 +434,7 @@ public class CrawlerMain extends ProcessMainClass {
        /** Best effort indicator whether we could start this now without getting stuck in
         * DomainLocks purgatory */
        public boolean canRun() {
-            return domainLocks.canLock(new EdgeDomain(domain));
+            return domainLocks.isLockableHint(new EdgeDomain(domain));
        }

        @Override
@@ -425,66 +445,82 @@ public class CrawlerMain extends ProcessMainClass {
                return;
            }

-            Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
-            Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
-            Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
-
-            // Move the WARC file to a temp file if it exists, so we can resume the crawl using the old data
-            // while writing to the same file name as before
-            if (Files.exists(newWarcFile)) {
-                Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING);
-            }
-            else {
-                Files.deleteIfExists(tempFile);
-            }
-
-            try (var warcRecorder = new WarcRecorder(newWarcFile, fetcher); // write to a temp file for now
-                 var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
-                 CrawlDataReference reference = getReference()
-            )
-            {
-                // Resume the crawl if it was aborted
-                if (Files.exists(tempFile)) {
-                    retriever.syncAbortedRun(tempFile);
-                    Files.delete(tempFile);
+            Optional<DomainLocks.DomainLock> lock = domainLocks.tryLockDomain(new EdgeDomain(domain));
+            // We don't have a lock, so we can't run this task
+            // we return to avoid blocking the pool for too long
+            if (lock.isEmpty()) {
+                if (retryQueue.remainingCapacity() > 0) {
+                    // Sleep a moment to avoid busy looping via the retry queue
+                    // in the case when few tasks remain and almost all are ineligible for
+                    // immediate restart
+                    Thread.sleep(5);
                }

-                DomainLinks domainLinks = anchorTagsSource.getAnchorTags(domain);
+                retryQueue.put(this);
+                return;
+            }
+            DomainLocks.DomainLock domainLock = lock.get();

-                int size;
-                try (var lock = domainLocks.lockDomain(new EdgeDomain(domain))) {
-                    size = retriever.crawlDomain(domainLinks, reference);
+            try (domainLock) {
+                Thread.currentThread().setName("crawling:" + domain);
+
+                Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
+                Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
+                Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
+
+                // Move the WARC file to a temp file if it exists, so we can resume the crawl using the old data
+                // while writing to the same file name as before
+                if (Files.exists(newWarcFile)) {
+                    Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING);
+                }
+                else {
+                    Files.deleteIfExists(tempFile);
                }

-                // Delete the reference crawl data if it's not the same as the new one
-                // (mostly a case when migrating from legacy->warc)
-                reference.delete();
+                try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
+                     var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
+                     CrawlDataReference reference = getReference())
+                {
+                    // Resume the crawl if it was aborted
+                    if (Files.exists(tempFile)) {
+                        retriever.syncAbortedRun(tempFile);
+                        Files.delete(tempFile);
+                    }

-                // Convert the WARC file to Parquet
-                SlopCrawlDataRecord
-                        .convertWarc(domain, userAgent, newWarcFile, slopFile);
+                    DomainLinks domainLinks = anchorTagsSource.getAnchorTags(domain);

-                // Optionally archive the WARC file if full retention is enabled,
-                // otherwise delete it:
-                warcArchiver.consumeWarc(newWarcFile, domain);
+                    int size = retriever.crawlDomain(domainLinks, reference);

-                // Mark the domain as finished in the work log
-                workLog.setJobToFinished(domain, slopFile.toString(), size);
+                    // Delete the reference crawl data if it's not the same as the new one
+                    // (mostly a case when migrating from legacy->warc)
+                    reference.delete();

-                // Update the progress bar
-                heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks);
+                    // Convert the WARC file to Slop
+                    SlopCrawlDataRecord
+                            .convertWarc(domain, userAgent, newWarcFile, slopFile);

-                logger.info("Fetched {}", domain);
-            } catch (Exception e) {
-                logger.error("Error fetching domain " + domain, e);
-            }
-            finally {
-                // We don't need to double-count these; it's also kept in the workLog
-                pendingCrawlTasks.remove(domain);
-                Thread.currentThread().setName("[idle]");
+                    // Optionally archive the WARC file if full retention is enabled,
+                    // otherwise delete it:
+                    warcArchiver.consumeWarc(newWarcFile, domain);

-                Files.deleteIfExists(newWarcFile);
-                Files.deleteIfExists(tempFile);
+                    // Mark the domain as finished in the work log
+                    workLog.setJobToFinished(domain, slopFile.toString(), size);
+
+                    // Update the progress bar
+                    heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks);
+
+                    logger.info("Fetched {}", domain);
+                } catch (Exception e) {
+                    logger.error("Error fetching domain " + domain, e);
+                }
+                finally {
+                    // We don't need to double-count these; it's also kept in the workLog
+                    pendingCrawlTasks.remove(domain);
+                    Thread.currentThread().setName("[idle]");
+
+                    Files.deleteIfExists(newWarcFile);
+                    Files.deleteIfExists(tempFile);
+                }
            }
        }

--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/ContentTags.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/ContentTags.java
@@ -1,6 +1,6 @@
 package nu.marginalia.crawl.fetcher;

-import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
+import org.apache.hc.client5.http.classic.methods.HttpGet;

 /** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */
 public record ContentTags(String etag, String lastMod) {
@@ -17,14 +17,14 @@ public record ContentTags(String etag, String lastMod) {
    }

    /** Paints the tags onto the request builder. */
-    public void paint(ClassicRequestBuilder getBuilder) {
+    public void paint(HttpGet request) {

        if (etag != null) {
-            getBuilder.addHeader("If-None-Match", etag);
+            request.addHeader("If-None-Match", etag);
        }

        if (lastMod != null) {
-            getBuilder.addHeader("If-Modified-Since", lastMod);
+            request.addHeader("If-Modified-Since", lastMod);
        }
    }
 }
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/Cookies.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/Cookies.java
@@ -1,34 +0,0 @@
-package nu.marginalia.crawl.fetcher;
-
-import java.io.IOException;
-import java.net.CookieHandler;
-import java.net.URI;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.ConcurrentHashMap;
-
-public class Cookies extends CookieHandler {
-    final ThreadLocal<ConcurrentHashMap<String, List<String>>> cookieJar = ThreadLocal.withInitial(ConcurrentHashMap::new);
-
-    public void clear() {
-        cookieJar.get().clear();
-    }
-
-    public boolean hasCookies() {
-        return !cookieJar.get().isEmpty();
-    }
-
-    public List<String> getCookies() {
-        return cookieJar.get().values().stream().flatMap(List::stream).toList();
-    }
-
-    @Override
-    public Map<String, List<String>> get(URI uri, Map<String, List<String>> requestHeaders) throws IOException {
-        return cookieJar.get();
-    }
-
-    @Override
-    public void put(URI uri, Map<String, List<String>> responseHeaders) throws IOException {
-        cookieJar.get().putAll(responseHeaders);
-    }
-}
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/DomainCookies.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/DomainCookies.java
@@ -0,0 +1,56 @@
+package nu.marginalia.crawl.fetcher;
+
+import org.apache.hc.client5.http.classic.methods.HttpUriRequestBase;
+import org.apache.hc.core5.http.ClassicHttpRequest;
+import org.apache.hc.core5.http.HttpResponse;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.StringJoiner;
+
+public class DomainCookies {
+    private final Map<String, String> cookies = new HashMap<>();
+
+    public boolean hasCookies() {
+        return !cookies.isEmpty();
+    }
+
+    public void updateCookieStore(HttpResponse response) {
+        for (var header : response.getHeaders()) {
+            if (header.getName().equalsIgnoreCase("Set-Cookie")) {
+                parseCookieHeader(header.getValue());
+            }
+        }
+    }
+
+    private void parseCookieHeader(String value) {
+        // Parse the Set-Cookie header value and extract the cookies
+
+        String[] parts = value.split(";");
+        String cookie = parts[0].trim();
+
+        if (cookie.contains("=")) {
+            String[] cookieParts = cookie.split("=");
+            String name = cookieParts[0].trim();
+            String val = cookieParts[1].trim();
+            cookies.put(name, val);
+        }
+    }
+
+    public void paintRequest(HttpUriRequestBase request) {
+        request.addHeader("Cookie", createCookieHeader());
+    }
+
+    public void paintRequest(ClassicHttpRequest request) {
+        request.addHeader("Cookie", createCookieHeader());
+    }
+
+    private String createCookieHeader() {
+        StringJoiner sj = new StringJoiner("; ");
+        for (var cookie : cookies.entrySet()) {
+            sj.add(cookie.getKey() + "=" + cookie.getValue());
+        }
+        return sj.toString();
+    }
+
+}
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcher.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcher.java
@@ -23,6 +23,7 @@ public interface HttpFetcher extends AutoCloseable {

    HttpFetchResult fetchContent(EdgeUrl url,
                                 WarcRecorder recorder,
+                                 DomainCookies cookies,
                                 CrawlDelayTimer timer,
                                 ContentTags tags,
                                 ProbeType probeType);
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java
@@ -17,6 +17,7 @@ import nu.marginalia.model.crawldata.CrawlerDomainStatus;
 import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
 import org.apache.hc.client5.http.HttpRequestRetryStrategy;
 import org.apache.hc.client5.http.classic.HttpClient;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.config.ConnectionConfig;
 import org.apache.hc.client5.http.config.RequestConfig;
 import org.apache.hc.client5.http.cookie.BasicCookieStore;
@@ -34,6 +35,7 @@ import org.apache.hc.core5.http.io.entity.EntityUtils;
 import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
 import org.apache.hc.core5.http.message.MessageSupport;
 import org.apache.hc.core5.http.protocol.HttpContext;
+import org.apache.hc.core5.pool.PoolStats;
 import org.apache.hc.core5.util.TimeValue;
 import org.apache.hc.core5.util.Timeout;
 import org.jsoup.Jsoup;
@@ -45,11 +47,13 @@ import org.slf4j.Marker;
 import org.slf4j.MarkerFactory;

 import javax.net.ssl.SSLContext;
+import javax.net.ssl.SSLException;
 import java.io.IOException;
 import java.net.SocketTimeoutException;
 import java.net.URISyntaxException;
 import java.security.NoSuchAlgorithmException;
 import java.time.Duration;
+import java.time.Instant;
 import java.util.*;
 import java.util.concurrent.Semaphore;
 import java.util.concurrent.TimeUnit;
@@ -76,14 +80,20 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
    }

    private final CloseableHttpClient client;
+    private PoolingHttpClientConnectionManager connectionManager;
+
+    public PoolStats getPoolStats() {
+        return connectionManager.getTotalStats();
+    }

    private CloseableHttpClient createClient() throws NoSuchAlgorithmException {
        final ConnectionConfig connectionConfig = ConnectionConfig.custom()
                .setSocketTimeout(10, TimeUnit.SECONDS)
                .setConnectTimeout(30, TimeUnit.SECONDS)
+                .setValidateAfterInactivity(TimeValue.ofSeconds(5))
                .build();

-        final PoolingHttpClientConnectionManager connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
+        connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
                .setMaxConnPerRoute(2)
                .setMaxConnTotal(5000)
                .setDefaultConnectionConfig(connectionConfig)
@@ -91,15 +101,27 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
                .build();

        connectionManager.setDefaultSocketConfig(SocketConfig.custom()
-                .setSoLinger(TimeValue.ofSeconds(15))
+                .setSoLinger(TimeValue.ofSeconds(-1))
                .setSoTimeout(Timeout.ofSeconds(10))
                .build()
        );

+        Thread.ofPlatform().daemon(true).start(() -> {
+            try {
+                for (;;) {
+                    TimeUnit.SECONDS.sleep(15);
+                    logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
+                }
+            }
+            catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+            }
+        });
+
        final RequestConfig defaultRequestConfig = RequestConfig.custom()
                .setCookieSpec(StandardCookieSpec.RELAXED)
                .setResponseTimeout(10, TimeUnit.SECONDS)
-                .setConnectionRequestTimeout(8, TimeUnit.SECONDS)
+                .setConnectionRequestTimeout(5, TimeUnit.MINUTES)
                .build();

        return HttpClients.custom()
@@ -287,6 +309,7 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
     * recorded in the WARC file on failure.
     */
    public ContentTypeProbeResult probeContentType(EdgeUrl url,
+                                                   DomainCookies cookies,
                                                   CrawlDelayTimer timer,
                                                   ContentTags tags) {
        if (!tags.isEmpty() || !contentTypeLogic.isUrlLikeBinary(url)) {
@@ -299,9 +322,11 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
                    .addHeader("Accept-Encoding", "gzip")
                    .build();

-            var result = SendLock.wrapSend(client, head, (rsp) -> {
-                EntityUtils.consume(rsp.getEntity());
+            cookies.paintRequest(head);

+            return SendLock.wrapSend(client, head, (rsp) -> {
+                cookies.updateCookieStore(rsp);
+                EntityUtils.consume(rsp.getEntity());
                int statusCode = rsp.getCode();

                // Handle redirects
@@ -339,8 +364,6 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
                    return new ContentTypeProbeResult.BadContentType(contentType, statusCode);
                }
            });
-
-            return result;
        }
        catch (SocketTimeoutException ex) {

@@ -362,6 +385,7 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
    @Override
    public HttpFetchResult fetchContent(EdgeUrl url,
                                           WarcRecorder warcRecorder,
+                                           DomainCookies cookies,
                                           CrawlDelayTimer timer,
                                           ContentTags contentTags,
                                           ProbeType probeType)
@@ -369,26 +393,32 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
        try {
            if (probeType == HttpFetcher.ProbeType.FULL) {
                try {
-                    var probeResult = probeContentType(url, timer, contentTags);
-                    logger.info(crawlerAuditMarker, "Probe result {} for {}", probeResult.getClass().getSimpleName(), url);
+                    var probeResult = probeContentType(url, cookies, timer, contentTags);
+
                    switch (probeResult) {
                        case HttpFetcher.ContentTypeProbeResult.NoOp():
                            break; //
                        case HttpFetcher.ContentTypeProbeResult.Ok(EdgeUrl resolvedUrl):
+                            logger.info(crawlerAuditMarker, "Probe result OK for {}", url);
                            url = resolvedUrl; // If we were redirected while probing, use the final URL for fetching
                            break;
                        case ContentTypeProbeResult.BadContentType badContentType:
                            warcRecorder.flagAsFailedContentTypeProbe(url, badContentType.contentType(), badContentType.statusCode());
+                            logger.info(crawlerAuditMarker, "Probe result Bad ContenType ({}) for {}", badContentType.contentType(), url);
                            return new HttpFetchResult.ResultNone();
                        case ContentTypeProbeResult.BadContentType.Timeout(Exception ex):
+                            logger.info(crawlerAuditMarker, "Probe result Timeout for {}", url);
                            warcRecorder.flagAsTimeout(url);
                            return new HttpFetchResult.ResultException(ex);
                        case ContentTypeProbeResult.Exception(Exception ex):
+                            logger.info(crawlerAuditMarker, "Probe result Exception({}) for {}", ex.getClass().getSimpleName(), url);
                            warcRecorder.flagAsError(url, ex);
                            return new HttpFetchResult.ResultException(ex);
                        case ContentTypeProbeResult.HttpError httpError:
+                            logger.info(crawlerAuditMarker, "Probe result HTTP Error ({}) for {}", httpError.statusCode(), url);
                            return new HttpFetchResult.ResultException(new HttpException("HTTP status code " + httpError.statusCode() + ": " + httpError.message()));
                        case ContentTypeProbeResult.Redirect redirect:
+                            logger.info(crawlerAuditMarker, "Probe result redirect for {} -> {}", url, redirect.location());
                            return new HttpFetchResult.ResultRedirect(redirect.location());
                    }
                } catch (Exception ex) {
@@ -398,36 +428,41 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {

            }

-            ClassicRequestBuilder getBuilder = ClassicRequestBuilder.get(url.asURI())
-                    .addHeader("User-Agent", userAgentString)
-                    .addHeader("Accept-Encoding", "gzip")
-                    .addHeader("Accept-Language", "en,*;q=0.5")
-                    .addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8");
+            HttpGet request = new HttpGet(url.asURI());
+            request.addHeader("User-Agent", userAgentString);
+            request.addHeader("Accept-Encoding", "gzip");
+            request.addHeader("Accept-Language", "en,*;q=0.5");
+            request.addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8");

-            contentTags.paint(getBuilder);
+            contentTags.paint(request);

            try (var sl = new SendLock()) {
-                HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build());
+                Instant start = Instant.now();
+                HttpFetchResult result = warcRecorder.fetch(client, cookies, request);
+
+                Duration fetchDuration = Duration.between(start, Instant.now());

                if (result instanceof HttpFetchResult.ResultOk ok) {
                    if (ok.statusCode() == 304) {
-                        return new HttpFetchResult.Result304Raw();
+                        result = new HttpFetchResult.Result304Raw();
                    }
                }

                switch (result) {
-                    case HttpFetchResult.ResultOk ok -> logger.info(crawlerAuditMarker, "Fetch result OK {} for {}", ok.statusCode(), url);
+                    case HttpFetchResult.ResultOk ok -> logger.info(crawlerAuditMarker, "Fetch result OK {} for {} ({} ms)", ok.statusCode(), url, fetchDuration.toMillis());
                    case HttpFetchResult.ResultRedirect redirect -> logger.info(crawlerAuditMarker, "Fetch result redirect: {}  for {}", redirect.url(), url);
-                    case HttpFetchResult.ResultNone none -> logger.info(crawlerAuditMarker, "Fetch result none  for {}", url);
-                    case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception: {}  for {}", ex.getClass().getSimpleName(), url);
+                    case HttpFetchResult.ResultNone none -> logger.info(crawlerAuditMarker, "Fetch result none for {}", url);
+                    case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception for {}", url, ex.ex());
                    case HttpFetchResult.Result304Raw raw -> logger.info(crawlerAuditMarker, "Fetch result: 304 Raw for {}", url);
                    case HttpFetchResult.Result304ReplacedWithReference ref -> logger.info(crawlerAuditMarker, "Fetch result: 304 With reference for {}", url);
                }
+
                return result;
            }
        }
        catch (Exception ex) {
-            ex.printStackTrace();
+            logger.error(crawlerAuditMarker, "Fetch result exception for {}", url, ex);
+
            return new HttpFetchResult.ResultException(ex);
        }

@@ -494,56 +529,61 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
    }


-    private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
-        ClassicHttpRequest getRequest = ClassicRequestBuilder.get(sitemapUrl.asURI())
-                .addHeader("User-Agent", userAgentString)
-                .addHeader("Accept-Encoding", "gzip")
-                .addHeader("Accept", "text/*, */*;q=0.9")
-                .addHeader("User-Agent", userAgentString)
-                .build();
+    private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException {
+        HttpGet getRequest = new HttpGet(sitemapUrl.asURI());
+
+        getRequest.addHeader("User-Agent", userAgentString);
+        getRequest.addHeader("Accept-Encoding", "gzip");
+        getRequest.addHeader("Accept", "text/*, */*;q=0.9");
+        getRequest.addHeader("User-Agent", userAgentString);

        try (var sl = new SendLock()) {
            return client.execute(getRequest, response -> {
-                if (response.getCode() != 200) {
-                    return new SitemapResult.SitemapError();
+                try {
+                    if (response.getCode() != 200) {
+                        return new SitemapResult.SitemapError();
+                    }
+
+                    Document parsedSitemap = Jsoup.parse(
+                            EntityUtils.toString(response.getEntity()),
+                            sitemapUrl.toString(),
+                            Parser.xmlParser()
+                    );
+
+                    if (parsedSitemap.childrenSize() == 0) {
+                        return new SitemapResult.SitemapError();
+                    }
+
+                    String rootTagName = parsedSitemap.child(0).tagName();
+
+                    return switch (rootTagName.toLowerCase()) {
+                        case "sitemapindex" -> {
+                            List<String> references = new ArrayList<>();
+                            for (var locTag : parsedSitemap.getElementsByTag("loc")) {
+                                references.add(locTag.text().trim());
+                            }
+                            yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
+                        }
+                        case "urlset" -> {
+                            List<String> urls = new ArrayList<>();
+                            for (var locTag : parsedSitemap.select("url > loc")) {
+                                urls.add(locTag.text().trim());
+                            }
+                            yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
+                        }
+                        case "rss", "atom" -> {
+                            List<String> urls = new ArrayList<>();
+                            for (var locTag : parsedSitemap.select("link, url")) {
+                                urls.add(locTag.text().trim());
+                            }
+                            yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
+                        }
+                        default -> new SitemapResult.SitemapError();
+                    };
                }
-
-                Document parsedSitemap = Jsoup.parse(
-                        EntityUtils.toString(response.getEntity()),
-                        sitemapUrl.toString(),
-                        Parser.xmlParser()
-                );
-
-                if (parsedSitemap.childrenSize() == 0) {
-                    return new SitemapResult.SitemapError();
+                finally {
+                    EntityUtils.consume(response.getEntity());
                }
-
-                String rootTagName = parsedSitemap.child(0).tagName();
-
-                return switch (rootTagName.toLowerCase()) {
-                    case "sitemapindex" -> {
-                        List<String> references = new ArrayList<>();
-                        for (var locTag : parsedSitemap.getElementsByTag("loc")) {
-                            references.add(locTag.text().trim());
-                        }
-                        yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
-                    }
-                    case "urlset" -> {
-                        List<String> urls = new ArrayList<>();
-                        for (var locTag : parsedSitemap.select("url > loc")) {
-                            urls.add(locTag.text().trim());
-                        }
-                        yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
-                    }
-                    case "rss", "atom" -> {
-                        List<String> urls = new ArrayList<>();
-                        for (var locTag : parsedSitemap.select("link, url")) {
-                            urls.add(locTag.text().trim());
-                        }
-                        yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
-                    }
-                    default -> new SitemapResult.SitemapError();
-                };
            });
        }
        catch (Exception ex) {
@@ -574,13 +614,12 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
    private Optional<SimpleRobotRules> fetchAndParseRobotsTxt(EdgeUrl url, WarcRecorder recorder) {
        try (var sl = new SendLock()) {

-            ClassicHttpRequest request = ClassicRequestBuilder.get(url.asURI())
-                    .addHeader("User-Agent", userAgentString)
-                    .addHeader("Accept-Encoding", "gzip")
-                    .addHeader("Accept", "text/*, */*;q=0.9")
-                    .build();
+            HttpGet request = new HttpGet(url.asURI());
+            request.addHeader("User-Agent", userAgentString);
+            request.addHeader("Accept-Encoding", "gzip");
+            request.addHeader("Accept", "text/*, */*;q=0.9");

-            HttpFetchResult result = recorder.fetch(client, request);
+            HttpFetchResult result = recorder.fetch(client, new DomainCookies(), request);

            return DocumentBodyExtractor.asBytes(result).mapOpt((contentType, body) ->
                robotsParser.parseContent(url.toString(),
@@ -596,18 +635,21 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {

    @Override
    public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
-        if (exception instanceof SocketTimeoutException ex) {
+        if (exception instanceof SocketTimeoutException) { // Timeouts are not recoverable
+            return false;
+        }
+        if (exception instanceof SSLException) { // SSL exceptions are unlikely to be recoverable
            return false;
        }

-        return executionCount < 3;
+        return executionCount <= 3;
    }

    @Override
    public boolean retryRequest(HttpResponse response, int executionCount, HttpContext context) {
        return switch (response.getCode()) {
-            case 500, 503 -> executionCount < 2;
-            case 429 -> executionCount < 3;
+            case 500, 503 -> executionCount <= 2;
+            case 429 -> executionCount <= 3;
            default -> false;
        };
    }
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcInputBuffer.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcInputBuffer.java
@@ -2,6 +2,7 @@ package nu.marginalia.crawl.fetcher.warc;

 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.BOMInputStream;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.core5.http.ClassicHttpResponse;
 import org.apache.hc.core5.http.Header;
 import org.netpreserve.jwarc.WarcTruncationReason;
@@ -43,7 +44,9 @@ public abstract class WarcInputBuffer implements AutoCloseable {
     *  and suppressed from the headers.
     *  If an error occurs, a buffer will be created with no content and an error status.
     */
-    static WarcInputBuffer forResponse(ClassicHttpResponse response, Duration timeLimit) throws IOException {
+    static WarcInputBuffer forResponse(ClassicHttpResponse response,
+                                       HttpGet request,
+                                       Duration timeLimit) throws IOException {
        if (response == null)
            return new ErrorBuffer();

@@ -54,16 +57,29 @@ public abstract class WarcInputBuffer implements AutoCloseable {
            return new ErrorBuffer();
        }

-        InputStream is = entity.getContent();
-        long length = entity.getContentLength();
+        InputStream is = null;
+        try {
+            is = entity.getContent();
+            long length = entity.getContentLength();

-        try (response) {
            if (length > 0 && length < 8192) {
                // If the content is small and not compressed, we can just read it into memory
-                return new MemoryBuffer(response.getHeaders(), timeLimit, is, (int) length);
+                return new MemoryBuffer(response.getHeaders(), request, timeLimit, is, (int) length);
            } else {
                // Otherwise, we unpack it into a file and read it from there
-                return new FileBuffer(response.getHeaders(), timeLimit, is);
+                return new FileBuffer(response.getHeaders(), request, timeLimit, is);
+            }
+        }
+        finally {
+            try {
+                is.skip(Long.MAX_VALUE);
+            }
+            catch (IOException e) {
+                // Ignore the exception
+            }
+            finally {
+                // Close the input stream
+                IOUtils.closeQuietly(is);
            }
        }

@@ -71,7 +87,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
    }

    /** Copy an input stream to an output stream, with a maximum size and time limit */
-    protected void copy(InputStream is, OutputStream os, Duration timeLimit) {
+    protected void copy(InputStream is, HttpGet request, OutputStream os, Duration timeLimit) {
        Instant start = Instant.now();
        Instant timeout = start.plus(timeLimit);
        long size = 0;
@@ -86,6 +102,11 @@ public abstract class WarcInputBuffer implements AutoCloseable {
                Duration remaining = Duration.between(Instant.now(), timeout);
                if (remaining.isNegative()) {
                    truncationReason = WarcTruncationReason.TIME;
+                    // Abort the request if the time limit is exceeded
+                    // so we don't keep the connection open forever or are forced to consume
+                    // the stream to the end
+
+                    request.abort();
                    break;
                }

@@ -104,6 +125,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
                }
                else if (truncationReason != WarcTruncationReason.LENGTH) {
                    truncationReason = WarcTruncationReason.LENGTH;
+                    break;
                }

            } catch (IOException e) {
@@ -111,13 +133,6 @@ public abstract class WarcInputBuffer implements AutoCloseable {
            }
        }

-        // Try to close the connection as long as we haven't timed out.
-        // As per Apache HttpClient's semantics, this will reset the connection
-        // and close the stream if we have timed out.
-
-        if (truncationReason != WarcTruncationReason.TIME) {
-            IOUtils.closeQuietly(is);
-        }
    }

    /** Takes a Content-Range header and checks if it is complete.
@@ -218,7 +233,7 @@ class ErrorBuffer extends WarcInputBuffer {
 /** Buffer for when we have the response in memory */
 class MemoryBuffer extends WarcInputBuffer {
    byte[] data;
-    public MemoryBuffer(Header[] headers, Duration timeLimit, InputStream responseStream, int size) {
+    public MemoryBuffer(Header[] headers, HttpGet request, Duration timeLimit, InputStream responseStream, int size) {
        super(suppressContentEncoding(headers));

        if (!isRangeComplete(headers)) {
@@ -229,7 +244,7 @@ class MemoryBuffer extends WarcInputBuffer {

        var outputStream = new ByteArrayOutputStream(size);

-        copy(responseStream, outputStream, timeLimit);
+        copy(responseStream, request, outputStream, timeLimit);

        data = outputStream.toByteArray();
    }
@@ -253,7 +268,7 @@ class MemoryBuffer extends WarcInputBuffer {
 class FileBuffer extends WarcInputBuffer {
    private final Path tempFile;

-    public FileBuffer(Header[] headers, Duration timeLimit, InputStream responseStream) throws IOException {
+    public FileBuffer(Header[] headers, HttpGet request, Duration timeLimit, InputStream responseStream) throws IOException {
        super(suppressContentEncoding(headers));

        if (!isRangeComplete(headers)) {
@@ -265,7 +280,7 @@ class FileBuffer extends WarcInputBuffer {
        this.tempFile = Files.createTempFile("rsp", ".html");

        try (var out = Files.newOutputStream(tempFile)) {
-            copy(responseStream, out, timeLimit);
+            copy(responseStream, request, out, timeLimit);
        }
        catch (Exception ex) {
            truncationReason = WarcTruncationReason.UNSPECIFIED;
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java
@@ -1,6 +1,7 @@
 package nu.marginalia.crawl.fetcher.warc;

 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.link_parser.LinkParser;
@@ -8,9 +9,7 @@ import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;
 import org.apache.hc.client5.http.classic.HttpClient;
-import org.apache.hc.client5.http.cookie.BasicCookieStore;
-import org.apache.hc.client5.http.cookie.CookieStore;
-import org.apache.hc.core5.http.ClassicHttpRequest;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.core5.http.NameValuePair;
 import org.jetbrains.annotations.Nullable;
 import org.netpreserve.jwarc.*;
@@ -42,7 +41,7 @@ public class WarcRecorder implements AutoCloseable {
    static final int MAX_TIME = 30_000;

    /** Maximum (decompressed) size we'll save */
-    static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 10 * 1024 * 1024);
+    static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 32 * 1024 * 1024);

    private final WarcWriter writer;
    private final Path warcFile;
@@ -53,23 +52,15 @@ public class WarcRecorder implements AutoCloseable {
    // Affix a version string in case we need to change the format in the future
    // in some way
    private final String warcRecorderVersion = "1.0";
-    private final CookieStore cookies;
    private final LinkParser linkParser = new LinkParser();
    /**
     * Create a new WarcRecorder that will write to the given file
     *
     * @param warcFile The file to write to
     */
-    public WarcRecorder(Path warcFile, HttpFetcherImpl fetcher) throws IOException {
+    public WarcRecorder(Path warcFile) throws IOException {
        this.warcFile = warcFile;
        this.writer = new WarcWriter(warcFile);
-        this.cookies = fetcher.getCookies();
-    }
-
-    public WarcRecorder(Path warcFile, CookieStore cookies) throws IOException {
-        this.warcFile = warcFile;
-        this.writer = new WarcWriter(warcFile);
-        this.cookies = cookies;
    }

    /**
@@ -79,24 +70,21 @@ public class WarcRecorder implements AutoCloseable {
    public WarcRecorder() throws IOException {
        this.warcFile = Files.createTempFile("warc", ".warc.gz");
        this.writer = new WarcWriter(this.warcFile);
-        this.cookies = new BasicCookieStore();

        temporaryFile = true;
    }

-    private boolean hasCookies() {
-        return !cookies.getCookies().isEmpty();
-    }
-
    public HttpFetchResult fetch(HttpClient client,
-                                 ClassicHttpRequest request)
+                                 DomainCookies cookies,
+                                 HttpGet request)
            throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
    {
-        return fetch(client, request, Duration.ofMillis(MAX_TIME));
+        return fetch(client, cookies, request, Duration.ofMillis(MAX_TIME));
    }

    public HttpFetchResult fetch(HttpClient client,
-                                 ClassicHttpRequest request,
+                                 DomainCookies cookies,
+                                 HttpGet request,
                                 Duration timeout)
            throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
    {
@@ -113,13 +101,15 @@ public class WarcRecorder implements AutoCloseable {
        // Inject a range header to attempt to limit the size of the response
        // to the maximum size we want to store, if the server supports it.
        request.addHeader("Range", "bytes=0-"+MAX_SIZE);
-
+        cookies.paintRequest(request);
        try {
-            return client.execute(request, response -> {
+            return client.execute(request,response -> {

-                try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, timeout);
+                try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, request, timeout);
                     InputStream inputStream = inputBuffer.read()) {

+                    cookies.updateCookieStore(response);
+
                    // Build and write the request

                    WarcDigestBuilder requestDigestBuilder = new WarcDigestBuilder();
@@ -143,8 +133,9 @@ public class WarcRecorder implements AutoCloseable {
                    warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
                    writer.write(warcRequest);

-                    if (hasCookies()) {
-                        extraHeaders.put("X-Has-Cookies", List.of("1"));
+
+                    if (cookies.hasCookies()) {
+                        response.addHeader("X-Has-Cookies", 1);
                    }

                    byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
@@ -259,7 +250,7 @@ public class WarcRecorder implements AutoCloseable {
        writer.write(item);
    }

-    private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags contentTags) {
+    private void saveOldResponse(EdgeUrl url, DomainCookies domainCookies, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags contentTags) {
        try {
            WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
            WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
@@ -320,7 +311,7 @@ public class WarcRecorder implements AutoCloseable {
                    .date(Instant.now())
                    .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());

-            if (hasCookies()) {
+            if (domainCookies.hasCookies() || (headers != null && headers.contains("Set-Cookie:"))) {
                builder.addHeader("X-Has-Cookies", "1");
            }

@@ -340,8 +331,8 @@ public class WarcRecorder implements AutoCloseable {
     * an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified.  In this
     * scenario we want to record the data as it was in the previous crawl, but not re-fetch it.
     */
-    public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags ctags) {
-        saveOldResponse(url, contentType, statusCode, documentBody, headers, ctags);
+    public void writeReferenceCopy(EdgeUrl url, DomainCookies cookies, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags ctags) {
+        saveOldResponse(url, cookies, contentType, statusCode, documentBody, headers, ctags);
    }

    public void writeWarcinfoHeader(String ip, EdgeDomain domain, HttpFetcherImpl.DomainProbeResult result) throws IOException {
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/logic/DomainLocks.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/logic/DomainLocks.java
@@ -3,6 +3,7 @@ package nu.marginalia.crawl.logic;
 import nu.marginalia.model.EdgeDomain;

 import java.util.Map;
+import java.util.Optional;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.Semaphore;

@@ -19,8 +20,22 @@ public class DomainLocks {
     * and may be held by another thread.  The caller is responsible for locking and  releasing the lock.
     */
    public DomainLock lockDomain(EdgeDomain domain) throws InterruptedException {
-        return new DomainLock(domain.toString(),
-                locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits));
+        var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
+
+        sem.acquire();
+
+        return new DomainLock(sem);
+    }
+
+    public Optional<DomainLock> tryLockDomain(EdgeDomain domain) {
+        var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
+        if (sem.tryAcquire(1)) {
+            return Optional.of(new DomainLock(sem));
+        }
+        else {
+            // We don't have a lock, so we return an empty optional
+            return Optional.empty();
+        }
    }

    private Semaphore defaultPermits(String topDomain) {
@@ -28,23 +43,27 @@ public class DomainLocks {
            return new Semaphore(16);
        if (topDomain.equals("blogspot.com"))
            return new Semaphore(8);
-
+        if (topDomain.equals("tumblr.com"))
+            return new Semaphore(8);
        if (topDomain.equals("neocities.org"))
-            return new Semaphore(4);
+            return new Semaphore(8);
        if (topDomain.equals("github.io"))
-            return new Semaphore(4);
+            return new Semaphore(8);

+        // Substack really dislikes broad-scale crawlers, so we need to be careful
+        // to not get blocked.
        if (topDomain.equals("substack.com")) {
            return new Semaphore(1);
        }
-        if (topDomain.endsWith(".edu")) {
-            return new Semaphore(1);
-        }

        return new Semaphore(2);
    }

-    public boolean canLock(EdgeDomain domain) {
+    /** Returns true if the domain is lockable, i.e. if it is not already locked by another thread.
+     * (this is just a hint, and does not guarantee that the domain is actually lockable any time
+     * after this method returns true)
+     */
+    public boolean isLockableHint(EdgeDomain domain) {
        Semaphore sem = locks.get(domain.topDomain.toLowerCase());
        if (null == sem)
            return true;
@@ -53,22 +72,16 @@ public class DomainLocks {
    }

    public static class DomainLock implements AutoCloseable {
-        private final String domainName;
        private final Semaphore semaphore;

-        DomainLock(String domainName, Semaphore semaphore) throws InterruptedException {
-            this.domainName = domainName;
+        DomainLock(Semaphore semaphore) {
            this.semaphore = semaphore;
-
-            Thread.currentThread().setName("crawling:" + domainName + " [await domain lock]");
-            semaphore.acquire();
-            Thread.currentThread().setName("crawling:" + domainName);
        }

        @Override
        public void close() throws Exception {
            semaphore.release();
-            Thread.currentThread().setName("crawling:" + domainName + " [wrapping up]");
+            Thread.currentThread().setName("[idle]");
        }
    }
 }
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
@@ -6,6 +6,7 @@ import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.crawl.DomainStateDb;
 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.logic.LinkFilterSelector;
@@ -51,9 +52,10 @@ public class CrawlerRetreiver implements AutoCloseable {
    private final DomainStateDb domainStateDb;
    private final WarcRecorder warcRecorder;
    private final CrawlerRevisitor crawlerRevisitor;
+    private final DomainCookies cookies = new DomainCookies();

    private static final CrawlerConnectionThrottle connectionThrottle = new CrawlerConnectionThrottle(
-            Duration.ofSeconds(1) // pace the connections to avoid network congestion by waiting 1 second between establishing them
+            Duration.ofSeconds(1) // pace the connections to avoid network congestion at startup
    );

    int errorCount = 0;
@@ -124,7 +126,7 @@ public class CrawlerRetreiver implements AutoCloseable {
                    }

                    Instant recrawlStart = Instant.now();
-                    CrawlerRevisitor.RecrawlMetadata recrawlMetadata = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
+                    CrawlerRevisitor.RecrawlMetadata recrawlMetadata = crawlerRevisitor.recrawl(oldCrawlData, cookies, robotsRules, delayTimer);
                    Duration recrawlTime = Duration.between(recrawlStart, Instant.now());

                    // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
@@ -274,7 +276,7 @@ public class CrawlerRetreiver implements AutoCloseable {
        try {
            var url = rootUrl.withPathAndParam("/", null);

-            HttpFetchResult result = fetcher.fetchContent(url, warcRecorder, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+            HttpFetchResult result = fetcher.fetchContent(url, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
            timer.waitFetchDelay(0);

            if (result instanceof HttpFetchResult.ResultRedirect(EdgeUrl location)) {
@@ -337,7 +339,7 @@ public class CrawlerRetreiver implements AutoCloseable {

            // Grab the favicon if it exists

-            if (fetcher.fetchContent(faviconUrl, warcRecorder, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED) instanceof HttpFetchResult.ResultOk iconResult) {
+            if (fetcher.fetchContent(faviconUrl, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED) instanceof HttpFetchResult.ResultOk iconResult) {
                String contentType = iconResult.header("Content-Type");
                byte[] iconData = iconResult.getBodyBytes();

@@ -407,7 +409,7 @@ public class CrawlerRetreiver implements AutoCloseable {
        if (parsedOpt.isEmpty())
            return false;

-        HttpFetchResult result = fetcher.fetchContent(parsedOpt.get(), warcRecorder, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+        HttpFetchResult result = fetcher.fetchContent(parsedOpt.get(), warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
        timer.waitFetchDelay(0);

        if (!(result instanceof HttpFetchResult.ResultOk ok)) {
@@ -435,7 +437,7 @@ public class CrawlerRetreiver implements AutoCloseable {
    {
        var contentTags = reference.getContentTags();

-        HttpFetchResult fetchedDoc = fetcher.fetchContent(top, warcRecorder, timer, contentTags, HttpFetcher.ProbeType.FULL);
+        HttpFetchResult fetchedDoc = fetcher.fetchContent(top, warcRecorder, cookies, timer, contentTags, HttpFetcher.ProbeType.FULL);
        timer.waitFetchDelay();

        if (Thread.interrupted()) {
@@ -461,7 +463,7 @@ public class CrawlerRetreiver implements AutoCloseable {
                {
                    var doc = reference.doc();

-                    warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBodyBytes, doc.headers, contentTags);
+                    warcRecorder.writeReferenceCopy(top, cookies, doc.contentType, doc.httpStatus, doc.documentBodyBytes, doc.headers, contentTags);

                    fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
                            new ContentType(doc.contentType, "UTF-8"),
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
@@ -2,6 +2,7 @@ package nu.marginalia.crawl.retreival.revisit;

 import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.CrawlDataReference;
 import nu.marginalia.crawl.retreival.CrawlDelayTimer;
@@ -37,6 +38,7 @@ public class CrawlerRevisitor {

    /** Performs a re-crawl of old documents, comparing etags and last-modified */
    public RecrawlMetadata recrawl(CrawlDataReference oldCrawlData,
+                       DomainCookies cookies,
                       SimpleRobotRules robotsRules,
                       CrawlDelayTimer delayTimer)
    throws InterruptedException {
@@ -132,6 +134,7 @@ public class CrawlerRevisitor {
                }
                // Add a WARC record so we don't repeat this
                warcRecorder.writeReferenceCopy(url,
+                        cookies,
                        doc.contentType,
                        doc.httpStatus,
                        doc.documentBodyBytes,
--- a/code/processes/crawling-process/model/java/nu/marginalia/ContentTypes.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/ContentTypes.java
@@ -6,6 +6,7 @@ public class ContentTypes {
    public static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
            "application/xhtml",
            "text/html",
+            "application/pdf",
            "image/x-icon",
            "text/plain");

@@ -19,4 +20,9 @@ public class ContentTypes {
        return false;
    }

+    public static boolean isBinary(String contentTypeHeader) {
+        String lcHeader = contentTypeHeader.toLowerCase();
+        return lcHeader.startsWith("application/pdf");
+    }
+
 }
--- a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/SlopSerializableCrawlDataStream.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/SlopSerializableCrawlDataStream.java
@@ -37,8 +37,12 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
            public boolean filter(String url, int status, String contentType) {
                String ctLc = contentType.toLowerCase();

+                // Permit all plain text content types
                if (ctLc.startsWith("text/"))
                    return true;
+                // PDF
+                else if (ctLc.startsWith("application/pdf"))
+                    return true;
                else if (ctLc.startsWith("x-marginalia/"))
                    return true;

--- a/code/processes/crawling-process/model/java/nu/marginalia/model/body/ContentTypeLogic.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/model/body/ContentTypeLogic.java
@@ -10,7 +10,7 @@ import java.util.regex.Pattern;

 public class ContentTypeLogic {

-    private static final Predicate<String> probableHtmlPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md)$").asMatchPredicate();
+    private static final Predicate<String> probableGoodPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md|pdf)$").asMatchPredicate();
    private static final Predicate<String> probableBinaryPattern = Pattern.compile("^.*\\.[a-z]+$").asMatchPredicate();
    private static final Set<String> blockedContentTypes = Set.of("text/css", "text/javascript");
    private static final List<String> acceptedContentTypePrefixes = List.of(
@@ -22,6 +22,7 @@ public class ContentTypeLogic {
            "application/rss+xml",
            "application/x-rss+xml",
            "application/rdf+xml",
+            "application/pdf",
            "x-rss+xml"
    );
    private boolean allowAllContentTypes = false;
@@ -34,7 +35,7 @@ public class ContentTypeLogic {
    public boolean isUrlLikeBinary(EdgeUrl url) {
        String pathLowerCase = url.path.toLowerCase();

-        if (probableHtmlPattern.test(pathLowerCase))
+        if (probableGoodPattern.test(pathLowerCase))
            return false;

        return probableBinaryPattern.test(pathLowerCase);
--- a/code/processes/crawling-process/model/java/nu/marginalia/slop/SlopCrawlDataRecord.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/slop/SlopCrawlDataRecord.java
@@ -216,6 +216,11 @@ public record SlopCrawlDataRecord(String domain,
            return false;
        }

+        // If the format is binary, we don't want to translate it if the response is truncated
+        if (response.truncated() != WarcTruncationReason.NOT_TRUNCATED && ContentTypes.isBinary(contentType)) {
+            return false;
+        }
+
        return true;
    }

--- a/code/processes/crawling-process/test/nu/marginalia/crawl/fetcher/HttpFetcherImplContentTypeProbeTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/fetcher/HttpFetcherImplContentTypeProbeTest.java
@@ -11,6 +11,8 @@ import org.junit.jupiter.api.*;
 import java.io.IOException;
 import java.net.URISyntaxException;

+import static org.junit.jupiter.api.Assertions.assertEquals;
+
@Tag("slow")
 class HttpFetcherImplContentTypeProbeTest {

@@ -85,55 +87,59 @@ class HttpFetcherImplContentTypeProbeTest {

    @AfterEach
    public void tearDown() throws IOException {
+        var stats = fetcher.getPoolStats();
+        assertEquals(0, stats.getLeased());
+        assertEquals(0, stats.getPending());
+
        fetcher.close();
    }

    @Test
    public void testProbeContentTypeHtmlShortcircuitPath() throws URISyntaxException {
-        var result = fetcher.probeContentType(new EdgeUrl("https://localhost/test.html"), new CrawlDelayTimer(50), ContentTags.empty());
-        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Ok.class, result);
+        var result = fetcher.probeContentType(new EdgeUrl("https://localhost/test.html"), new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
+        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.NoOp.class, result);
    }


    @Test
    public void testProbeContentTypeHtmlShortcircuitTags() {
-        var result = fetcher.probeContentType(contentTypeBinaryUrl, new CrawlDelayTimer(50), new ContentTags("a", "b"));
-        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Ok.class, result);
+        var result = fetcher.probeContentType(contentTypeBinaryUrl, new DomainCookies(), new CrawlDelayTimer(50), new ContentTags("a", "b"));
+        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.NoOp.class, result);
    }

    @Test
    public void testProbeContentTypeHtml() {
-        var result = fetcher.probeContentType(contentTypeHtmlUrl, new CrawlDelayTimer(50), ContentTags.empty());
+        var result = fetcher.probeContentType(contentTypeHtmlUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
        Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Ok(contentTypeHtmlUrl), result);
    }

    @Test
    public void testProbeContentTypeBinary() {
-        var result = fetcher.probeContentType(contentTypeBinaryUrl, new CrawlDelayTimer(50), ContentTags.empty());
+        var result = fetcher.probeContentType(contentTypeBinaryUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
        Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.BadContentType("application/octet-stream", 200), result);
    }

    @Test
    public void testProbeContentTypeRedirect() {
-        var result = fetcher.probeContentType(redirectUrl, new CrawlDelayTimer(50), ContentTags.empty());
+        var result = fetcher.probeContentType(redirectUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
        Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Redirect(contentTypeHtmlUrl), result);
    }

    @Test
    public void testProbeContentTypeBadHttpStatus() {
-        var result = fetcher.probeContentType(badHttpStatusUrl, new CrawlDelayTimer(50), ContentTags.empty());
+        var result = fetcher.probeContentType(badHttpStatusUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
        Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.HttpError(500, "Bad status code"), result);
    }

    @Test
    public void testOnlyGetAllowed() {
-        var result = fetcher.probeContentType(onlyGetAllowedUrl, new CrawlDelayTimer(50), ContentTags.empty());
+        var result = fetcher.probeContentType(onlyGetAllowedUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
        Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Ok(onlyGetAllowedUrl), result);
    }

    @Test
    public void testTimeout() {
-        var result = fetcher.probeContentType(timeoutUrl, new CrawlDelayTimer(50), ContentTags.empty());
+        var result = fetcher.probeContentType(timeoutUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Timeout.class, result);
    }

--- a/code/processes/crawling-process/test/nu/marginalia/crawl/fetcher/HttpFetcherImplDomainProbeTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/fetcher/HttpFetcherImplDomainProbeTest.java
@@ -12,6 +12,8 @@ import org.junit.jupiter.api.*;
 import java.io.IOException;
 import java.net.URISyntaxException;

+import static org.junit.jupiter.api.Assertions.assertEquals;
+
@Tag("slow")
 class HttpFetcherImplDomainProbeTest {

@@ -47,6 +49,10 @@ class HttpFetcherImplDomainProbeTest {

    @AfterEach
    public void tearDown() throws IOException {
+        var stats = fetcher.getPoolStats();
+        assertEquals(0, stats.getLeased());
+        assertEquals(0, stats.getPending());
+
        fetcher.close();
    }

--- a/code/processes/crawling-process/test/nu/marginalia/crawl/fetcher/HttpFetcherImplFetchTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/fetcher/HttpFetcherImplFetchTest.java
@@ -31,6 +31,7 @@ class HttpFetcherImplFetchTest {
    private static String lastModified = "Wed, 21 Oct 2024 07:28:00 GMT";

    private static EdgeUrl okUrl;
+    private static EdgeUrl okUrlSetsCookie;
    private static EdgeUrl okRangeResponseUrl;
    private static EdgeUrl okUrlWith304;

@@ -39,6 +40,8 @@ class HttpFetcherImplFetchTest {
    private static EdgeUrl badHttpStatusUrl;
    private static EdgeUrl keepAliveUrl;

+    private static EdgeUrl pdfUrl;
+
    @BeforeAll
    public static void setupAll() throws URISyntaxException {
        wireMockServer =
@@ -88,6 +91,19 @@ class HttpFetcherImplFetchTest {
                        .withStatus(200)
                        .withBody("Hello World")));

+        okUrlSetsCookie = new EdgeUrl("http://localhost:18089/okSetCookie.bin");
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(okUrlSetsCookie.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withHeader("Set-Cookie", "test=1")
+                        .withStatus(200)));
+        wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(okUrlSetsCookie.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withHeader("Set-Cookie", "test=1")
+                        .withStatus(200)
+                        .withBody("Hello World")));
+
        okUrlWith304 = new EdgeUrl("http://localhost:18089/ok304.bin");
        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(okUrlWith304.path))
                .willReturn(WireMock.aResponse()
@@ -117,6 +133,15 @@ class HttpFetcherImplFetchTest {
                        .withHeader("Keep-Alive", "max=4, timeout=30")
                        .withBody("Hello")
                        ));
+
+
+        pdfUrl = new EdgeUrl("http://localhost:18089/test.pdf");
+        wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(pdfUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "application/pdf")
+                        .withStatus(200)
+                        .withBody("Hello World")));
+
        wireMockServer.start();

    }
@@ -134,20 +159,31 @@ class HttpFetcherImplFetchTest {
    public void setUp() throws IOException {
        fetcher = new HttpFetcherImpl(new UserAgent("test.marginalia.nu", "test.marginalia.nu"));
        warcFile = Files.createTempFile(getClass().getSimpleName(), ".warc");
-        warcRecorder = new WarcRecorder(warcFile, fetcher);
+        warcRecorder = new WarcRecorder(warcFile);
    }

    @AfterEach
    public void tearDown() throws IOException {
+        var stats = fetcher.getPoolStats();
+        assertEquals(0, stats.getLeased());
+        assertEquals(0, stats.getPending());
+
+        System.out.println(stats);
+
        fetcher.close();
        warcRecorder.close();
        Files.deleteIfExists(warcFile);
    }


+    @Test
+    public void testFoo() {
+        fetcher.fetchSitemapUrls("https://www.marginalia.nu/sitemap.xml", new CrawlDelayTimer(100));
+    }
+
    @Test
    public void testOk_NoProbe() throws IOException {
-        var result = fetcher.fetchContent(okUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+        var result = fetcher.fetchContent(okUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);

        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
        Assertions.assertTrue(result.isOk());
@@ -158,12 +194,29 @@ class HttpFetcherImplFetchTest {
        Assertions.assertInstanceOf(WarcResponse.class, warcRecords.get(1));

        WarcResponse response = (WarcResponse) warcRecords.get(1);
-        assertEquals("0", response.headers().first("X-Has-Cookies").orElse("0"));
+        assertEquals("0", response.http().headers().first("X-Has-Cookies").orElse("0"));
+    }
+
+    @Test
+    public void testOkSetsCookie() throws IOException {
+        var cookies = new DomainCookies();
+        var result = fetcher.fetchContent(okUrlSetsCookie, warcRecorder, cookies, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+
+        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
+        Assertions.assertTrue(result.isOk());
+
+        List<WarcRecord> warcRecords = getWarcRecords();
+        assertEquals(2, warcRecords.size());
+        Assertions.assertInstanceOf(WarcRequest.class, warcRecords.get(0));
+        Assertions.assertInstanceOf(WarcResponse.class, warcRecords.get(1));
+
+        WarcResponse response = (WarcResponse) warcRecords.get(1);
+        assertEquals("1", response.http().headers().first("X-Has-Cookies").orElse("0"));
    }

    @Test
    public void testOk_FullProbe() {
-        var result = fetcher.fetchContent(okUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+        var result = fetcher.fetchContent(okUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);

        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
        Assertions.assertTrue(result.isOk());
@@ -171,7 +224,7 @@ class HttpFetcherImplFetchTest {

    @Test
    public void testOk304_NoProbe() {
-        var result = fetcher.fetchContent(okUrlWith304, warcRecorder, new CrawlDelayTimer(1000), new ContentTags(etag, lastModified), HttpFetcher.ProbeType.DISABLED);
+        var result = fetcher.fetchContent(okUrlWith304, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), new ContentTags(etag, lastModified), HttpFetcher.ProbeType.DISABLED);

        Assertions.assertInstanceOf(HttpFetchResult.Result304Raw.class, result);
        System.out.println(result);
@@ -180,7 +233,7 @@ class HttpFetcherImplFetchTest {

    @Test
    public void testOk304_FullProbe() {
-        var result = fetcher.fetchContent(okUrlWith304, warcRecorder, new CrawlDelayTimer(1000), new ContentTags(etag, lastModified), HttpFetcher.ProbeType.FULL);
+        var result = fetcher.fetchContent(okUrlWith304, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), new ContentTags(etag, lastModified), HttpFetcher.ProbeType.FULL);

        Assertions.assertInstanceOf(HttpFetchResult.Result304Raw.class, result);
        System.out.println(result);
@@ -188,7 +241,7 @@ class HttpFetcherImplFetchTest {

    @Test
    public void testBadStatus_NoProbe() throws IOException {
-        var result = fetcher.fetchContent(badHttpStatusUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+        var result = fetcher.fetchContent(badHttpStatusUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);

        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
        Assertions.assertFalse(result.isOk());
@@ -202,7 +255,7 @@ class HttpFetcherImplFetchTest {

    @Test
    public void testBadStatus_FullProbe() {
-        var result = fetcher.fetchContent(badHttpStatusUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+        var result = fetcher.fetchContent(badHttpStatusUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);

        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
        Assertions.assertFalse(result.isOk());
@@ -212,7 +265,7 @@ class HttpFetcherImplFetchTest {

    @Test
    public void testRedirect_NoProbe() throws URISyntaxException, IOException {
-        var result = fetcher.fetchContent(redirectUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+        var result = fetcher.fetchContent(redirectUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);

        Assertions.assertInstanceOf(HttpFetchResult.ResultRedirect.class, result);
        assertEquals(new EdgeUrl("http://localhost:18089/test.html.bin"), ((HttpFetchResult.ResultRedirect) result).url());
@@ -225,7 +278,7 @@ class HttpFetcherImplFetchTest {

    @Test
    public void testRedirect_FullProbe() throws URISyntaxException {
-        var result = fetcher.fetchContent(redirectUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+        var result = fetcher.fetchContent(redirectUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);

        Assertions.assertInstanceOf(HttpFetchResult.ResultRedirect.class, result);
        assertEquals(new EdgeUrl("http://localhost:18089/test.html.bin"), ((HttpFetchResult.ResultRedirect) result).url());
@@ -238,7 +291,7 @@ class HttpFetcherImplFetchTest {
    public void testFetchTimeout_NoProbe() throws IOException, URISyntaxException {
        Instant requestStart = Instant.now();

-        var result = fetcher.fetchContent(timeoutUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+        var result = fetcher.fetchContent(timeoutUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);

        Assertions.assertInstanceOf(HttpFetchResult.ResultException.class, result);

@@ -262,7 +315,7 @@ class HttpFetcherImplFetchTest {

    @Test
    public void testRangeResponse() throws IOException {
-        var result = fetcher.fetchContent(okRangeResponseUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+        var result = fetcher.fetchContent(okRangeResponseUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);

        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
        Assertions.assertTrue(result.isOk());
@@ -279,7 +332,7 @@ class HttpFetcherImplFetchTest {
    @Test
    public void testFetchTimeout_Probe() throws IOException, URISyntaxException {
        Instant requestStart = Instant.now();
-        var result = fetcher.fetchContent(timeoutUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+        var result = fetcher.fetchContent(timeoutUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
        Instant requestEnd = Instant.now();

        Assertions.assertInstanceOf(HttpFetchResult.ResultException.class, result);
@@ -302,7 +355,15 @@ class HttpFetcherImplFetchTest {
    @Test
    public void testKeepaliveUrl() {
        // mostly for smoke testing and debugger utility
-        var result = fetcher.fetchContent(keepAliveUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+        var result = fetcher.fetchContent(keepAliveUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+
+        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
+        Assertions.assertTrue(result.isOk());
+    }
+
+    @Test
+    public void testPdf() {
+        var result = fetcher.fetchContent(pdfUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);

        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
        Assertions.assertTrue(result.isOk());
@@ -319,6 +380,13 @@ class HttpFetcherImplFetchTest {
            WarcXEntityRefused.register(reader);

            for (var record : reader) {
+                // Load the body, we need to do this before we close the reader to have access to the content.
+                if (record instanceof WarcRequest req) {
+                    req.http();
+                } else if (record instanceof WarcResponse rsp) {
+                    rsp.http();
+                }
+
                records.add(record);
            }
        }
--- a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizerTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizerTest.java
@@ -1,12 +1,12 @@
 package nu.marginalia.crawl.retreival;

+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import org.apache.hc.client5.http.classic.HttpClient;
-import org.apache.hc.client5.http.cookie.BasicCookieStore;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.impl.classic.HttpClients;
-import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -45,7 +45,7 @@ class CrawlerWarcResynchronizerTest {

    @Test
    void run() throws IOException, URISyntaxException {
-        try (var oldRecorder = new WarcRecorder(fileName, new BasicCookieStore())) {
+        try (var oldRecorder = new WarcRecorder(fileName)) {
            fetchUrl(oldRecorder, "https://www.marginalia.nu/");
            fetchUrl(oldRecorder, "https://www.marginalia.nu/log/");
            fetchUrl(oldRecorder, "https://www.marginalia.nu/feed/");
@@ -55,7 +55,7 @@ class CrawlerWarcResynchronizerTest {

        var crawlFrontier = new DomainCrawlFrontier(new EdgeDomain("www.marginalia.nu"), List.of(), 100);

-        try (var newRecorder = new WarcRecorder(outputFile, new BasicCookieStore())) {
+        try (var newRecorder = new WarcRecorder(outputFile)) {
            new CrawlerWarcResynchronizer(crawlFrontier, newRecorder).run(fileName);
        }

@@ -78,10 +78,10 @@ class CrawlerWarcResynchronizerTest {
    }

    void fetchUrl(WarcRecorder recorder, String url) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
-        var req = ClassicRequestBuilder.get(new java.net.URI(url))
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .build();
-        recorder.fetch(httpClient, req);
+        HttpGet request = new HttpGet(url);
+        request.addHeader("User-agent", "test.marginalia.nu");
+        request.addHeader("Accept-Encoding", "gzip");
+
+        recorder.fetch(httpClient, new DomainCookies(), request);
    }
 }
--- a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/ContentTypeProberTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/ContentTypeProberTest.java
@@ -2,6 +2,7 @@ package nu.marginalia.crawl.retreival.fetcher;

 import com.sun.net.httpserver.HttpServer;
 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.retreival.CrawlDelayTimer;
@@ -88,7 +89,7 @@ class ContentTypeProberTest {

    @Test
    void probeContentTypeOk() throws Exception {
-        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlEndpoint, new CrawlDelayTimer(50), ContentTags.empty());
+        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());

        System.out.println(result);

@@ -97,7 +98,7 @@ class ContentTypeProberTest {

    @Test
    void probeContentTypeRedir() throws Exception {
-        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlRedirEndpoint, new CrawlDelayTimer(50), ContentTags.empty());
+        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlRedirEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());

        System.out.println(result);

@@ -106,7 +107,7 @@ class ContentTypeProberTest {

    @Test
    void probeContentTypeBad() throws Exception {
-        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(binaryEndpoint, new CrawlDelayTimer(50), ContentTags.empty());
+        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(binaryEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());

        System.out.println(result);

@@ -115,7 +116,7 @@ class ContentTypeProberTest {

    @Test
    void probeContentTypeTimeout() throws Exception {
-        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(timeoutEndpoint, new CrawlDelayTimer(50), ContentTags.empty());
+        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(timeoutEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());

        System.out.println(result);

--- a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderFakeServerTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderFakeServerTest.java
@@ -1,11 +1,11 @@
 package nu.marginalia.crawl.retreival.fetcher;

 import com.sun.net.httpserver.HttpServer;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import org.apache.hc.client5.http.classic.HttpClient;
-import org.apache.hc.client5.http.cookie.BasicCookieStore;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.impl.classic.HttpClients;
-import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
 import org.junit.jupiter.api.*;
 import org.netpreserve.jwarc.WarcReader;
 import org.netpreserve.jwarc.WarcRequest;
@@ -51,14 +51,14 @@ class WarcRecorderFakeServerTest {
                os.write("<html><body>hello</body></html>".getBytes());
                os.flush();
                try {
-                    TimeUnit.SECONDS.sleep(1);
+                    TimeUnit.SECONDS.sleep(2);
                } catch (InterruptedException e) {
                    throw new RuntimeException(e);
                }
                os.write(":".getBytes());
                os.flush();
                try {
-                    TimeUnit.SECONDS.sleep(1);
+                    TimeUnit.SECONDS.sleep(2);
                } catch (InterruptedException e) {
                    throw new RuntimeException(e);
                }
@@ -89,24 +89,22 @@ class WarcRecorderFakeServerTest {
        fileNameWarc = Files.createTempFile("test", ".warc");
        fileNameParquet = Files.createTempFile("test", ".parquet");

-        client = new WarcRecorder(fileNameWarc, new BasicCookieStore());
+        client = new WarcRecorder(fileNameWarc);
    }

    @AfterEach
    public void tearDown() throws Exception {
+
        client.close();
        Files.delete(fileNameWarc);
    }

    @Test
    public void fetchFast() throws Exception {
-        client.fetch(httpClient,
-                ClassicRequestBuilder
-                        .get(new java.net.URI("http://localhost:14510/fast"))
-                        .addHeader("User-agent", "test.marginalia.nu")
-                        .addHeader("Accept-Encoding", "gzip")
-                        .build()
-        );
+        HttpGet request = new HttpGet("http://localhost:14510/fast");
+        request.addHeader("User-agent", "test.marginalia.nu");
+        request.addHeader("Accept-Encoding", "gzip");
+        client.fetch(httpClient, new DomainCookies(), request);

        Map<String, String> sampleData = new HashMap<>();
        try (var warcReader = new WarcReader(fileNameWarc)) {
@@ -127,11 +125,13 @@ class WarcRecorderFakeServerTest {
    public void fetchSlow() throws Exception {
        Instant start = Instant.now();

+        HttpGet request = new HttpGet("http://localhost:14510/slow");
+        request.addHeader("User-agent", "test.marginalia.nu");
+        request.addHeader("Accept-Encoding", "gzip");
+
        client.fetch(httpClient,
-                ClassicRequestBuilder.get(new java.net.URI("http://localhost:14510/slow"))
-                        .addHeader("User-agent", "test.marginalia.nu")
-                        .addHeader("Accept-Encoding", "gzip")
-                        .build(),
+                new DomainCookies(),
+                request,
                Duration.ofSeconds(1)
        );
        Instant end = Instant.now();
@@ -149,6 +149,8 @@ class WarcRecorderFakeServerTest {
            });
        }

+        System.out.println(
+                Files.readString(fileNameWarc));
        System.out.println(sampleData);

        // Timeout is set to 1 second, but the server will take 5 seconds to respond,
--- a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java
@@ -2,14 +2,14 @@ package nu.marginalia.crawl.retreival.fetcher;

 import nu.marginalia.UserAgent;
 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
+import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
-import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
+import nu.marginalia.slop.SlopCrawlDataRecord;
 import org.apache.hc.client5.http.classic.HttpClient;
-import org.apache.hc.client5.http.cookie.BasicCookieStore;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.impl.classic.HttpClients;
-import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -24,13 +24,14 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.security.NoSuchAlgorithmException;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;

 import static org.junit.jupiter.api.Assertions.assertEquals;

 class WarcRecorderTest {
    Path fileNameWarc;
-    Path fileNameParquet;
+    Path fileNameSlop;
    WarcRecorder client;

    HttpClient httpClient;
@@ -39,9 +40,9 @@ class WarcRecorderTest {
        httpClient = HttpClients.createDefault();

        fileNameWarc = Files.createTempFile("test", ".warc");
-        fileNameParquet = Files.createTempFile("test", ".parquet");
+        fileNameSlop = Files.createTempFile("test", ".slop.zip");

-        client = new WarcRecorder(fileNameWarc, new BasicCookieStore());
+        client = new WarcRecorder(fileNameWarc);
    }

    @AfterEach
@@ -52,12 +53,12 @@ class WarcRecorderTest {

    @Test
    void fetch() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
-        client.fetch(httpClient,
-                ClassicRequestBuilder.get(new java.net.URI("https://www.marginalia.nu/"))
-                        .addHeader("User-agent", "test.marginalia.nu")
-                        .addHeader("Accept-Encoding", "gzip")
-                        .build()
-        );
+
+        HttpGet request = new HttpGet("https://www.marginalia.nu/");
+        request.addHeader("User-agent", "test.marginalia.nu");
+        request.addHeader("Accept-Encoding", "gzip");
+
+        client.fetch(httpClient, new DomainCookies(), request);

        Map<String, String> sampleData = new HashMap<>();
        try (var warcReader = new WarcReader(fileNameWarc)) {
@@ -78,8 +79,9 @@ class WarcRecorderTest {
    @Test
    public void flagAsSkipped() throws IOException, URISyntaxException {

-        try (var recorder = new WarcRecorder(fileNameWarc, new BasicCookieStore())) {
+        try (var recorder = new WarcRecorder(fileNameWarc)) {
            recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
+                    new DomainCookies(),
                    "text/html",
                    200,
                    "<?doctype html><html><body>test</body></html>".getBytes(),
@@ -102,8 +104,9 @@ class WarcRecorderTest {
    @Test
    public void flagAsSkippedNullBody() throws IOException, URISyntaxException {

-        try (var recorder = new WarcRecorder(fileNameWarc, new BasicCookieStore())) {
+        try (var recorder = new WarcRecorder(fileNameWarc)) {
            recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
+                    new DomainCookies(),
                    "text/html",
                    200,
                    null,
@@ -114,8 +117,9 @@ class WarcRecorderTest {

    @Test
    public void testSaveImport() throws URISyntaxException, IOException {
-        try (var recorder = new WarcRecorder(fileNameWarc, new BasicCookieStore())) {
+        try (var recorder = new WarcRecorder(fileNameWarc)) {
            recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
+                    new DomainCookies(),
                    "text/html",
                    200,
                    "<?doctype html><html><body>test</body></html>".getBytes(),
@@ -138,35 +142,46 @@ class WarcRecorderTest {

    @Test
    public void testConvertToParquet() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
-        client.fetch(httpClient, ClassicRequestBuilder
-                .get(new java.net.URI("https://www.marginalia.nu/"))
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .build());
+        HttpGet request1 = new HttpGet("https://www.marginalia.nu/");
+        request1.addHeader("User-agent", "test.marginalia.nu");
+        request1.addHeader("Accept-Encoding", "gzip");

-        client.fetch(httpClient, ClassicRequestBuilder
-                .get(new java.net.URI("https://www.marginalia.nu/log/"))
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .build());
+        client.fetch(httpClient, new DomainCookies(), request1);

-        client.fetch(httpClient, ClassicRequestBuilder
-                .get(new java.net.URI("https://www.marginalia.nu/sanic.png"))
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .build());
+        HttpGet request2 = new HttpGet("https://www.marginalia.nu/log/");
+        request2.addHeader("User-agent", "test.marginalia.nu");
+        request2.addHeader("Accept-Encoding", "gzip");

-        CrawledDocumentParquetRecordFileWriter.convertWarc(
+        client.fetch(httpClient, new DomainCookies(), request2);
+
+        HttpGet request3 = new HttpGet("https://www.marginalia.nu/sanic.png");
+        request3.addHeader("User-agent", "test.marginalia.nu");
+        request3.addHeader("Accept-Encoding", "gzip");
+
+        client.fetch(httpClient, new DomainCookies(), request3);
+
+        HttpGet request4 = new HttpGet("https://downloads.marginalia.nu/test.pdf");
+        request4.addHeader("User-agent", "test.marginalia.nu");
+        request4.addHeader("Accept-Encoding", "gzip");
+
+        client.fetch(httpClient, new DomainCookies(), request4);
+
+        SlopCrawlDataRecord.convertWarc(
                "www.marginalia.nu",
                new UserAgent("test", "test"),
                fileNameWarc,
-                fileNameParquet);
+                fileNameSlop);

-        var urls = CrawledDocumentParquetRecordFileReader.stream(fileNameParquet).map(doc -> doc.url).toList();
-        assertEquals(2, urls.size());
+        List<String> urls;
+        try (var stream = SerializableCrawlDataStream.openDataStream(fileNameSlop)) {
+            urls = stream.docsAsList().stream().map(doc -> doc.url.toString()).toList();
+        }
+
+        assertEquals(3, urls.size());
        assertEquals("https://www.marginalia.nu/", urls.get(0));
        assertEquals("https://www.marginalia.nu/log/", urls.get(1));
        // sanic.jpg gets filtered out for its bad mime type
+        assertEquals("https://downloads.marginalia.nu/test.pdf", urls.get(2));

    }

--- a/code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java
@@ -1,6 +1,7 @@
 package nu.marginalia.crawling;

 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
@@ -31,7 +32,7 @@ class HttpFetcherTest {
    void fetchUTF8() throws Exception {
        var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
        try (var recorder = new WarcRecorder()) {
-            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, new DomainCookies(), new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
            if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
                System.out.println(bodyOk.contentType());
            }
@@ -49,7 +50,7 @@ class HttpFetcherTest {
        var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");

        try (var recorder = new WarcRecorder()) {
-            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, new DomainCookies(), new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
            if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
                System.out.println(bodyOk.contentType());
            }
--- a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
@@ -3,10 +3,7 @@ package nu.marginalia.crawling.retreival;
 import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.crawl.DomainStateDb;
-import nu.marginalia.crawl.fetcher.ContentTags;
-import nu.marginalia.crawl.fetcher.HttpFetcher;
-import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
-import nu.marginalia.crawl.fetcher.SitemapRetriever;
+import nu.marginalia.crawl.fetcher.*;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.CrawlDelayTimer;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
@@ -137,7 +134,7 @@ public class CrawlerMockFetcherTest {
        }

        @Override
-        public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, CrawlDelayTimer timer, ContentTags tags, ProbeType probeType) {
+        public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, DomainCookies cookies, CrawlDelayTimer timer, ContentTags tags, ProbeType probeType) {
            logger.info("Fetching {}", url);
            if (mockData.containsKey(url)) {
                byte[] bodyBytes = mockData.get(url).documentBodyBytes;
--- a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
@@ -16,7 +16,6 @@ import nu.marginalia.model.crawldata.CrawledDocument;
 import nu.marginalia.model.crawldata.CrawledDomain;
 import nu.marginalia.model.crawldata.SerializableCrawlData;
 import nu.marginalia.slop.SlopCrawlDataRecord;
-import org.apache.hc.client5.http.cookie.BasicCookieStore;
 import org.jetbrains.annotations.NotNull;
 import org.junit.jupiter.api.*;
 import org.netpreserve.jwarc.*;
@@ -180,7 +179,7 @@ class CrawlerRetreiverTest {
                new EdgeDomain("www.marginalia.nu"),
                List.of(), 100);
        var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
-                new WarcRecorder(tempFileWarc2, new BasicCookieStore())
+                new WarcRecorder(tempFileWarc2)
        );

        // truncate the size of the file to simulate a crash
@@ -456,7 +455,7 @@ class CrawlerRetreiverTest {
                List.of(), 100);

        var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
-                new WarcRecorder(tempFileWarc3, new BasicCookieStore())
+                new WarcRecorder(tempFileWarc3)
        );

        // truncate the size of the file to simulate a crash
@@ -507,7 +506,7 @@ class CrawlerRetreiverTest {
    }

    private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, CrawlDataReference reference) {
-        try (var recorder = new WarcRecorder(tempFileWarc2, new BasicCookieStore());
+        try (var recorder = new WarcRecorder(tempFileWarc2);
             var db = new DomainStateDb(tempFileDb)
        ) {
            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(), reference);
@@ -519,7 +518,7 @@ class CrawlerRetreiverTest {

    @NotNull
    private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlerMain.CrawlSpecRecord specs) {
-        try (var recorder = new WarcRecorder(tempFileWarc1, new BasicCookieStore());
+        try (var recorder = new WarcRecorder(tempFileWarc1);
             var db = new DomainStateDb(tempFileDb)
        ) {
            var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder);
--- a/code/services-application/search-service/resources/jte/serp/part/searchform.jte
+++ b/code/services-application/search-service/resources/jte/serp/part/searchform.jte
@@ -13,7 +13,7 @@
                   class="shadow-inner flex-1 dark:bg-black dark:text-gray-100 bg-gray-50 border dark:border-gray-600 border-gray-300 text-gray-900 text-sm rounded-sm block w-full p-2.5"
                   value="${query}"
                   autofocus
-                   placeholder="Search..."
+                   placeholder="Search the web!"
                   autocomplete="off"
                   name="query"
                   id="searchInput" />
@@ -21,7 +21,7 @@
            <input type="text"
                   class="shadow-inner flex-1 dark:bg-black dark:text-gray-100 bg-gray-50 border dark:border-gray-600 border-gray-300 text-gray-900 text-sm rounded-sm block w-full p-2.5"
                   value="${query}"
-                   placeholder="Search..."
+                   placeholder="Search the web!"
                   autocomplete="off"
                   name="query"
                   id="searchInput" />
--- a/code/services-core/assistant-service/java/nu/marginalia/assistant/AssistantModule.java
+++ b/code/services-core/assistant-service/java/nu/marginalia/assistant/AssistantModule.java
@@ -10,7 +10,7 @@ import static com.google.inject.name.Names.named;

 public class AssistantModule extends AbstractModule {
    public void configure() {
-        bind(Path.class).annotatedWith(named("suggestions-file")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions.txt"));
+        bind(Path.class).annotatedWith(named("suggestions-file")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions2.txt.gz"));

        bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
    }
--- a/code/services-core/assistant-service/java/nu/marginalia/assistant/suggest/PrefixSearchStructure.java
+++ b/code/services-core/assistant-service/java/nu/marginalia/assistant/suggest/PrefixSearchStructure.java
@@ -0,0 +1,459 @@
+package nu.marginalia.assistant.suggest;
+
+import gnu.trove.list.array.TIntArrayList;
+
+import java.util.*;
+
+/** Unhinged data structure for fast prefix searching.
+ */
+public class PrefixSearchStructure {
+    // Core data structures
+    private final HashMap<String, TIntArrayList> prefixIndex;     // Short prefix index (up to 8 chars)
+    private final HashMap<String, TIntArrayList> longPrefixIndex; // Long prefix index (9-16 chars)
+    private final ArrayList<String> words;                        // All words by ID
+    private final TIntArrayList wordScores;                       // Scores for all words
+
+    // Configuration
+    private static final int SHORT_PREFIX_LENGTH = 8;
+    private static final int MAX_INDEXED_PREFIX_LENGTH = 16;
+
+    public int size() {
+        return words.size();
+    }
+
+    // For sorting efficiency
+    private static class WordScorePair {
+        final String word;
+        final int score;
+
+        WordScorePair(String word, int score) {
+            this.word = word;
+            this.score = score;
+        }
+    }
+
+    /**
+     * Creates a new PrefixTrie for typeahead search.
+     */
+    public PrefixSearchStructure() {
+        prefixIndex = new HashMap<>(1024);
+        longPrefixIndex = new HashMap<>(1024);
+        words = new ArrayList<>(1024);
+        wordScores = new TIntArrayList(1024);
+    }
+
+    /**
+     * Adds a prefix to the index.
+     */
+    private void indexPrefix(String word, int wordId) {
+        // Index short prefixes
+        for (int i = 1; i <= Math.min(word.length(), SHORT_PREFIX_LENGTH); i++) {
+            String prefix = word.substring(0, i);
+            TIntArrayList wordIds = prefixIndex.computeIfAbsent(
+                    prefix, k -> new TIntArrayList(16));
+            wordIds.add(wordId);
+        }
+
+        // Index longer prefixes
+        for (int i = SHORT_PREFIX_LENGTH + 1; i <= Math.min(word.length(), MAX_INDEXED_PREFIX_LENGTH); i++) {
+            String prefix = word.substring(0, i);
+            TIntArrayList wordIds = longPrefixIndex.computeIfAbsent(
+                    prefix, k -> new TIntArrayList(8));
+            wordIds.add(wordId);
+        }
+
+        // If the word contains spaces, also index by each term for multi-word queries
+        if (word.contains(" ")) {
+            String[] terms = word.split("\\s+");
+            for (String term : terms) {
+                if (term.length() >= 2) {
+                    for (int i = 1; i <= Math.min(term.length(), SHORT_PREFIX_LENGTH); i++) {
+                        String termPrefix = "t:" + term.substring(0, i);
+                        TIntArrayList wordIds = prefixIndex.computeIfAbsent(
+                                termPrefix, k -> new TIntArrayList(16));
+                        wordIds.add(wordId);
+                    }
+                }
+            }
+        }
+    }
+
+    /**
+     * Inserts a word with its associated score.
+     */
+    public void insert(String word, int score) {
+        if (word == null || word.isEmpty()) {
+            return;
+        }
+
+        // Add to the word list and index
+        int wordId = words.size();
+        words.add(word);
+        wordScores.add(score);
+        indexPrefix(word, wordId);
+    }
+
+    /**
+     * Returns the top k completions for a given prefix.
+     */
+    public List<ScoredSuggestion> getTopCompletions(String prefix, int k) {
+        if (prefix == null || prefix.isEmpty()) {
+            // Return top k words by score
+            return getTopKWords(k);
+        }
+
+        // Check if this is a term search (t:) - for searching within multi-word items
+        boolean isTermSearch = false;
+        if (prefix.startsWith("t:") && prefix.length() > 2) {
+            isTermSearch = true;
+            prefix = prefix.substring(2);
+        }
+
+        // 1. Fast path for short prefixes
+        if (prefix.length() <= SHORT_PREFIX_LENGTH) {
+            String lookupPrefix = isTermSearch ? "t:" + prefix : prefix;
+            TIntArrayList wordIds = prefixIndex.get(lookupPrefix);
+            if (wordIds != null) {
+                return getTopKFromWordIds(wordIds, k);
+            }
+        }
+
+        // 2. Fast path for long prefixes (truncate to MAX_INDEXED_PREFIX_LENGTH)
+        if (prefix.length() > SHORT_PREFIX_LENGTH) {
+            // Try exact match in longPrefixIndex first
+            if (prefix.length() <= MAX_INDEXED_PREFIX_LENGTH) {
+                TIntArrayList wordIds = longPrefixIndex.get(prefix);
+                if (wordIds != null) {
+                    return getTopKFromWordIds(wordIds, k);
+                }
+            }
+
+            // If prefix is longer than MAX_INDEXED_PREFIX_LENGTH, truncate and filter
+            if (prefix.length() > MAX_INDEXED_PREFIX_LENGTH) {
+                String truncatedPrefix = prefix.substring(0, MAX_INDEXED_PREFIX_LENGTH);
+                TIntArrayList candidateIds = longPrefixIndex.get(truncatedPrefix);
+                if (candidateIds != null) {
+                    // Filter candidates by the full prefix
+                    return getFilteredTopKFromWordIds(candidateIds, prefix, k);
+                }
+            }
+        }
+
+        // 3. Optimized fallback for long prefixes - use prefix tree for segments
+        List<ScoredSuggestion> results = new ArrayList<>();
+
+        // Handle multi-segment queries by finding candidates from first 8 chars
+        if (prefix.length() > SHORT_PREFIX_LENGTH) {
+            String shortPrefix = prefix.substring(0, Math.min(prefix.length(), SHORT_PREFIX_LENGTH));
+            TIntArrayList candidates = prefixIndex.get(shortPrefix);
+
+            if (candidates != null) {
+                return getFilteredTopKFromWordIds(candidates, prefix, k);
+            }
+        }
+
+        // 4. Last resort - optimized binary search in sorted segments
+        return findByBinarySearchPrefix(prefix, k);
+    }
+
+    /**
+     * Helper to get the top k words by score.
+     */
+    private List<ScoredSuggestion> getTopKWords(int k) {
+        // Create pairs of (score, wordId)
+        int[][] pairs = new int[words.size()][2];
+        for (int i = 0; i < words.size(); i++) {
+            pairs[i][0] = wordScores.get(i);
+            pairs[i][1] = i;
+        }
+
+        // Sort by score (descending)
+        Arrays.sort(pairs, (a, b) -> Integer.compare(b[0], a[0]));
+
+        // Take top k
+        List<ScoredSuggestion> results = new ArrayList<>();
+        for (int i = 0; i < Math.min(k, pairs.length); i++) {
+            String word = words.get(pairs[i][1]);
+            int score = pairs[i][0];
+            results.add(new ScoredSuggestion(word, score));
+        }
+
+        return results;
+    }
+
+    /**
+     * Helper to get the top k words from a list of word IDs.
+     */
+    private List<ScoredSuggestion> getTopKFromWordIds(TIntArrayList wordIds, int k) {
+        if (wordIds == null || wordIds.isEmpty()) {
+            return Collections.emptyList();
+        }
+
+        // For small lists, avoid sorting
+        if (wordIds.size() <= k) {
+            List<ScoredSuggestion> results = new ArrayList<>(wordIds.size());
+            int[] ids = wordIds.toArray();
+            for (int wordId : ids) {
+                if (wordId >= 0 && wordId < words.size()) {
+                    results.add(new ScoredSuggestion(words.get(wordId), wordScores.get(wordId)));
+                }
+            }
+            results.sort((a, b) -> Integer.compare(b.getScore(), a.getScore()));
+            return results;
+        }
+
+        // For larger lists, use an array-based approach for better performance
+        // Find top k without full sorting
+        int[] topScores = new int[k];
+        int[] topWordIds = new int[k];
+        int[] ids = wordIds.toArray();
+
+        // Initialize with first k elements
+        int filledCount = Math.min(k, ids.length);
+        for (int i = 0; i < filledCount; i++) {
+            int wordId = ids[i];
+            if (wordId >= 0 && wordId < words.size()) {
+                topWordIds[i] = wordId;
+                topScores[i] = wordScores.get(wordId);
+            }
+        }
+
+        // Sort initial elements
+        for (int i = 0; i < filledCount; i++) {
+            for (int j = i + 1; j < filledCount; j++) {
+                if (topScores[j] > topScores[i]) {
+                    // Swap scores
+                    int tempScore = topScores[i];
+                    topScores[i] = topScores[j];
+                    topScores[j] = tempScore;
+
+                    // Swap word IDs
+                    int tempId = topWordIds[i];
+                    topWordIds[i] = topWordIds[j];
+                    topWordIds[j] = tempId;
+                }
+            }
+        }
+
+        // Process remaining elements
+        int minScore = filledCount > 0 ? topScores[filledCount - 1] : Integer.MIN_VALUE;
+
+        for (int i = k; i < ids.length; i++) {
+            int wordId = ids[i];
+            if (wordId >= 0 && wordId < words.size()) {
+                int score = wordScores.get(wordId);
+
+                if (score > minScore) {
+                    // Replace the lowest element
+                    topScores[filledCount - 1] = score;
+                    topWordIds[filledCount - 1] = wordId;
+
+                    // Bubble up the new element
+                    for (int j = filledCount - 1; j > 0; j--) {
+                        if (topScores[j] > topScores[j - 1]) {
+                            // Swap scores
+                            int tempScore = topScores[j];
+                            topScores[j] = topScores[j - 1];
+                            topScores[j - 1] = tempScore;
+
+                            // Swap word IDs
+                            int tempId = topWordIds[j];
+                            topWordIds[j] = topWordIds[j - 1];
+                            topWordIds[j - 1] = tempId;
+                        } else {
+                            break;
+                        }
+                    }
+
+                    // Update min score
+                    minScore = topScores[filledCount - 1];
+                }
+            }
+        }
+
+        // Create result list
+        List<ScoredSuggestion> results = new ArrayList<>(filledCount);
+        for (int i = 0; i < filledCount; i++) {
+            results.add(new ScoredSuggestion(words.get(topWordIds[i]), topScores[i]));
+        }
+
+        return results;
+    }
+
+    /**
+     * Use binary search on sorted word segments to efficiently find matches.
+     */
+    private List<ScoredSuggestion> findByBinarySearchPrefix(String prefix, int k) {
+        // If we have a lot of words, use an optimized segment approach
+        if (words.size() > 1000) {
+            // Divide words into segments for better locality
+            int segmentSize = 1000;
+            int numSegments = (words.size() + segmentSize - 1) / segmentSize;
+
+            // Find matches using binary search within each segment
+            List<WordScorePair> allMatches = new ArrayList<>();
+            for (int segment = 0; segment < numSegments; segment++) {
+                int start = segment * segmentSize;
+                int end = Math.min(start + segmentSize, words.size());
+
+                // Binary search for first potential match
+                int pos = Collections.binarySearch(
+                        words.subList(start, end),
+                        prefix,
+                        (a, b) -> a.compareTo(b)
+                );
+
+                if (pos < 0) {
+                    pos = -pos - 1;
+                }
+
+                // Collect all matches
+                for (int i = start + pos; i < end && i < words.size(); i++) {
+                    String word = words.get(i);
+                    if (word.startsWith(prefix)) {
+                        allMatches.add(new WordScorePair(word, wordScores.get(i)));
+                    } else if (word.compareTo(prefix) > 0) {
+                        break; // Past potential matches
+                    }
+                }
+            }
+
+            // Sort by score and take top k
+            allMatches.sort((a, b) -> Integer.compare(b.score, a.score));
+            List<ScoredSuggestion> results = new ArrayList<>(Math.min(k, allMatches.size()));
+            for (int i = 0; i < Math.min(k, allMatches.size()); i++) {
+                WordScorePair pair = allMatches.get(i);
+                results.add(new ScoredSuggestion(pair.word, pair.score));
+            }
+            return results;
+        }
+
+        // Fallback for small dictionaries - linear scan but optimized
+        return simpleSearchFallback(prefix, k);
+    }
+
+    /**
+     * Optimized linear scan - only used for small dictionaries.
+     */
+    private List<ScoredSuggestion> simpleSearchFallback(String prefix, int k) {
+        // Use primitive arrays for better cache locality
+        int[] matchScores = new int[Math.min(words.size(), 100)]; // Assume we won't find more than 100 matches
+        String[] matchWords = new String[matchScores.length];
+        int matchCount = 0;
+
+        for (int i = 0; i < words.size() && matchCount < matchScores.length; i++) {
+            String word = words.get(i);
+            if (word.startsWith(prefix)) {
+                matchWords[matchCount] = word;
+                matchScores[matchCount] = wordScores.get(i);
+                matchCount++;
+            }
+        }
+
+        // Sort matches by score (in-place for small arrays)
+        for (int i = 0; i < matchCount; i++) {
+            for (int j = i + 1; j < matchCount; j++) {
+                if (matchScores[j] > matchScores[i]) {
+                    // Swap scores
+                    int tempScore = matchScores[i];
+                    matchScores[i] = matchScores[j];
+                    matchScores[j] = tempScore;
+
+                    // Swap words
+                    String tempWord = matchWords[i];
+                    matchWords[i] = matchWords[j];
+                    matchWords[j] = tempWord;
+                }
+            }
+        }
+
+        // Create results
+        List<ScoredSuggestion> results = new ArrayList<>(Math.min(k, matchCount));
+        for (int i = 0; i < Math.min(k, matchCount); i++) {
+            results.add(new ScoredSuggestion(matchWords[i], matchScores[i]));
+        }
+
+        return results;
+    }
+
+    /**
+     * Get top k words from candidate IDs, filtering by the full prefix.
+     */
+    private List<ScoredSuggestion> getFilteredTopKFromWordIds(TIntArrayList wordIds, String fullPrefix, int k) {
+        if (wordIds == null || wordIds.isEmpty()) {
+            return Collections.emptyList();
+        }
+
+        // Make primitive arrays for better performance
+        String[] matchWords = new String[Math.min(wordIds.size(), 1000)];
+        int[] matchScores = new int[matchWords.length];
+        int matchCount = 0;
+
+        int[] ids = wordIds.toArray();
+        for (int i = 0; i < ids.length && matchCount < matchWords.length; i++) {
+            int wordId = ids[i];
+            if (wordId >= 0 && wordId < words.size()) {
+                String word = words.get(wordId);
+                if (word.startsWith(fullPrefix)) {
+                    matchWords[matchCount] = word;
+                    matchScores[matchCount] = wordScores.get(wordId);
+                    matchCount++;
+                }
+            }
+        }
+
+        // Sort by score (efficient insertion sort for small k)
+        for (int i = 0; i < Math.min(matchCount, k); i++) {
+            int maxPos = i;
+            for (int j = i + 1; j < matchCount; j++) {
+                if (matchScores[j] > matchScores[maxPos]) {
+                    maxPos = j;
+                }
+            }
+            if (maxPos != i) {
+                // Swap
+                int tempScore = matchScores[i];
+                matchScores[i] = matchScores[maxPos];
+                matchScores[maxPos] = tempScore;
+
+                String tempWord = matchWords[i];
+                matchWords[i] = matchWords[maxPos];
+                matchWords[maxPos] = tempWord;
+            }
+        }
+
+        // Create result list (only up to k elements)
+        List<ScoredSuggestion> results = new ArrayList<>(Math.min(k, matchCount));
+        for (int i = 0; i < Math.min(k, matchCount); i++) {
+            results.add(new ScoredSuggestion(matchWords[i], matchScores[i]));
+        }
+
+        return results;
+    }
+
+    /**
+     * Class representing a suggested completion.
+     */
+    public static class ScoredSuggestion {
+        private final String word;
+        private final int score;
+
+        public ScoredSuggestion(String word, int score) {
+            this.word = word;
+            this.score = score;
+        }
+
+        public String getWord() {
+            return word;
+        }
+
+        public int getScore() {
+            return score;
+        }
+
+        @Override
+        public String toString() {
+            return word + " (" + score + ")";
+        }
+    }
+}
--- a/code/services-core/assistant-service/java/nu/marginalia/assistant/suggest/Suggestions.java
+++ b/code/services-core/assistant-service/java/nu/marginalia/assistant/suggest/Suggestions.java
@@ -4,23 +4,24 @@ import com.google.inject.Inject;
 import com.google.inject.name.Named;
 import nu.marginalia.functions.math.dict.SpellChecker;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
-import nu.marginalia.model.crawl.HtmlFeature;
-import org.apache.commons.collections4.trie.PatriciaTrie;
 import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

+import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.*;
-import java.util.function.Supplier;
+import java.nio.file.StandardOpenOption;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Scanner;
 import java.util.regex.Pattern;
-import java.util.stream.Collectors;
-import java.util.stream.Stream;
+import java.util.zip.GZIPInputStream;

 public class Suggestions {
-    private PatriciaTrie<String> suggestionsTrie = null;
+    private PrefixSearchStructure searchStructure = null;
    private TermFrequencyDict termFrequencyDict = null;
    private volatile boolean ready = false;
    private final SpellChecker spellChecker;
@@ -37,39 +38,40 @@ public class Suggestions {
        this.spellChecker = spellChecker;

        Thread.ofPlatform().start(() -> {
-            suggestionsTrie = loadSuggestions(suggestionsFile);
+            searchStructure = loadSuggestions(suggestionsFile);
            termFrequencyDict = dict;
            ready = true;
-            logger.info("Loaded {} suggestions", suggestionsTrie.size());
+            logger.info("Loaded {} suggestions", searchStructure.size());
        });
    }

-    private static PatriciaTrie<String> loadSuggestions(Path file) {
+    private static PrefixSearchStructure loadSuggestions(Path file) {
+        PrefixSearchStructure ret = new PrefixSearchStructure();
+
        if (!Files.exists(file)) {
            logger.error("Suggestions file {} absent, loading empty suggestions db", file);
-            return new PatriciaTrie<>();
+            return ret;
        }
-        try (var lines = Files.lines(file)) {
-            var ret = new PatriciaTrie<String>();

-            lines.filter(suggestionPattern.asPredicate())
-                    .filter(line -> line.length()<32)
-                    .map(String::toLowerCase)
-                    .forEach(w -> ret.put(w, w));
-
-            // Add special keywords to the suggestions
-            for (var feature : HtmlFeature.values()) {
-                String keyword = feature.getKeyword();
-
-                ret.put(keyword, keyword);
-                ret.put("-" + keyword, "-" + keyword);
+        try (var scanner = new Scanner(new GZIPInputStream(new BufferedInputStream(Files.newInputStream(file, StandardOpenOption.READ))))) {
+            while (scanner.hasNextLine()) {
+                String line = scanner.nextLine();
+                String[] parts = StringUtils.split(line, " ", 2);
+                if (parts.length != 2) {
+                    logger.warn("Invalid suggestion line: {}", line);
+                    continue;
+                }
+                int cnt = Integer.parseInt(parts[0]);
+                if (cnt > 1) {
+                    String word = parts[1];
+                    ret.insert(word, cnt);
+                }
            }
-
            return ret;
        }
        catch (IOException ex) {
            logger.error("Failed to load suggestions file", ex);
-            return new PatriciaTrie<>();
+            return new PrefixSearchStructure();
        }
    }

@@ -83,96 +85,24 @@ public class Suggestions {

        searchWord = StringUtils.stripStart(searchWord.toLowerCase(), " ");

-        return Stream.of(
-                    new SuggestionStream("", getSuggestionsForKeyword(count, searchWord)),
-                    suggestionsForLastWord(count, searchWord),
-                    spellCheckStream(searchWord)
-                )
-                .flatMap(SuggestionsStreamable::stream)
-                .limit(count)
-                .collect(Collectors.toList());
+        return getSuggestionsForKeyword(count, searchWord);
    }

-    private SuggestionsStreamable suggestionsForLastWord(int count, String searchWord) {
-        int sp = searchWord.lastIndexOf(' ');
-
-        if (sp < 0) {
-            return Stream::empty;
-        }
-
-        String prefixString = searchWord.substring(0, sp+1);
-        String suggestString = searchWord.substring(sp+1);
-
-        return new SuggestionStream(prefixString, getSuggestionsForKeyword(count, suggestString));
-
-    }
-
-    private SuggestionsStreamable spellCheckStream(String word) {
-        int start = word.lastIndexOf(' ');
-        String prefix;
-        String corrWord;
-
-        if (start < 0) {
-            corrWord = word;
-            prefix = "";
-        }
-        else {
-            prefix = word.substring(0, start + 1);
-            corrWord = word.substring(start + 1);
-        }
-
-        if (corrWord.length() >= MIN_SUGGEST_LENGTH) {
-            Supplier<Stream<String>> suggestionsLazyEval = () -> spellChecker.correct(corrWord).stream();
-            return new SuggestionStream(prefix, Stream.of(suggestionsLazyEval).flatMap(Supplier::get));
-        }
-        else {
-            return Stream::empty;
-        }
-    }
-
-
-    public Stream<String> getSuggestionsForKeyword(int count, String prefix) {
+    public List<String> getSuggestionsForKeyword(int count, String prefix) {
        if (!ready)
-            return Stream.empty();
+            return List.of();

        if (prefix.length() < MIN_SUGGEST_LENGTH) {
-            return Stream.empty();
+            return List.of();
        }

-        var start = suggestionsTrie.select(prefix);
-
-        if (start == null) {
-            return Stream.empty();
+        var results = searchStructure.getTopCompletions(prefix, count);
+        List<String> ret = new ArrayList<>(count);
+        for (var result : results) {
+            ret.add(result.getWord());
        }

-        if (!start.getKey().startsWith(prefix)) {
-            return Stream.empty();
-        }
-
-        SuggestionsValueCalculator sv = new SuggestionsValueCalculator();
-
-        return Stream.iterate(start.getKey(), Objects::nonNull, suggestionsTrie::nextKey)
-                .takeWhile(s -> s.startsWith(prefix))
-                .limit(256)
-                .sorted(Comparator.comparing(sv::get).thenComparing(String::length).thenComparing(Comparator.naturalOrder()))
-                .limit(count);
+        return ret;
    }

-    private record SuggestionStream(String prefix, Stream<String> suggestionStream) implements SuggestionsStreamable {
-        public Stream<String> stream() {
-            return suggestionStream.map(s -> prefix + s);
-        }
-    }
-
-    interface SuggestionsStreamable { Stream<String> stream(); }
-
-    private class SuggestionsValueCalculator {
-
-        private final Map<String, Long> hashCache = new HashMap<>(512);
-
-        public int get(String s) {
-            long hash = hashCache.computeIfAbsent(s, TermFrequencyDict::getStringHash);
-            return -termFrequencyDict.getTermFreqHash(hash);
-        }
-    }
 }
--- a/code/services-core/control-service/java/nu/marginalia/control/ControlMain.java
+++ b/code/services-core/control-service/java/nu/marginalia/control/ControlMain.java
@@ -59,9 +59,9 @@ public class ControlMain extends MainClass {
            download(adblockFile, new URI("https://downloads.marginalia.nu/data/adblock.txt"));
        }

-        Path suggestionsFile = dataPath.resolve("suggestions.txt");
+        Path suggestionsFile = dataPath.resolve("suggestions2.txt.gz");
        if (!Files.exists(suggestionsFile)) {
-            downloadGzipped(suggestionsFile, new URI("https://downloads.marginalia.nu/data/suggestions.txt.gz"));
+            download(suggestionsFile, new URI("https://downloads.marginalia.nu/data/suggestions2.txt.gz"));
        }

        Path asnRawData = dataPath.resolve("asn-data-raw-table");
--- a/code/services-core/control-service/resources/templates/control/node/actions/partial-download-sample-data.hdb
+++ b/code/services-core/control-service/resources/templates/control/node/actions/partial-download-sample-data.hdb
@@ -24,25 +24,25 @@ This is a sample of real crawl data.  It is intended for demo, testing and devel
    <tr>
        <td><input id="sample-s" value="sample-s" name="sample" class="form-check-input" type="radio"></td>
        <td><label for="sample-s">Small</label></td>
-        <td>1000 Domains. About 2 GB. </td>
+        <td>1000 Domains. About 1 GB. </td>
    </tr>

    <tr>
        <td><input id="sample-m" value="sample-m" name="sample" class="form-check-input" type="radio"></td>
        <td><label for="sample-m">Medium</label></td>
-        <td>2000 Domains. About 6 GB. Recommended.</td>
+        <td>2000 Domains. About 2 GB. Recommended.</td>
    </tr>

    <tr>
        <td><input id="sample-l" value="sample-l" name="sample" class="form-check-input" type="radio"></td>
        <td><label for="sample-l">Large</label></td>
-        <td>5000 Domains.  About 20 GB.</td>
+        <td>5000 Domains.  About 7 GB.</td>
    </tr>

    <tr>
        <td><input id="sample-xl" value="sample-xl" name="sample" class="form-check-input" type="radio"></td>
        <td><label for="sample-xl">Huge</label></td>
-        <td>50,000 Domains.  Around 180 GB.  Primarily intended for pre-production like testing environments.
+        <td>50,000 Domains.  Around 80 GB.  Primarily intended for pre-production like testing environments.
            Expect hours of processing time. </td>
    </tr>
 </table>
--- a/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java
+++ b/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java
@@ -10,6 +10,7 @@ import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.converting.writer.ConverterBatchWriter;
 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.functions.searchquery.QueryFactory;
@@ -43,7 +44,6 @@ import nu.marginalia.process.control.FakeProcessHeartbeat;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.test.IntegrationTestModule;
 import nu.marginalia.test.TestUtil;
-import org.apache.hc.client5.http.cookie.BasicCookieStore;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -121,11 +121,12 @@ public class IntegrationTest {
    public void run() throws Exception {

        /** CREATE WARC */
-        try (WarcRecorder warcRecorder = new WarcRecorder(warcData, new BasicCookieStore())) {
+        try (WarcRecorder warcRecorder = new WarcRecorder(warcData)) {
            warcRecorder.writeWarcinfoHeader("127.0.0.1", new EdgeDomain("www.example.com"),
                    new HttpFetcherImpl.DomainProbeResult.Ok(new EdgeUrl("https://www.example.com/")));

            warcRecorder.writeReferenceCopy(new EdgeUrl("https://www.example.com/"),
+                    new DomainCookies(),
                    "text/html", 200,
                    """
                            <html>
Author	SHA1	Message	Date
Viktor	f0c9b935d8	Merge pull request #192 from MarginaliaSearch/improve-suggestions Improve typeahead suggestions	2025-04-23 20:17:49 +02:00
Viktor Lofgren	7b5493dd51	(assistant) Improve typeahead suggestions Implement a new prefix search structure (not a trie, but hash table based) with a concept of score.	2025-04-23 20:13:53 +02:00
Viktor Lofgren	c246a59158	(search) Make it clearer that it's a search engine	2025-04-22 16:03:42 +02:00
Viktor	0b99781d24	Merge pull request #191 from MarginaliaSearch/pdf-support-in-crawler Pdf support in crawler	2025-04-22 15:52:41 +02:00
Viktor Lofgren	39db9620c1	(crawler) Increase maximum permitted file size to 32 MB	2025-04-22 15:51:03 +02:00
Viktor Lofgren	1781599363	(crawler) Add support for crawling PDF files	2025-04-22 15:50:05 +02:00
Viktor Lofgren	6b2d18fb9b	(crawler) Adjust domain limits to be generally more permissive.	2025-04-22 15:27:57 +02:00
Viktor	59b1d200ab	Merge pull request #190 from MarginaliaSearch/download-sample-chores Download sample chores	2025-04-22 13:29:49 +02:00
Viktor Lofgren	897010a2cf	(control) Update download sample data actor with better UI The original implementation didn't really give a lot of feedback about what it was doing. Adding a progress bar to the download step. Relates to issue 189.	2025-04-22 13:27:22 +02:00
Viktor Lofgren	602af7a77e	(control) Update UI with new sample sizes Relates to issue 189.	2025-04-22 13:27:13 +02:00
Viktor Lofgren	a7d91c8527	(crawler) Clean up fetcher detailed logging	2025-04-21 12:53:52 +02:00
Viktor Lofgren	7151602124	(crawler) Reduce the likelihood of crawler tasks locking on domains before they are ready Cleaning up after changes.	2025-04-21 12:47:03 +02:00
Viktor Lofgren	884e33bd4a	(crawler) Reduce the likelihood of crawler tasks locking on domains before they are ready Change back to an unbounded queue, tighten sleep times a bit.	2025-04-21 11:48:15 +02:00
Viktor Lofgren	e84d5c497a	(crawler) Reduce the likelihood of crawler tasks locking on domains before they are ready Change to a bounded queue and adding a sleep to reduce the amount of effectively busy looping threads.	2025-04-21 00:39:26 +02:00
Viktor Lofgren	2d2d3e2466	(crawler) Reduce the likelihood of crawler tasks locking on domains before they are ready Change to a bounded queue and adding a sleep to reduce the amount of effectively busy looping threads.	2025-04-21 00:36:48 +02:00
Viktor Lofgren	647dd9b12f	(crawler) Reduce the likelihood of crawler tasks locking on domains before they are ready	2025-04-21 00:24:30 +02:00
Viktor Lofgren	de4e2849ce	(crawler) Tweak request retry counts Increase the default number of tries to 3, but don't retry on SSL errors as they are unlikely to fix themselves in the short term.	2025-04-19 00:19:48 +02:00
Viktor Lofgren	3c43f1954e	(crawler) Add custom cookie store implementation Apache HttpClient's cookie implementation builds an enormous concurrent hashmap with every cookie for every domain ever crawled. This is a big waste of resources. Replacing it with a fairly crude domain-isolated instance, as we are primarily interested in answering whether a cookie is set, and we will never retain cookies long term.	2025-04-18 13:04:22 +02:00
Viktor Lofgren	fa2462ec39	(crawler) Re-enable aborts on timeout	2025-04-18 12:59:34 +02:00
Viktor Lofgren	f4ad7145db	(crawler) Disable SO_LINGER	2025-04-18 01:42:02 +02:00
Viktor Lofgren	068b450180	(crawler) Temporarily disable request.abort()	2025-04-18 01:25:56 +02:00
Viktor Lofgren	05b909a21f	(crawler) Add logging to get more info about connection leak	2025-04-18 01:06:52 +02:00
Viktor Lofgren	3d179cddce	(crawler) Correctly consume entity in sitemap retrieval	2025-04-18 00:32:21 +02:00
Viktor Lofgren	1a2aae496a	(crawler) Correct handling and abortion of HttpClient's requests There was a resource leak in the initial implementation of the Apache HttpClient WarcInputBuffer that failed to free up resources. Using HttpGet objects instead of the Classic...Request objects, as the latter fail to expose an abort()-method.	2025-04-18 00:16:26 +02:00
Viktor Lofgren	353cdffb3f	(crawler) Increase connection request timeout, restore congestion timeout	2025-04-17 21:32:06 +02:00
Viktor Lofgren	2e3f1313c7	(crawler) Log exceptions while crawling in crawler audit log	2025-04-17 21:18:09 +02:00
Viktor Lofgren	58e6f141ce	(crawler) Reduce congestion throttle go-rate	2025-04-17 20:36:58 +02:00