(search) Remove unused count modifier from the footer help

(search) Add a note for TUI users pointing them to the old UI
(deploy) assistant
2025-10-06 07:32:38 +02:00 · 2025-04-27 12:08:34 +02:00 · 2025-04-27 11:52:07 +02:00 · 2025-04-25 13:25:50 +02:00 · 2025-04-25 13:19:07 +02:00 · 2025-04-24 13:35:29 +02:00
24 changed files with 815 additions and 240 deletions
--- a/code/execution/java/nu/marginalia/actor/task/DownloadSampleActor.java
+++ b/code/execution/java/nu/marginalia/actor/task/DownloadSampleActor.java
@@ -8,6 +8,7 @@ import nu.marginalia.actor.state.ActorResumeBehavior;
 import nu.marginalia.actor.state.ActorStep;
 import nu.marginalia.actor.state.Resume;
 import nu.marginalia.service.control.ServiceEventLog;
 import nu.marginalia.service.control.ServiceHeartbeat;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorage;
 import nu.marginalia.storage.model.FileStorageId;
@@ -19,6 +20,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import java.io.*;
 import java.net.HttpURLConnection;
 import java.net.MalformedURLException;
 import java.net.URI;
 import java.net.URL;
@@ -32,6 +34,7 @@ public class DownloadSampleActor extends RecordActorPrototype {
    private final FileStorageService storageService;
    private final ServiceEventLog eventLog;
    private final ServiceHeartbeat heartbeat;
    private final Logger logger = LoggerFactory.getLogger(getClass());
    @Resume(behavior = ActorResumeBehavior.ERROR)
@@ -66,15 +69,39 @@ public class DownloadSampleActor extends RecordActorPrototype {
                Files.deleteIfExists(Path.of(tarFileName));
-                try (var is = new BufferedInputStream(new URI(downloadURI).toURL().openStream());
+                HttpURLConnection urlConnection = (HttpURLConnection) new URI(downloadURI).toURL().openConnection();
-                     var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
+
-                    is.transferTo(os);
+                try (var hb = heartbeat.createServiceAdHocTaskHeartbeat("Downloading sample")) {
                    long size = urlConnection.getContentLengthLong();
                    byte[] buffer = new byte[8192];
                    try (var is = new BufferedInputStream(urlConnection.getInputStream());
                         var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
                        long copiedSize = 0;
                        while (copiedSize < size) {
                            int read = is.read(buffer);
                            if (read < 0) // We've been promised a file of length 'size'
                                throw new IOException("Unexpected end of stream");
                            os.write(buffer, 0, read);
                            copiedSize += read;
                            // Update progress bar
                            hb.progress(String.format("%d MB", copiedSize / 1024 / 1024), (int) (copiedSize / 1024), (int) (size / 1024));
                        }
                    }
                }
                catch (Exception ex) {
                    eventLog.logEvent(DownloadSampleActor.class, "Error downloading sample");
                    logger.error("Error downloading sample", ex);
                    yield new Error();
                }
                finally {
                    urlConnection.disconnect();
                }
                eventLog.logEvent(DownloadSampleActor.class, "Download complete");
                yield new Extract(fileStorageId, tarFileName);
@@ -170,11 +197,12 @@ public class DownloadSampleActor extends RecordActorPrototype {
    @Inject
    public DownloadSampleActor(Gson gson,
                               FileStorageService storageService,
-                               ServiceEventLog eventLog)
+                               ServiceEventLog eventLog, ServiceHeartbeat heartbeat)
    {
        super(gson);
        this.storageService = storageService;
        this.eventLog = eventLog;
        this.heartbeat = heartbeat;
    }
 }
--- a/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java
+++ b/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java
@@ -229,13 +229,15 @@ public class FeedFetcherService {
                    .timeout(Duration.ofSeconds(15))
                    ;
-            if (ifModifiedSinceDate != null) {
+            // Set the If-Modified-Since or If-None-Match headers if we have them
            // though since there are certain idiosyncrasies in server implementations,
            // we avoid setting both at the same time as that may turn a 304 into a 200.
            if (ifNoneMatchTag != null) {
                requestBuilder.header("If-None-Match", ifNoneMatchTag);
            } else if (ifModifiedSinceDate != null) {
                requestBuilder.header("If-Modified-Since", ifModifiedSinceDate);
            }
            if (ifNoneMatchTag != null) {
                requestBuilder.header("If-None-Match", ifNoneMatchTag);
            }
            HttpRequest getRequest = requestBuilder.build();
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
@@ -43,6 +43,7 @@ import java.nio.file.StandardCopyOption;
 import java.security.Security;
 import java.util.*;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -66,6 +67,8 @@ public class CrawlerMain extends ProcessMainClass {
    private final Map<String, CrawlTask> pendingCrawlTasks = new ConcurrentHashMap<>();
    private final LinkedBlockingQueue<CrawlTask> retryQueue = new LinkedBlockingQueue<>();
    private final AtomicInteger tasksDone = new AtomicInteger(0);
    private final HttpFetcherImpl fetcher;
@@ -277,12 +280,29 @@ public class CrawlerMain extends ProcessMainClass {
            }
             // Schedule viable tasks for execution until list is empty
-            while (!taskList.isEmpty()) {
+            for (int emptyRuns = 0;emptyRuns < 300;) {
-                taskList.removeIf(this::trySubmitDeferredTask);
+                boolean hasTasks = !taskList.isEmpty();
-                // Add a small pause here to avoid busy looping toward the end of the execution cycle when
+                // The order of these checks is very important to avoid a race condition
-                // we might have no new viable tasks to run for hours on end
+                // where we miss a task that is put into the retry queue
-                TimeUnit.MILLISECONDS.sleep(50);
+                boolean hasRunningTasks = pool.getActiveCount() > 0;
                boolean hasRetryTasks = !retryQueue.isEmpty();
                if (hasTasks || hasRetryTasks || hasRunningTasks) {
                    retryQueue.drainTo(taskList);
                    // Try to submit any tasks that are in the retry queue (this will block if the pool is full)
                    taskList.removeIf(this::trySubmitDeferredTask);
                    // Add a small pause here to avoid busy looping toward the end of the execution cycle when
                    // we might have no new viable tasks to run for hours on end
                    TimeUnit.MILLISECONDS.sleep(5);
                } else {
                    // We have no tasks to run, and no tasks in the retry queue
                    // but we wait a bit to see if any new tasks come in via the retry queue
                    emptyRuns++;
                    TimeUnit.SECONDS.sleep(1);
                }
            }
            logger.info("Shutting down the pool, waiting for tasks to complete...");
@@ -414,7 +434,7 @@ public class CrawlerMain extends ProcessMainClass {
        /** Best effort indicator whether we could start this now without getting stuck in
         * DomainLocks purgatory */
        public boolean canRun() {
-            return domainLocks.canLock(new EdgeDomain(domain));
+            return domainLocks.isLockableHint(new EdgeDomain(domain));
        }
        @Override
@@ -425,66 +445,82 @@ public class CrawlerMain extends ProcessMainClass {
                return;
            }
-            Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
+            Optional<DomainLocks.DomainLock> lock = domainLocks.tryLockDomain(new EdgeDomain(domain));
-            Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
+            // We don't have a lock, so we can't run this task
-            Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
+            // we return to avoid blocking the pool for too long
-
+            if (lock.isEmpty()) {
-            // Move the WARC file to a temp file if it exists, so we can resume the crawl using the old data
+                if (retryQueue.remainingCapacity() > 0) {
-            // while writing to the same file name as before
+                    // Sleep a moment to avoid busy looping via the retry queue
-            if (Files.exists(newWarcFile)) {
+                    // in the case when few tasks remain and almost all are ineligible for
-                Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING);
+                    // immediate restart
-            }
+                    Thread.sleep(5);
            else {
                Files.deleteIfExists(tempFile);
            }
            try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
                 var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
                 CrawlDataReference reference = getReference()
            )
            {
                // Resume the crawl if it was aborted
                if (Files.exists(tempFile)) {
                    retriever.syncAbortedRun(tempFile);
                    Files.delete(tempFile);
                }
-                DomainLinks domainLinks = anchorTagsSource.getAnchorTags(domain);
+                retryQueue.put(this);
                return;
            }
            DomainLocks.DomainLock domainLock = lock.get();
-                int size;
+            try (domainLock) {
-                try (var lock = domainLocks.lockDomain(new EdgeDomain(domain))) {
+                Thread.currentThread().setName("crawling:" + domain);
-                    size = retriever.crawlDomain(domainLinks, reference);
+
                Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
                Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
                Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
                // Move the WARC file to a temp file if it exists, so we can resume the crawl using the old data
                // while writing to the same file name as before
                if (Files.exists(newWarcFile)) {
                    Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING);
                }
                else {
                    Files.deleteIfExists(tempFile);
                }
-                // Delete the reference crawl data if it's not the same as the new one
+                try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
-                // (mostly a case when migrating from legacy->warc)
+                     var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
-                reference.delete();
+                     CrawlDataReference reference = getReference())
                {
                    // Resume the crawl if it was aborted
                    if (Files.exists(tempFile)) {
                        retriever.syncAbortedRun(tempFile);
                        Files.delete(tempFile);
                    }
-                // Convert the WARC file to Parquet
+                    DomainLinks domainLinks = anchorTagsSource.getAnchorTags(domain);
                SlopCrawlDataRecord
                        .convertWarc(domain, userAgent, newWarcFile, slopFile);
-                // Optionally archive the WARC file if full retention is enabled,
+                    int size = retriever.crawlDomain(domainLinks, reference);
                // otherwise delete it:
                warcArchiver.consumeWarc(newWarcFile, domain);
-                // Mark the domain as finished in the work log
+                    // Delete the reference crawl data if it's not the same as the new one
-                workLog.setJobToFinished(domain, slopFile.toString(), size);
+                    // (mostly a case when migrating from legacy->warc)
                    reference.delete();
-                // Update the progress bar
+                    // Convert the WARC file to Slop
-                heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks);
+                    SlopCrawlDataRecord
                            .convertWarc(domain, userAgent, newWarcFile, slopFile);
-                logger.info("Fetched {}", domain);
+                    // Optionally archive the WARC file if full retention is enabled,
-            } catch (Exception e) {
+                    // otherwise delete it:
-                logger.error("Error fetching domain " + domain, e);
+                    warcArchiver.consumeWarc(newWarcFile, domain);
            }
            finally {
                // We don't need to double-count these; it's also kept in the workLog
                pendingCrawlTasks.remove(domain);
                Thread.currentThread().setName("[idle]");
-                Files.deleteIfExists(newWarcFile);
+                    // Mark the domain as finished in the work log
-                Files.deleteIfExists(tempFile);
+                    workLog.setJobToFinished(domain, slopFile.toString(), size);
                    // Update the progress bar
                    heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks);
                    logger.info("Fetched {}", domain);
                } catch (Exception e) {
                    logger.error("Error fetching domain " + domain, e);
                }
                finally {
                    // We don't need to double-count these; it's also kept in the workLog
                    pendingCrawlTasks.remove(domain);
                    Thread.currentThread().setName("[idle]");
                    Files.deleteIfExists(newWarcFile);
                    Files.deleteIfExists(tempFile);
                }
            }
        }
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/ContentTags.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/ContentTags.java
@@ -19,11 +19,13 @@ public record ContentTags(String etag, String lastMod) {
    /** Paints the tags onto the request builder. */
    public void paint(HttpGet request) {
        // Paint the ETag header if present,
        // otherwise paint the Last-Modified header
        // (but not both at the same time due to some servers not liking it)
        if (etag != null) {
            request.addHeader("If-None-Match", etag);
-        }
+        } else if (lastMod != null) {
        if (lastMod != null) {
            request.addHeader("If-Modified-Since", lastMod);
        }
    }
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java
@@ -53,6 +53,7 @@ import java.net.SocketTimeoutException;
 import java.net.URISyntaxException;
 import java.security.NoSuchAlgorithmException;
 import java.time.Duration;
 import java.time.Instant;
 import java.util.*;
 import java.util.concurrent.Semaphore;
 import java.util.concurrent.TimeUnit;
@@ -393,25 +394,31 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
            if (probeType == HttpFetcher.ProbeType.FULL) {
                try {
                    var probeResult = probeContentType(url, cookies, timer, contentTags);
-                    logger.info(crawlerAuditMarker, "Probe result {} for {}", probeResult.getClass().getSimpleName(), url);
+
                    switch (probeResult) {
                        case HttpFetcher.ContentTypeProbeResult.NoOp():
                            break; //
                        case HttpFetcher.ContentTypeProbeResult.Ok(EdgeUrl resolvedUrl):
                            logger.info(crawlerAuditMarker, "Probe result OK for {}", url);
                            url = resolvedUrl; // If we were redirected while probing, use the final URL for fetching
                            break;
                        case ContentTypeProbeResult.BadContentType badContentType:
                            warcRecorder.flagAsFailedContentTypeProbe(url, badContentType.contentType(), badContentType.statusCode());
                            logger.info(crawlerAuditMarker, "Probe result Bad ContenType ({}) for {}", badContentType.contentType(), url);
                            return new HttpFetchResult.ResultNone();
                        case ContentTypeProbeResult.BadContentType.Timeout(Exception ex):
                            logger.info(crawlerAuditMarker, "Probe result Timeout for {}", url);
                            warcRecorder.flagAsTimeout(url);
                            return new HttpFetchResult.ResultException(ex);
                        case ContentTypeProbeResult.Exception(Exception ex):
                            logger.info(crawlerAuditMarker, "Probe result Exception({}) for {}", ex.getClass().getSimpleName(), url);
                            warcRecorder.flagAsError(url, ex);
                            return new HttpFetchResult.ResultException(ex);
                        case ContentTypeProbeResult.HttpError httpError:
                            logger.info(crawlerAuditMarker, "Probe result HTTP Error ({}) for {}", httpError.statusCode(), url);
                            return new HttpFetchResult.ResultException(new HttpException("HTTP status code " + httpError.statusCode() + ": " + httpError.message()));
                        case ContentTypeProbeResult.Redirect redirect:
                            logger.info(crawlerAuditMarker, "Probe result redirect for {} -> {}", url, redirect.location());
                            return new HttpFetchResult.ResultRedirect(redirect.location());
                    }
                } catch (Exception ex) {
@@ -430,27 +437,32 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
            contentTags.paint(request);
            try (var sl = new SendLock()) {
                Instant start = Instant.now();
                HttpFetchResult result = warcRecorder.fetch(client, cookies, request);
                Duration fetchDuration = Duration.between(start, Instant.now());
                if (result instanceof HttpFetchResult.ResultOk ok) {
                    if (ok.statusCode() == 304) {
-                        return new HttpFetchResult.Result304Raw();
+                        result = new HttpFetchResult.Result304Raw();
                    }
                }
                switch (result) {
-                    case HttpFetchResult.ResultOk ok -> logger.info(crawlerAuditMarker, "Fetch result OK {} for {}", ok.statusCode(), url);
+                    case HttpFetchResult.ResultOk ok -> logger.info(crawlerAuditMarker, "Fetch result OK {} for {} ({} ms)", ok.statusCode(), url, fetchDuration.toMillis());
                    case HttpFetchResult.ResultRedirect redirect -> logger.info(crawlerAuditMarker, "Fetch result redirect: {}  for {}", redirect.url(), url);
-                    case HttpFetchResult.ResultNone none -> logger.info(crawlerAuditMarker, "Fetch result none  for {}", url);
+                    case HttpFetchResult.ResultNone none -> logger.info(crawlerAuditMarker, "Fetch result none for {}", url);
-                    case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception for " + url + ": {}", ex.ex());
+                    case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception for {}", url, ex.ex());
                    case HttpFetchResult.Result304Raw raw -> logger.info(crawlerAuditMarker, "Fetch result: 304 Raw for {}", url);
                    case HttpFetchResult.Result304ReplacedWithReference ref -> logger.info(crawlerAuditMarker, "Fetch result: 304 With reference for {}", url);
                }
                return result;
            }
        }
        catch (Exception ex) {
-            ex.printStackTrace();
+            logger.error(crawlerAuditMarker, "Fetch result exception for {}", url, ex);
            return new HttpFetchResult.ResultException(ex);
        }
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java
@@ -41,7 +41,7 @@ public class WarcRecorder implements AutoCloseable {
    static final int MAX_TIME = 30_000;
    /** Maximum (decompressed) size we'll save */
-    static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 10 * 1024 * 1024);
+    static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 32 * 1024 * 1024);
    private final WarcWriter writer;
    private final Path warcFile;
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/logic/DomainLocks.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/logic/DomainLocks.java
@@ -3,6 +3,7 @@ package nu.marginalia.crawl.logic;
 import nu.marginalia.model.EdgeDomain;
 import java.util.Map;
 import java.util.Optional;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.Semaphore;
@@ -19,8 +20,22 @@ public class DomainLocks {
     * and may be held by another thread.  The caller is responsible for locking and releasing the lock.
     */
    public DomainLock lockDomain(EdgeDomain domain) throws InterruptedException {
-        return new DomainLock(domain.toString(),
+        var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
-                locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits));
+
        sem.acquire();
        return new DomainLock(sem);
    }
    public Optional<DomainLock> tryLockDomain(EdgeDomain domain) {
        var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
        if (sem.tryAcquire(1)) {
            return Optional.of(new DomainLock(sem));
        }
        else {
            // We don't have a lock, so we return an empty optional
            return Optional.empty();
        }
    }
    private Semaphore defaultPermits(String topDomain) {
@@ -28,23 +43,27 @@ public class DomainLocks {
            return new Semaphore(16);
        if (topDomain.equals("blogspot.com"))
            return new Semaphore(8);
-
+        if (topDomain.equals("tumblr.com"))
            return new Semaphore(8);
        if (topDomain.equals("neocities.org"))
-            return new Semaphore(4);
+            return new Semaphore(8);
        if (topDomain.equals("github.io"))
-            return new Semaphore(4);
+            return new Semaphore(8);
        // Substack really dislikes broad-scale crawlers, so we need to be careful
        // to not get blocked.
        if (topDomain.equals("substack.com")) {
            return new Semaphore(1);
        }
        if (topDomain.endsWith(".edu")) {
            return new Semaphore(1);
        }
        return new Semaphore(2);
    }
-    public boolean canLock(EdgeDomain domain) {
+    /** Returns true if the domain is lockable, i.e. if it is not already locked by another thread.
     * (this is just a hint, and does not guarantee that the domain is actually lockable any time
     * after this method returns true)
     */
    public boolean isLockableHint(EdgeDomain domain) {
        Semaphore sem = locks.get(domain.topDomain.toLowerCase());
        if (null == sem)
            return true;
@@ -53,22 +72,16 @@ public class DomainLocks {
    }
    public static class DomainLock implements AutoCloseable {
        private final String domainName;
        private final Semaphore semaphore;
-        DomainLock(String domainName, Semaphore semaphore) throws InterruptedException {
+        DomainLock(Semaphore semaphore) {
            this.domainName = domainName;
            this.semaphore = semaphore;
            Thread.currentThread().setName("crawling:" + domainName + " [await domain lock]");
            semaphore.acquire();
            Thread.currentThread().setName("crawling:" + domainName);
        }
        @Override
        public void close() throws Exception {
            semaphore.release();
-            Thread.currentThread().setName("crawling:" + domainName + " [wrapping up]");
+            Thread.currentThread().setName("[idle]");
        }
    }
 }
--- a/code/processes/crawling-process/model/java/nu/marginalia/ContentTypes.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/ContentTypes.java
@@ -6,6 +6,7 @@ public class ContentTypes {
    public static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
            "application/xhtml",
            "text/html",
            "application/pdf",
            "image/x-icon",
            "text/plain");
@@ -19,4 +20,9 @@ public class ContentTypes {
        return false;
    }
    public static boolean isBinary(String contentTypeHeader) {
        String lcHeader = contentTypeHeader.toLowerCase();
        return lcHeader.startsWith("application/pdf");
    }
 }
--- a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/SlopSerializableCrawlDataStream.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/SlopSerializableCrawlDataStream.java
@@ -37,8 +37,12 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
            public boolean filter(String url, int status, String contentType) {
                String ctLc = contentType.toLowerCase();
                // Permit all plain text content types
                if (ctLc.startsWith("text/"))
                    return true;
                // PDF
                else if (ctLc.startsWith("application/pdf"))
                    return true;
                else if (ctLc.startsWith("x-marginalia/"))
                    return true;
--- a/code/processes/crawling-process/model/java/nu/marginalia/model/body/ContentTypeLogic.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/model/body/ContentTypeLogic.java
@@ -10,7 +10,7 @@ import java.util.regex.Pattern;
 public class ContentTypeLogic {
-    private static final Predicate<String> probableHtmlPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md)$").asMatchPredicate();
+    private static final Predicate<String> probableGoodPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md|pdf)$").asMatchPredicate();
    private static final Predicate<String> probableBinaryPattern = Pattern.compile("^.*\\.[a-z]+$").asMatchPredicate();
    private static final Set<String> blockedContentTypes = Set.of("text/css", "text/javascript");
    private static final List<String> acceptedContentTypePrefixes = List.of(
@@ -22,6 +22,7 @@ public class ContentTypeLogic {
            "application/rss+xml",
            "application/x-rss+xml",
            "application/rdf+xml",
            "application/pdf",
            "x-rss+xml"
    );
    private boolean allowAllContentTypes = false;
@@ -34,7 +35,7 @@ public class ContentTypeLogic {
    public boolean isUrlLikeBinary(EdgeUrl url) {
        String pathLowerCase = url.path.toLowerCase();
-        if (probableHtmlPattern.test(pathLowerCase))
+        if (probableGoodPattern.test(pathLowerCase))
            return false;
        return probableBinaryPattern.test(pathLowerCase);
--- a/code/processes/crawling-process/model/java/nu/marginalia/slop/SlopCrawlDataRecord.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/slop/SlopCrawlDataRecord.java
@@ -216,6 +216,11 @@ public record SlopCrawlDataRecord(String domain,
            return false;
        }
        // If the format is binary, we don't want to translate it if the response is truncated
        if (response.truncated() != WarcTruncationReason.NOT_TRUNCATED && ContentTypes.isBinary(contentType)) {
            return false;
        }
        return true;
    }
--- a/code/processes/crawling-process/test/nu/marginalia/crawl/fetcher/HttpFetcherImplFetchTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/fetcher/HttpFetcherImplFetchTest.java
@@ -40,6 +40,8 @@ class HttpFetcherImplFetchTest {
    private static EdgeUrl badHttpStatusUrl;
    private static EdgeUrl keepAliveUrl;
    private static EdgeUrl pdfUrl;
    @BeforeAll
    public static void setupAll() throws URISyntaxException {
        wireMockServer =
@@ -133,6 +135,13 @@ class HttpFetcherImplFetchTest {
                        ));
        pdfUrl = new EdgeUrl("http://localhost:18089/test.pdf");
        wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(pdfUrl.path))
                .willReturn(WireMock.aResponse()
                        .withHeader("Content-Type", "application/pdf")
                        .withStatus(200)
                        .withBody("Hello World")));
        wireMockServer.start();
    }
@@ -352,6 +361,14 @@ class HttpFetcherImplFetchTest {
        Assertions.assertTrue(result.isOk());
    }
    @Test
    public void testPdf() {
        var result = fetcher.fetchContent(pdfUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
        Assertions.assertTrue(result.isOk());
    }
    private List<WarcRecord> getWarcRecords() throws IOException {
        List<WarcRecord> records = new ArrayList<>();
--- a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java
@@ -4,9 +4,9 @@ import nu.marginalia.UserAgent;
 import nu.marginalia.crawl.fetcher.ContentTags;
 import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
+import nu.marginalia.slop.SlopCrawlDataRecord;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
 import org.apache.hc.client5.http.classic.HttpClient;
 import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.impl.classic.HttpClients;
@@ -24,13 +24,14 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.security.NoSuchAlgorithmException;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 class WarcRecorderTest {
    Path fileNameWarc;
-    Path fileNameParquet;
+    Path fileNameSlop;
    WarcRecorder client;
    HttpClient httpClient;
@@ -39,7 +40,7 @@ class WarcRecorderTest {
        httpClient = HttpClients.createDefault();
        fileNameWarc = Files.createTempFile("test", ".warc");
-        fileNameParquet = Files.createTempFile("test", ".parquet");
+        fileNameSlop = Files.createTempFile("test", ".slop.zip");
        client = new WarcRecorder(fileNameWarc);
    }
@@ -159,17 +160,28 @@ class WarcRecorderTest {
        client.fetch(httpClient, new DomainCookies(), request3);
-        CrawledDocumentParquetRecordFileWriter.convertWarc(
+        HttpGet request4 = new HttpGet("https://downloads.marginalia.nu/test.pdf");
        request4.addHeader("User-agent", "test.marginalia.nu");
        request4.addHeader("Accept-Encoding", "gzip");
        client.fetch(httpClient, new DomainCookies(), request4);
        SlopCrawlDataRecord.convertWarc(
                "www.marginalia.nu",
                new UserAgent("test", "test"),
                fileNameWarc,
-                fileNameParquet);
+                fileNameSlop);
-        var urls = CrawledDocumentParquetRecordFileReader.stream(fileNameParquet).map(doc -> doc.url).toList();
+        List<String> urls;
-        assertEquals(2, urls.size());
+        try (var stream = SerializableCrawlDataStream.openDataStream(fileNameSlop)) {
            urls = stream.docsAsList().stream().map(doc -> doc.url.toString()).toList();
        }
        assertEquals(3, urls.size());
        assertEquals("https://www.marginalia.nu/", urls.get(0));
        assertEquals("https://www.marginalia.nu/log/", urls.get(1));
        // sanic.jpg gets filtered out for its bad mime type
        assertEquals("https://downloads.marginalia.nu/test.pdf", urls.get(2));
    }
--- a/code/services-application/search-service/resources/jte/part/head.jte
+++ b/code/services-application/search-service/resources/jte/part/head.jte
@@ -26,4 +26,10 @@
    <link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Marginalia">
-</head>
+</head>
 <noscript>
    <h1>Users of text-based browsers</h1>
    <p>Consider using the old interface at <a href="https://old-search.marginalia.nu/">https://old-search.marginalia.nu/</a>,
    as it uses fewer modern CSS tricks, and should work better than the new UI.  It's functionally nearly identical, but just renders it using a different layout.</p>
    <hr>
 </noscript>
--- a/code/services-application/search-service/resources/jte/part/warmup.jte
+++ b/code/services-application/search-service/resources/jte/part/warmup.jte
@@ -1,9 +1,16 @@
 This is a bit of a hack!
 This class exists to let tailwind know we're using these classes even though they aren't visible in the code,
-as we sometimes generate classes from Java code!
+as we sometimes generate classes from Java code or javascript!
 <i class="text-blue-800 bg-blue-50 dark:text-blue-200 dark:bg-blue-950"></i>
 <i class="text-green-800 bg-green-50 dark:text-green-200 dark:bg-green-950"></i>
 <i class="text-purple-800 bg-purple-50 dark:text-purple-200 dark:bg-purple-950"></i>
 <i class="text-blue-950 bg-gray-100 dark:text-blue-50 dark:bg-gray-900"></i>
 <span class="hover:bg-gray-300 "></span>
 <label class="suggestion group block relative">
    <input type="radio" name="suggestion" class="peer hidden" checked>
    <div class="px-4 py-2 cursor-pointer dark:peer-checked:bg-gray-700 dark:hover:bg-gray-700 peer-checked:bg-gray-300 hover:bg-gray-300 w-full">
    </div>
 </label>
--- a/code/services-application/search-service/resources/jte/serp/part/footerHowto.jte
+++ b/code/services-application/search-service/resources/jte/serp/part/footerHowto.jte
@@ -80,10 +80,6 @@
            <tr><td>rank&gt;50</td><td>The ranking of the website is at least 50 in a span of 1 - 255</td></tr>
            <tr><td>rank&lt;50</td><td>The ranking of the website is at most 50 in a span of 1 - 255</td></tr>
            <tr><td>count&gt;10</td><td> The search term must appear in at least 10 results from the domain</td></tr>
            <tr><td>count&lt;10</td><td> The search term must appear in at most 10 results from the domain</td></tr>
            <tr><td>format:html5</td><td>Filter documents using the HTML5 standard. This is typically modern websites.</td></tr>
            <tr><td>format:xhtml</td><td>Filter documents using the XHTML standard</td></tr>
            <tr><td>format:html123</td><td>Filter documents using the HTML standards 1, 2, and 3. This is typically very old websites. </td></tr>
--- a/code/services-application/search-service/resources/jte/serp/part/searchform.jte
+++ b/code/services-application/search-service/resources/jte/serp/part/searchform.jte
@@ -13,7 +13,7 @@
                   class="shadow-inner flex-1 dark:bg-black dark:text-gray-100 bg-gray-50 border dark:border-gray-600 border-gray-300 text-gray-900 text-sm rounded-sm block w-full p-2.5"
                   value="${query}"
                   autofocus
-                   placeholder="Search..."
+                   placeholder="Search the web!"
                   autocomplete="off"
                   name="query"
                   id="searchInput" />
@@ -21,13 +21,13 @@
            <input type="text"
                   class="shadow-inner flex-1 dark:bg-black dark:text-gray-100 bg-gray-50 border dark:border-gray-600 border-gray-300 text-gray-900 text-sm rounded-sm block w-full p-2.5"
                   value="${query}"
-                   placeholder="Search..."
+                   placeholder="Search the web!"
                   autocomplete="off"
                   name="query"
                   id="searchInput" />
        @endif
-        <div id="searchSuggestions" class="text-sm absolute top-2 mt-10 w-96 bg-white dark:bg-black border dark:border-gray-600 border-gray-200 rounded-lg shadow-lg hidden"></div>
+        <div aria-hidden="true" id="searchSuggestions" class="text-sm absolute top-3 mt-10 w-96 bg-white dark:bg-black border dark:border-gray-600 border-gray-300 rounded-lg shadow-lg hidden"></div>
        <button class="px-4 py-2 bg-margeblue text-white ml-2 rounded whitespace-nowrap active:text-slate-200">
            <i class="fas fa-search text-sm sm:mr-3"></i>
--- a/code/services-application/search-service/resources/static/js/typeahead.js
+++ b/code/services-application/search-service/resources/static/js/typeahead.js
@@ -43,13 +43,13 @@ function displaySuggestions(suggestions) {
    }
    suggestionsContainer.innerHTML = suggestions.map((suggestion, index) => `
-                <div 
+        <label class="suggestion group block relative">
-                    class="suggestion px-4 py-2 cursor-pointer hover:bg-gray-100 ${index === selectedIndex ? 'bg-blue-50' : ''}"
+            <input type="radio" name="suggestion" class="peer hidden" ${index === selectedIndex ? 'checked' : ''}>
-                    data-index="${index}"
+            <div class="px-4 py-2 cursor-pointer dark:peer-checked:bg-gray-700 dark:hover:bg-gray-700 peer-checked:bg-gray-300 hover:bg-gray-300 w-full" data-index="${index}">
-                >
+                ${suggestion}
-                    ${suggestion}
+            </div>
-                </div>
+        </label>
-            `).join('');
+    `).join('');
    suggestionsContainer.classList.remove('hidden');
--- a/code/services-core/assistant-service/java/nu/marginalia/assistant/AssistantModule.java
+++ b/code/services-core/assistant-service/java/nu/marginalia/assistant/AssistantModule.java
@@ -10,7 +10,8 @@ import static com.google.inject.name.Names.named;
 public class AssistantModule extends AbstractModule {
    public void configure() {
-        bind(Path.class).annotatedWith(named("suggestions-file")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions.txt"));
+        bind(Path.class).annotatedWith(named("suggestions-file1")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions2.txt.gz"));
        bind(Path.class).annotatedWith(named("suggestions-file2")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions3.txt.gz"));
        bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
    }
--- a/code/services-core/assistant-service/java/nu/marginalia/assistant/suggest/PrefixSearchStructure.java
+++ b/code/services-core/assistant-service/java/nu/marginalia/assistant/suggest/PrefixSearchStructure.java
@@ -0,0 +1,465 @@
 package nu.marginalia.assistant.suggest;
 import gnu.trove.list.array.TIntArrayList;
 import org.jetbrains.annotations.NotNull;
 import java.util.*;
 /** Unhinged data structure for fast prefix searching.
 */
 public class PrefixSearchStructure {
    // Core data structures.
    // A word's ID is its position in `words`; `wordScores` holds that word's score at the same position.
    private final HashMap<String, TIntArrayList> prefixIndex;     // Prefixes of 1-8 chars -> word IDs; also holds "t:"-tagged per-term prefixes
    private final HashMap<String, TIntArrayList> longPrefixIndex; // Prefixes of 9-16 chars -> word IDs
    private final ArrayList<String> words;                        // All words by ID, in insertion order
    private final TIntArrayList wordScores;                       // Scores for all words, indexed by word ID
    // Configuration
    private static final int SHORT_PREFIX_LENGTH = 8;             // Longest prefix stored in prefixIndex
    private static final int MAX_INDEXED_PREFIX_LENGTH = 16;      // Longest prefix stored in longPrefixIndex
    /**
     * @return the number of words that have been inserted into this structure
     */
    public int size() {
        final int wordCount = words.size();
        return wordCount;
    }
    /** Holds a matched word next to its score so a list of matches can be ordered by score. */
    private static class WordScorePair {
        final String word;
        final int score;

        WordScorePair(String matchedWord, int matchedScore) {
            word = matchedWord;
            score = matchedScore;
        }
    }
    /**
     * Creates a new PrefixTrie for typeahead search.
     */
    public PrefixSearchStructure() {
        prefixIndex = new HashMap<>(1024);
        longPrefixIndex = new HashMap<>(1024);
        words = new ArrayList<>(1024);
        wordScores = new TIntArrayList(1024);
    }
    /**
     * Registers {@code wordId} under every indexable prefix of {@code word}.
     * <p>
     * Prefixes of 1 to {@code SHORT_PREFIX_LENGTH} characters go into the short index and
     * prefixes of up to {@code MAX_INDEXED_PREFIX_LENGTH} characters beyond that go into the
     * long index. If the word contains a space, every whitespace-separated term of at least
     * two characters is additionally registered in the short index under the key
     * {@code "t:"} followed by each of its prefixes (again capped at the short prefix length).
     *
     * @param word   the word being indexed
     * @param wordId the ID assigned to the word
     */
    private void indexPrefix(String word, int wordId) {
        final int shortLimit = Math.min(word.length(), SHORT_PREFIX_LENGTH);
        final int longLimit = Math.min(word.length(), MAX_INDEXED_PREFIX_LENGTH);

        // Whole-word prefixes, short range
        for (int end = 1; end <= shortLimit; end++) {
            prefixIndex
                    .computeIfAbsent(word.substring(0, end), key -> new TIntArrayList(16))
                    .add(wordId);
        }

        // Whole-word prefixes, long range
        for (int end = SHORT_PREFIX_LENGTH + 1; end <= longLimit; end++) {
            longPrefixIndex
                    .computeIfAbsent(word.substring(0, end), key -> new TIntArrayList(8))
                    .add(wordId);
        }

        // Single-word entries get no per-term index
        if (!word.contains(" ")) {
            return;
        }

        for (String term : word.split("\\s+")) {
            if (term.length() < 2) {
                continue;
            }
            final int termLimit = Math.min(term.length(), SHORT_PREFIX_LENGTH);
            for (int end = 1; end <= termLimit; end++) {
                prefixIndex
                        .computeIfAbsent("t:" + term.substring(0, end), key -> new TIntArrayList(16))
                        .add(wordId);
            }
        }
    }
    /**
     * Adds a word and its score to the structure and indexes the word's prefixes.
     * A {@code null} or empty word is ignored.
     *
     * @param word  the word to add
     * @param score the score stored alongside the word
     */
    public void insert(String word, int score) {
        final boolean insertable = word != null && !word.isEmpty();
        if (!insertable) {
            return;
        }

        // The next free slot in the word list doubles as the word's ID
        final int assignedId = words.size();
        words.add(word);
        wordScores.add(score);

        indexPrefix(word, assignedId);
    }
    /**
     * Returns up to {@code k} completions for {@code prefix}.
     * <p>
     * Lookup order:
     * <ol>
     *   <li>a {@code null} or empty prefix yields the overall top-scoring words;</li>
     *   <li>prefixes of at most {@code SHORT_PREFIX_LENGTH} characters are looked up directly
     *       in the short index (under a {@code "t:"} key when the query itself started with
     *       {@code "t:"});</li>
     *   <li>longer prefixes are looked up in the long index, truncated to
     *       {@code MAX_INDEXED_PREFIX_LENGTH} characters and filtered by the full prefix
     *       when they exceed it;</li>
     *   <li>on a long-index miss, candidates from the first {@code SHORT_PREFIX_LENGTH}
     *       characters are filtered by the full prefix;</li>
     *   <li>otherwise the scan-based fallback is used.</li>
     * </ol>
     * NOTE(review): the {@code "t:"} term mode only affects the short-prefix lookup; for
     * longer prefixes the marker is stripped and the query is treated as a whole-word
     * prefix. Confirm whether that is intended.
     *
     * @param prefix the text typed so far, optionally starting with {@code "t:"}
     * @param k      the maximum number of completions wanted; values of zero or less
     *               yield an empty list
     * @return the matching suggestions, never {@code null}
     */
    public List<ScoredSuggestion> getTopCompletions(String prefix, int k) {
        // Without this guard a non-positive k crashed in the top-k selection whenever the
        // prefix was present in an index, while other paths quietly returned nothing.
        if (k <= 0) {
            return Collections.emptyList();
        }

        if (prefix == null || prefix.isEmpty()) {
            return getTopKWords(k);
        }

        // "t:" marks a search within the terms of multi-word entries
        final boolean isTermSearch = prefix.startsWith("t:") && prefix.length() > 2;
        if (isTermSearch) {
            prefix = prefix.substring(2);
        }

        // Short prefixes: direct lookup, then straight to the last-resort scan
        if (prefix.length() <= SHORT_PREFIX_LENGTH) {
            TIntArrayList wordIds = prefixIndex.get(isTermSearch ? "t:" + prefix : prefix);
            if (wordIds != null) {
                return getTopKFromWordIds(wordIds, k);
            }
            return findByBinarySearchPrefix(prefix, k);
        }

        // Long prefixes: exact hit in the long index, or truncate and filter
        if (prefix.length() <= MAX_INDEXED_PREFIX_LENGTH) {
            TIntArrayList wordIds = longPrefixIndex.get(prefix);
            if (wordIds != null) {
                return getTopKFromWordIds(wordIds, k);
            }
        } else {
            TIntArrayList candidateIds = longPrefixIndex.get(prefix.substring(0, MAX_INDEXED_PREFIX_LENGTH));
            if (candidateIds != null) {
                return getFilteredTopKFromWordIds(candidateIds, prefix, k);
            }
        }

        // Long-index miss: narrow down via the first SHORT_PREFIX_LENGTH characters
        TIntArrayList candidates = prefixIndex.get(prefix.substring(0, SHORT_PREFIX_LENGTH));
        if (candidates != null) {
            return getFilteredTopKFromWordIds(candidates, prefix, k);
        }

        // Last resort
        return findByBinarySearchPrefix(prefix, k);
    }
    /**
     * Returns the {@code k} highest-scoring words overall, best first.
     * Words with equal scores keep their insertion order.
     */
    private List<ScoredSuggestion> getTopKWords(int k) {
        // Order the word IDs themselves rather than building (score, id) pairs
        Integer[] idsByScore = new Integer[words.size()];
        for (int id = 0; id < idsByScore.length; id++) {
            idsByScore[id] = id;
        }

        // Object-array sort is stable, so ties stay in ascending ID order
        Arrays.sort(idsByScore, (left, right) ->
                Integer.compare(wordScores.get(right.intValue()), wordScores.get(left.intValue())));

        final int limit = Math.min(k, idsByScore.length);
        List<ScoredSuggestion> top = new ArrayList<>();
        for (int rank = 0; rank < limit; rank++) {
            int id = idsByScore[rank];
            top.add(new ScoredSuggestion(words.get(id), wordScores.get(id)));
        }
        return top;
    }
    /**
     * Returns the {@code k} highest-scoring words among the given word IDs, best first.
     * <p>
     * When there are no more than {@code k} IDs, all of them are returned sorted by score.
     * Otherwise a fixed-size, descending-ordered buffer of {@code k} entries is maintained
     * while scanning the IDs, which avoids sorting the whole candidate list.
     *
     * @param wordIds candidate word IDs; may be {@code null} or empty
     * @param k       the maximum number of results; zero or less yields an empty list
     * @return the selected suggestions, never {@code null}
     */
    private List<ScoredSuggestion> getTopKFromWordIds(TIntArrayList wordIds, int k) {
        // A non-positive k used to fall through to the buffer path below, where it caused a
        // negative array size or an out-of-bounds write.
        if (k <= 0 || wordIds == null || wordIds.isEmpty()) {
            return Collections.emptyList();
        }

        int[] ids = wordIds.toArray();

        // For small lists, just sort everything
        if (ids.length <= k) {
            List<ScoredSuggestion> results = new ArrayList<>(ids.length);
            for (int wordId : ids) {
                if (wordId >= 0 && wordId < words.size()) {
                    results.add(new ScoredSuggestion(words.get(wordId), wordScores.get(wordId)));
                }
            }
            results.sort((a, b) -> Integer.compare(b.getScore(), a.getScore()));
            return results;
        }

        // More than k candidates from here on: keep the best k in parallel arrays
        int[] topScores = new int[k];
        int[] topWordIds = new int[k];

        // Seed the buffer with the first k candidates
        for (int i = 0; i < k; i++) {
            int wordId = ids[i];
            if (wordId >= 0 && wordId < words.size()) {
                topWordIds[i] = wordId;
                topScores[i] = wordScores.get(wordId);
            }
        }

        // Order the seed entries by descending score
        for (int i = 0; i < k; i++) {
            for (int j = i + 1; j < k; j++) {
                if (topScores[j] > topScores[i]) {
                    int tempScore = topScores[i];
                    topScores[i] = topScores[j];
                    topScores[j] = tempScore;

                    int tempId = topWordIds[i];
                    topWordIds[i] = topWordIds[j];
                    topWordIds[j] = tempId;
                }
            }
        }

        // Offer every remaining candidate; only strictly better scores displace the tail
        int minScore = topScores[k - 1];
        for (int i = k; i < ids.length; i++) {
            int wordId = ids[i];
            if (wordId < 0 || wordId >= words.size()) {
                continue;
            }

            int score = wordScores.get(wordId);
            if (score <= minScore) {
                continue;
            }

            // Replace the lowest element, then bubble the newcomer up to its place
            topScores[k - 1] = score;
            topWordIds[k - 1] = wordId;
            for (int j = k - 1; j > 0 && topScores[j] > topScores[j - 1]; j--) {
                int tempScore = topScores[j];
                topScores[j] = topScores[j - 1];
                topScores[j - 1] = tempScore;

                int tempId = topWordIds[j];
                topWordIds[j] = topWordIds[j - 1];
                topWordIds[j - 1] = tempId;
            }

            minScore = topScores[k - 1];
        }

        List<ScoredSuggestion> results = new ArrayList<>(k);
        for (int i = 0; i < k; i++) {
            results.add(new ScoredSuggestion(words.get(topWordIds[i]), topScores[i]));
        }
        return results;
    }
    /**
     * Last-resort lookup: finds the words that start with {@code prefix} by scanning the
     * word list, and returns the {@code k} best by score.
     * <p>
     * The method name is historical. The word list is kept in insertion order and is never
     * sorted by this class, so a binary search over it is not valid and could skip real
     * matches; a plain scan is used instead. Dictionaries of at most 1000 words are handed
     * to {@code simpleSearchFallback}.
     *
     * @param prefix the prefix every returned word must start with
     * @param k      the maximum number of results; zero or less yields an empty list for
     *               dictionaries larger than 1000 words
     * @return matching suggestions ordered by descending score
     */
    private List<ScoredSuggestion> findByBinarySearchPrefix(String prefix, int k) {
        // Small dictionaries keep using the dedicated linear fallback
        if (words.size() <= 1000) {
            return simpleSearchFallback(prefix, k);
        }

        // Collect every match, in word ID order
        List<WordScorePair> allMatches = new ArrayList<>();
        for (int i = 0; i < words.size(); i++) {
            String word = words.get(i);
            if (word.startsWith(prefix)) {
                allMatches.add(new WordScorePair(word, wordScores.get(i)));
            }
        }

        // Sort by score and take top k; clamp at zero so a negative k cannot produce a
        // negative list capacity
        allMatches.sort((a, b) -> Integer.compare(b.score, a.score));
        int resultSize = Math.max(0, Math.min(k, allMatches.size()));

        List<ScoredSuggestion> results = new ArrayList<>(resultSize);
        for (int i = 0; i < resultSize; i++) {
            WordScorePair pair = allMatches.get(i);
            results.add(new ScoredSuggestion(pair.word, pair.score));
        }
        return results;
    }
    /**
     * Linear scan over the word list for words starting with {@code prefix}.
     * <p>
     * At most 100 matches (or fewer if the dictionary is smaller) are collected, in word
     * list order, before they are ranked by descending score; the first {@code k} of the
     * ranked matches are returned.
     */
    private List<ScoredSuggestion> simpleSearchFallback(String prefix, int k) {
        // Collection stops once this many matches have been seen
        final int capacity = Math.min(words.size(), 100);
        int[] hitScores = new int[capacity];
        String[] hitWords = new String[hitScores.length];

        int hits = 0;
        int cursor = 0;
        while (cursor < words.size() && hits < capacity) {
            String candidate = words.get(cursor);
            if (candidate.startsWith(prefix)) {
                hitWords[hits] = candidate;
                hitScores[hits] = wordScores.get(cursor);
                hits++;
            }
            cursor++;
        }

        // In-place exchange sort, highest score first
        for (int slot = 0; slot < hits; slot++) {
            for (int probe = slot + 1; probe < hits; probe++) {
                if (hitScores[slot] < hitScores[probe]) {
                    int heldScore = hitScores[slot];
                    String heldWord = hitWords[slot];

                    hitScores[slot] = hitScores[probe];
                    hitWords[slot] = hitWords[probe];

                    hitScores[probe] = heldScore;
                    hitWords[probe] = heldWord;
                }
            }
        }

        final int resultSize = Math.min(k, hits);
        List<ScoredSuggestion> ranked = new ArrayList<>(resultSize);
        for (int pos = 0; pos < resultSize; pos++) {
            ranked.add(new ScoredSuggestion(hitWords[pos], hitScores[pos]));
        }
        return ranked;
    }
    /**
     * Picks the {@code k} best-scoring words among the candidate IDs whose word starts with
     * {@code fullPrefix}.
     * <p>
     * At most 1000 matching candidates are collected (in candidate order) before ranking;
     * only the first {@code k} positions are then put in descending score order.
     */
    private List<ScoredSuggestion> getFilteredTopKFromWordIds(TIntArrayList wordIds, String fullPrefix, int k) {
        if (wordIds == null || wordIds.isEmpty()) {
            return Collections.emptyList();
        }

        final int capacity = Math.min(wordIds.size(), 1000);
        String[] keptWords = new String[capacity];
        int[] keptScores = new int[keptWords.length];
        int kept = 0;

        for (int candidateId : wordIds.toArray()) {
            if (kept >= capacity) {
                break;
            }
            if (candidateId < 0 || candidateId >= words.size()) {
                continue;
            }
            String candidate = words.get(candidateId);
            if (!candidate.startsWith(fullPrefix)) {
                continue;
            }
            keptWords[kept] = candidate;
            keptScores[kept] = wordScores.get(candidateId);
            kept++;
        }

        // Partial selection sort: settle only the leading min(kept, k) positions
        final int settled = Math.min(kept, k);
        for (int slot = 0; slot < settled; slot++) {
            int best = slot;
            for (int probe = slot + 1; probe < kept; probe++) {
                if (keptScores[probe] > keptScores[best]) {
                    best = probe;
                }
            }
            if (best == slot) {
                continue;
            }
            int heldScore = keptScores[slot];
            String heldWord = keptWords[slot];
            keptScores[slot] = keptScores[best];
            keptWords[slot] = keptWords[best];
            keptScores[best] = heldScore;
            keptWords[best] = heldWord;
        }

        final int resultSize = Math.min(k, kept);
        List<ScoredSuggestion> ranked = new ArrayList<>(resultSize);
        for (int pos = 0; pos < resultSize; pos++) {
            ranked.add(new ScoredSuggestion(keptWords[pos], keptScores[pos]));
        }
        return ranked;
    }
    /**
     * A suggested completion paired with its score.
     * The natural ordering compares suggestions by ascending score only.
     */
    public static class ScoredSuggestion implements Comparable<ScoredSuggestion> {
        private final String word;
        private final int score;

        public ScoredSuggestion(String word, int score) {
            this.word = word;
            this.score = score;
        }

        /** @return the suggested word */
        public String getWord() {
            return this.word;
        }

        /** @return the score associated with the word */
        public int getScore() {
            return this.score;
        }

        /** Renders the suggestion as the word followed by its score in parentheses. */
        @Override
        public String toString() {
            StringBuilder text = new StringBuilder();
            text.append(word).append(" (").append(score).append(")");
            return text.toString();
        }

        /** Orders by score, lower scores first. */
        @Override
        public int compareTo(@NotNull PrefixSearchStructure.ScoredSuggestion other) {
            return Integer.compare(score, other.score);
        }
    }
 }
--- a/code/services-core/assistant-service/java/nu/marginalia/assistant/suggest/Suggestions.java
+++ b/code/services-core/assistant-service/java/nu/marginalia/assistant/suggest/Suggestions.java
@@ -2,74 +2,89 @@ package nu.marginalia.assistant.suggest;
 import com.google.inject.Inject;
 import com.google.inject.name.Named;
 import nu.marginalia.functions.math.dict.SpellChecker;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import nu.marginalia.model.crawl.HtmlFeature;
 import org.apache.commons.collections4.trie.PatriciaTrie;
 import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
 import java.util.*;
-import java.util.function.Supplier;
+import java.util.zip.GZIPInputStream;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 public class Suggestions {
-    private PatriciaTrie<String> suggestionsTrie = null;
+    List<PrefixSearchStructure> searchStructures = new ArrayList<>();
-    private TermFrequencyDict termFrequencyDict = null;
+
-    private volatile boolean ready = false;
+    private volatile boolean ready = false;
    private final SpellChecker spellChecker;
    private static final Pattern suggestionPattern = Pattern.compile("^[a-zA-Z0-9]+( [a-zA-Z0-9]+)*$");
    private static final Logger logger = LoggerFactory.getLogger(Suggestions.class);
    private static final int MIN_SUGGEST_LENGTH = 3;
    @Inject
-    public Suggestions(@Named("suggestions-file") Path suggestionsFile,
+    public Suggestions(@Named("suggestions-file1") Path suggestionsFile1,
-                       SpellChecker spellChecker,
+                       @Named("suggestions-file2") Path suggestionsFile2
                       TermFrequencyDict dict
                       ) {
        this.spellChecker = spellChecker;
        Thread.ofPlatform().start(() -> {
-            suggestionsTrie = loadSuggestions(suggestionsFile);
+            searchStructures.add(loadSuggestions(suggestionsFile1));
-            termFrequencyDict = dict;
+            searchStructures.add(loadSuggestions(suggestionsFile2));
            ready = true;
-            logger.info("Loaded {} suggestions", suggestionsTrie.size());
+            logger.info("Loaded suggestions");
        });
    }
-    private static PatriciaTrie<String> loadSuggestions(Path file) {
+    private static PrefixSearchStructure loadSuggestions(Path file) {
        PrefixSearchStructure ret = new PrefixSearchStructure();
        if (!Files.exists(file)) {
            logger.error("Suggestions file {} absent, loading empty suggestions db", file);
-            return new PatriciaTrie<>();
+            return ret;
        }
        try (var lines = Files.lines(file)) {
            var ret = new PatriciaTrie<String>();
-            lines.filter(suggestionPattern.asPredicate())
+        try (var scanner = new Scanner(new GZIPInputStream(new BufferedInputStream(Files.newInputStream(file, StandardOpenOption.READ))))) {
-                    .filter(line -> line.length()<32)
+            while (scanner.hasNextLine()) {
-                    .map(String::toLowerCase)
+                String line = scanner.nextLine().trim();
-                    .forEach(w -> ret.put(w, w));
+                String[] parts = StringUtils.split(line, " ,", 2);
                if (parts.length != 2) {
                    logger.warn("Invalid suggestion line: {}", line);
                    continue;
                }
                int cnt = Integer.parseInt(parts[0]);
                if (cnt > 1) {
                    String word = parts[1];
-            // Add special keywords to the suggestions
+                    // Remove quotes and trailing periods if this is a CSV
-            for (var feature : HtmlFeature.values()) {
+                    if (word.startsWith("\"") && word.endsWith("\"")) {
-                String keyword = feature.getKeyword();
+                        word = word.substring(1, word.length() - 1);
                    }
-                ret.put(keyword, keyword);
+                    // Remove trailing periods
-                ret.put("-" + keyword, "-" + keyword);
+                    while (word.endsWith(".")) {
                        word = word.substring(0, word.length() - 1);
                    }
                    // Remove junk items we may have gotten from link extraction
                    if (word.startsWith("click here"))
                        continue;
                    if (word.contains("new window"))
                        continue;
                    if (word.contains("click to"))
                        continue;
                    if (word.startsWith("share "))
                        continue;
                    if (word.length() > 3) {
                        ret.insert(word, cnt);
                    }
                }
            }
            return ret;
        }
        catch (IOException ex) {
            logger.error("Failed to load suggestions file", ex);
-            return new PatriciaTrie<>();
+            return new PrefixSearchStructure();
        }
    }
@@ -83,96 +98,36 @@ public class Suggestions {
        searchWord = StringUtils.stripStart(searchWord.toLowerCase(), " ");
-        return Stream.of(
+        return getSuggestionsForKeyword(count, searchWord);
                    new SuggestionStream("", getSuggestionsForKeyword(count, searchWord)),
                    suggestionsForLastWord(count, searchWord),
                    spellCheckStream(searchWord)
                )
                .flatMap(SuggestionsStreamable::stream)
                .limit(count)
                .collect(Collectors.toList());
    }
-    private SuggestionsStreamable suggestionsForLastWord(int count, String searchWord) {
+    public List<String> getSuggestionsForKeyword(int count, String prefix) {
        int sp = searchWord.lastIndexOf(' ');
        if (sp < 0) {
            return Stream::empty;
        }
        String prefixString = searchWord.substring(0, sp+1);
        String suggestString = searchWord.substring(sp+1);
        return new SuggestionStream(prefixString, getSuggestionsForKeyword(count, suggestString));
    }
    /**
     * Builds a suggestion source holding spelling corrections for the last
     * space-separated token of {@code word}.
     *
     * <p>Everything up to and including the final space is kept as a prefix and
     * prepended to each correction. If the last token is shorter than
     * {@code MIN_SUGGEST_LENGTH}, the returned source yields an empty stream.
     *
     * @param word the full search string typed so far
     * @return a streamable source of prefixed spelling corrections
     */
    private SuggestionsStreamable spellCheckStream(String word) {
        // lastIndexOf returns -1 when there is no space, so splitAt becomes 0:
        // the carried prefix is then empty and the whole input is the token.
        final int splitAt = word.lastIndexOf(' ') + 1;
        final String leading = word.substring(0, splitAt);
        final String lastToken = word.substring(splitAt);

        if (lastToken.length() < MIN_SUGGEST_LENGTH) {
            return Stream::empty;
        }

        // Wrapping the supplier in a one-element stream defers the call to the
        // spell checker until the resulting stream is actually consumed.
        Supplier<Stream<String>> deferredCorrections = () -> spellChecker.correct(lastToken).stream();
        return new SuggestionStream(leading, Stream.of(deferredCorrections).flatMap(Supplier::get));
    }
    public Stream<String> getSuggestionsForKeyword(int count, String prefix) {
        if (!ready)
-            return Stream.empty();
+            return List.of();
        if (prefix.length() < MIN_SUGGEST_LENGTH) {
-            return Stream.empty();
+            return List.of();
        }
-        var start = suggestionsTrie.select(prefix);
+        List<PrefixSearchStructure.ScoredSuggestion> resultsAll = new ArrayList<>();
-        if (start == null) {
+        for (var searchStructure : searchStructures) {
-            return Stream.empty();
+            resultsAll.addAll(searchStructure.getTopCompletions(prefix, count));
        }
        resultsAll.sort(Comparator.reverseOrder());
        List<String> ret = new ArrayList<>(count);
        Set<String> seen = new HashSet<>();
        for (var result : resultsAll) {
            if (seen.add(result.getWord())) {
                ret.add(result.getWord());
            }
            if (ret.size() >= count) {
                break;
            }
        }
-        if (!start.getKey().startsWith(prefix)) {
+        return ret;
            return Stream.empty();
        }
        SuggestionsValueCalculator sv = new SuggestionsValueCalculator();
        return Stream.iterate(start.getKey(), Objects::nonNull, suggestionsTrie::nextKey)
                .takeWhile(s -> s.startsWith(prefix))
                .limit(256)
                .sorted(Comparator.comparing(sv::get).thenComparing(String::length).thenComparing(Comparator.naturalOrder()))
                .limit(count);
    }
    private record SuggestionStream(String prefix, Stream<String> suggestionStream) implements SuggestionsStreamable {
        public Stream<String> stream() {
            return suggestionStream.map(s -> prefix + s);
        }
    }
    /** A source that produces a stream of suggestion strings on request. */
    @FunctionalInterface
    interface SuggestionsStreamable {
        Stream<String> stream();
    }
    /**
     * Computes a sort value for a suggestion from its term frequency,
     * remembering the hash of every string it has already seen.
     *
     * <p>Kept as an inner (non-static) class because it reads the enclosing
     * instance's {@code termFrequencyDict}.
     */
    private class SuggestionsValueCalculator {
        private final Map<String, Long> hashCache = new HashMap<>(512);

        /**
         * Returns the negated term frequency of {@code s}.
         *
         * @param s the suggestion to score
         * @return the term frequency reported by the dictionary, negated
         */
        public int get(String s) {
            // Hash each distinct string once; later lookups reuse the cached value.
            final long termHash = hashCache.computeIfAbsent(s, TermFrequencyDict::getStringHash);

            // NOTE(review): presumably negated so an ascending sort lists the most
            // frequent terms first; confirm against the comparator that uses this.
            return -termFrequencyDict.getTermFreqHash(termHash);
        }
    }
 }
--- a/code/services-core/control-service/java/nu/marginalia/control/ControlMain.java
+++ b/code/services-core/control-service/java/nu/marginalia/control/ControlMain.java
@@ -59,9 +59,14 @@ public class ControlMain extends MainClass {
            download(adblockFile, new URI("https://downloads.marginalia.nu/data/adblock.txt"));
        }
-        Path suggestionsFile = dataPath.resolve("suggestions.txt");
+        Path suggestionsFile = dataPath.resolve("suggestions2.txt.gz");
        if (!Files.exists(suggestionsFile)) {
-            downloadGzipped(suggestionsFile, new URI("https://downloads.marginalia.nu/data/suggestions.txt.gz"));
+            download(suggestionsFile, new URI("https://downloads.marginalia.nu/data/suggestions2.txt.gz"));
        }
        Path altSuggestionsFile = dataPath.resolve("suggestions3.txt.gz");
        if (!Files.exists(altSuggestionsFile)) {
            download(altSuggestionsFile, new URI("https://downloads.marginalia.nu/data/suggestions3.txt.gz"));
        }
        Path asnRawData = dataPath.resolve("asn-data-raw-table");
--- a/code/services-core/control-service/resources/templates/control/node/actions/partial-download-sample-data.hdb
+++ b/code/services-core/control-service/resources/templates/control/node/actions/partial-download-sample-data.hdb
@@ -24,25 +24,25 @@ This is a sample of real crawl data.  It is intended for demo, testing and devel
    <tr>
        <td><input id="sample-s" value="sample-s" name="sample" class="form-check-input" type="radio"></td>
        <td><label for="sample-s">Small</label></td>
-        <td>1000 Domains. About 2 GB. </td>
+        <td>1000 Domains. About 1 GB. </td>
    </tr>
    <tr>
        <td><input id="sample-m" value="sample-m" name="sample" class="form-check-input" type="radio"></td>
        <td><label for="sample-m">Medium</label></td>
-        <td>2000 Domains. About 6 GB. Recommended.</td>
+        <td>2000 Domains. About 2 GB. Recommended.</td>
    </tr>
    <tr>
        <td><input id="sample-l" value="sample-l" name="sample" class="form-check-input" type="radio"></td>
        <td><label for="sample-l">Large</label></td>
-        <td>5000 Domains.  About 20 GB.</td>
+        <td>5000 Domains.  About 7 GB.</td>
    </tr>
    <tr>
        <td><input id="sample-xl" value="sample-xl" name="sample" class="form-check-input" type="radio"></td>
        <td><label for="sample-xl">Huge</label></td>
-        <td>50,000 Domains.  Around 180 GB.  Primarily intended for pre-production like testing environments.
+        <td>50,000 Domains.  Around 80 GB.  Primarily intended for pre-production like testing environments.
            Expect hours of processing time. </td>
    </tr>
 </table>
--- a/deploy.txt
+++ b/deploy.txt
@@ -1,4 +1,6 @@
-## This is a token file for automatic deployment
+## This is a token file for triggering automatic deployment when no commit is made.
 2025-01-08:  Deploy executor.
-2025-01-07:  Deploy executor.
+2025-01-07:  Deploy executor.
 2025-04-24:  Deploy executor.
 2025-04-24:  Deploy assistant.
Author	SHA1	Message	Date
Viktor Lofgren	8da74484f0	(search) Remove unused count modifier from the footer help	2025-04-27 12:08:34 +02:00
Viktor Lofgren	923d5a7234	(search) Add a note for TUI users pointing them to the old UI	2025-04-27 11:52:07 +02:00
Viktor Lofgren	58f88749b8	(deploy) assistant	2025-04-25 13:25:50 +02:00
Viktor Lofgren	77f727a5ba	(crawler) Alter conditional request logic to avoid sending both If-None-Match and If-Modified-Since It seems like some servers dislike this combination, and may turn a 304 into a 200.	2025-04-25 13:19:07 +02:00
Viktor Lofgren	667cfb53dc	(assistant) Remove more link text junk from suggestions at loadtime.	2025-04-24 13:35:29 +02:00
Viktor Lofgren	fe36d4ed20	(deploy) Executor services	2025-04-24 13:23:51 +02:00
Viktor Lofgren	acf4bef98d	(assistant) Improve search suggestions Improve suggestions by loading a secondary suggestions set with link text data.	2025-04-24 13:10:59 +02:00
Viktor Lofgren	2a737c34bb	(search) Improve suggestions UX Fix the highlight colors when arrowing through search suggestions. Also fix the suggestions box for dark mode.	2025-04-24 12:34:05 +02:00
Viktor Lofgren	90a577af82	(search) Improve suggestions UX	2025-04-24 00:32:25 +02:00
Viktor	f0c9b935d8	Merge pull request #192 from MarginaliaSearch/improve-suggestions Improve typeahead suggestions	2025-04-23 20:17:49 +02:00
Viktor Lofgren	7b5493dd51	(assistant) Improve typeahead suggestions Implement a new prefix search structure (not a trie, but hash table based) with a concept of score.	2025-04-23 20:13:53 +02:00
Viktor Lofgren	c246a59158	(search) Make it clearer that it's a search engine	2025-04-22 16:03:42 +02:00
Viktor	0b99781d24	Merge pull request #191 from MarginaliaSearch/pdf-support-in-crawler Pdf support in crawler	2025-04-22 15:52:41 +02:00
Viktor Lofgren	39db9620c1	(crawler) Increase maximum permitted file size to 32 MB	2025-04-22 15:51:03 +02:00
Viktor Lofgren	1781599363	(crawler) Add support for crawling PDF files	2025-04-22 15:50:05 +02:00
Viktor Lofgren	6b2d18fb9b	(crawler) Adjust domain limits to be generally more permissive.	2025-04-22 15:27:57 +02:00
Viktor	59b1d200ab	Merge pull request #190 from MarginaliaSearch/download-sample-chores Download sample chores	2025-04-22 13:29:49 +02:00
Viktor Lofgren	897010a2cf	(control) Update download sample data actor with better UI The original implementation didn't really give a lot of feedback about what it was doing. Adding a progress bar to the download step. Relates to issue 189.	2025-04-22 13:27:22 +02:00
Viktor Lofgren	602af7a77e	(control) Update UI with new sample sizes Relates to issue 189.	2025-04-22 13:27:13 +02:00
Viktor Lofgren	a7d91c8527	(crawler) Clean up fetcher detailed logging	2025-04-21 12:53:52 +02:00
Viktor Lofgren	7151602124	(crawler) Reduce the likelihood of crawler tasks locking on domains before they are ready Cleaning up after changes.	2025-04-21 12:47:03 +02:00
Viktor Lofgren	884e33bd4a	(crawler) Reduce the likelihood of crawler tasks locking on domains before they are ready Change back to an unbounded queue, tighten sleep times a bit.	2025-04-21 11:48:15 +02:00
Viktor Lofgren	e84d5c497a	(crawler) Reduce the likelihood of crawler tasks locking on domains before they are ready Change to a bounded queue and adding a sleep to reduce the amount of effectively busy looping threads.	2025-04-21 00:39:26 +02:00
Viktor Lofgren	2d2d3e2466	(crawler) Reduce the likelihood of crawler tasks locking on domains before they are ready Change to a bounded queue and adding a sleep to reduce the amount of effectively busy looping threads.	2025-04-21 00:36:48 +02:00
Viktor Lofgren	647dd9b12f	(crawler) Reduce the likelihood of crawler tasks locking on domains before they are ready	2025-04-21 00:24:30 +02:00