Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-10-06 07:32:38 +02:00)

Compare commits: deploy-013 ... deploy-019 (101 commits)
Commits (SHA1; author and date columns not shown):
97a6780ea3, eb634beec8, 269ebd1654, 39ce40bfeb, c187b2e1c1, 42eaa4588b, 4f40a5fbeb, 3f3d42bc01,
61c8d53e1b, a7a3d85be9, 306232fb54, 5aef844f0d, d56b5c828a, ab58a4636f, 00be269238, 879e6a9424,
fba3455732, 14283da7f5, 93df4d1fc0, b12a0b998c, 3b6f4e321b, 8428111771, e9fd4415ef, 4c95c3dcad,
c5281536fb, 4431dae7ac, 4df4d0a7a8, 9f05083b94, fc92e9b9c0, 328fb5d927, 36889950e8, c96a94878b,
1c57d7d73a, a443d22356, aa59d4afa4, df0f18d0e7, 0819d46f97, 5e2b63473e, f9590703f1, f12fc11337,
c309030184, fd5af01629, d4c43c7a79, 18700e1919, 120b431998, 71dad99326, c1e8afdf86, fa32dddc24,
a266fcbf30, 6e47e58e0e, 9dc43d8b4a, 83967e3305, 4db980a291, 089b177868, 9c8e9a68d5, 413d5cc788,
58539b92ac, fe72f16df1, b49a244a2e, 3f0b4c010f, c6e0cd93f7, 80a7ccb080, 54dec347c4, d6ee3f0785,
8be88afcf3, 0e3c00d3e1, 4279a7f1aa, 251006d4f9, c3e99dc12a, aaaa2de022, fc1388422a, b07080db16,
e9d86dca4a, 1d693f0efa, 5874a163dc, 5ec7a1deab, 7fea2808ed, 8da74484f0, 923d5a7234, 58f88749b8,
77f727a5ba, 667cfb53dc, fe36d4ed20, acf4bef98d, 2a737c34bb, 90a577af82, f0c9b935d8, 7b5493dd51,
c246a59158, 0b99781d24, 39db9620c1, 1781599363, 6b2d18fb9b, 59b1d200ab, 897010a2cf, 602af7a77e,
a7d91c8527, 7151602124, 884e33bd4a, e84d5c497a, 2d2d3e2466
@@ -5,7 +5,7 @@ plugins {
    // This is a workaround for a bug in the Jib plugin that causes it to stall randomly
    // https://github.com/GoogleContainerTools/jib/issues/3347
    id 'com.google.cloud.tools.jib' version '3.4.4' apply(false)
    id 'com.google.cloud.tools.jib' version '3.4.5' apply(false)
}

group 'marginalia'
@@ -47,7 +47,7 @@ ext {
    dockerImageBase='container-registry.oracle.com/graalvm/jdk:24'
    dockerImageTag='latest'
    dockerImageRegistry='marginalia'
    jibVersion = '3.4.4'
    jibVersion = '3.4.5'
}

idea {
@@ -0,0 +1,24 @@
package nu.marginalia.model;

public enum DocumentFormat {
    PLAIN(0, 1, "text"),
    PDF(0, 1, "pdf"),
    UNKNOWN(0, 1, "???"),
    HTML123(0, 1, "html"),
    HTML4(-0.1, 1.05, "html"),
    XHTML(-0.1, 1.05, "html"),
    HTML5(0.5, 1.1, "html");

    /** Used to tune quality score */
    public final double offset;
    /** Used to tune quality score */
    public final double scale;
    public final String shortFormat;

    DocumentFormat(double offset, double scale, String shortFormat) {
        this.offset = offset;
        this.scale = scale;
        this.shortFormat = shortFormat;
    }

}
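Note: offset and scale are declared here, but the enum itself does not show how they feed into document scoring. A rough sketch of how such factors could be applied, purely for illustration (the helper name and the combining formula are assumptions, not the project's actual valuation code):

    // Hypothetical sketch; the real scoring lives elsewhere in the converter (e.g. DocumentValuator).
    static double applyFormatBias(DocumentFormat format, double rawQuality) {
        // With these constants, HTML5 (offset 0.5, scale 1.1) fares slightly better than HTML4 (-0.1, 1.05).
        return rawQuality * format.scale + format.offset;
    }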
@@ -1,16 +1,14 @@
package nu.marginalia.model;

import nu.marginalia.util.QueryParams;
import org.apache.commons.lang3.StringUtils;

import javax.annotation.Nullable;
import java.io.Serializable;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.*;
import java.nio.charset.StandardCharsets;
import java.util.Objects;
import java.util.Optional;
import java.util.regex.Pattern;

public class EdgeUrl implements Serializable {
    public final String proto;
@@ -33,7 +31,7 @@ public class EdgeUrl implements Serializable {

    private static URI parseURI(String url) throws URISyntaxException {
        try {
            return new URI(urlencodeFixer(url));
            return EdgeUriFactory.parseURILenient(url);
        } catch (URISyntaxException ex) {
            throw new URISyntaxException("Failed to parse URI '" + url + "'", ex.getMessage());
        }
@@ -51,58 +49,6 @@ public class EdgeUrl implements Serializable {
        }
    }

    private static Pattern badCharPattern = Pattern.compile("[ \t\n\"<>\\[\\]()',|]");

    /* Java's URI parser is a bit too strict in throwing exceptions when there's an error.

       Here on the Internet, standards are like the picture on the box of the frozen pizza,
       and what you get is more like what's on the inside, we try to patch things instead,
       just give it a best-effort attempt att cleaning out broken or unnecessary constructions
       like bad or missing URLEncoding
     */
    public static String urlencodeFixer(String url) throws URISyntaxException {
        var s = new StringBuilder();
        String goodChars = "&.?:/-;+$#";
        String hexChars = "0123456789abcdefABCDEF";

        int pathIdx = findPathIdx(url);
        if (pathIdx < 0) { // url looks like http://marginalia.nu
            return url + "/";
        }
        s.append(url, 0, pathIdx);

        // We don't want the fragment, and multiple fragments breaks the Java URIParser for some reason
        int end = url.indexOf("#");
        if (end < 0) end = url.length();

        for (int i = pathIdx; i < end; i++) {
            int c = url.charAt(i);

            if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
                s.appendCodePoint(c);
            } else if (c == '%' && i + 2 < end) {
                int cn = url.charAt(i + 1);
                int cnn = url.charAt(i + 2);
                if (hexChars.indexOf(cn) >= 0 && hexChars.indexOf(cnn) >= 0) {
                    s.appendCodePoint(c);
                } else {
                    s.append("%25");
                }
            } else {
                s.append(String.format("%%%02X", c));
            }
        }

        return s.toString();
    }

    private static int findPathIdx(String url) throws URISyntaxException {
        int colonIdx = url.indexOf(':');
        if (colonIdx < 0 || colonIdx + 2 >= url.length()) {
            throw new URISyntaxException(url, "Lacking protocol");
        }
        return url.indexOf('/', colonIdx + 2);
    }

    public EdgeUrl(URI URI) {
        try {
@@ -166,11 +112,32 @@ public class EdgeUrl implements Serializable {
            sb.append(port);
        }

        EdgeUriFactory.urlencodePath(sb, path);

        if (param != null) {
            EdgeUriFactory.urlencodeQuery(sb, param);
        }

        return sb.toString();
    }

    public String toDisplayString() {
        StringBuilder sb = new StringBuilder(256);

        sb.append(proto);
        sb.append("://");
        sb.append(domain);

        if (port != null) {
            sb.append(':');
            sb.append(port);
        }

        sb.append(path);

        if (param != null) {
            sb.append('?');
            sb.append(param);
            sb.append('?').append(param);
        }

        return sb.toString();
@@ -247,3 +214,244 @@ public class EdgeUrl implements Serializable {
    }

}

class EdgeUriFactory {
    public static URI parseURILenient(String url) throws URISyntaxException {

        if (shouldOmitUrlencodeRepair(url)) {
            try {
                return new URI(url);
            }
            catch (URISyntaxException ex) {
                // ignore and run the lenient parser
            }
        }

        var s = new StringBuilder(url.length()+8);

        int pathIdx = findPathIdx(url);
        if (pathIdx < 0) { // url looks like http://marginalia.nu
            return new URI(url + "/");
        }
        s.append(url, 0, pathIdx);

        // We don't want the fragment, and multiple fragments breaks the Java URIParser for some reason
        int end = url.indexOf("#");
        if (end < 0) end = url.length();

        int queryIdx = url.indexOf('?');
        if (queryIdx < 0) queryIdx = end;

        urlencodePath(s, url.substring(pathIdx, queryIdx));
        if (queryIdx < end) {
            urlencodeQuery(s, url.substring(queryIdx + 1, end));
        }
        return new URI(s.toString());
    }

    /** Break apart the path element of an URI into its components, and then
     * urlencode any component that needs it, and recombine it into a single
     * path element again.
     */
    public static void urlencodePath(StringBuilder sb, String path) {
        if (path == null || path.isEmpty()) {
            return;
        }

        String[] pathParts = StringUtils.split(path, '/');
        if (pathParts.length == 0) {
            sb.append('/');
            return;
        }

        boolean shouldUrlEncode = false;
        for (String pathPart : pathParts) {
            if (pathPart.isEmpty()) continue;

            if (needsUrlEncode(pathPart)) {
                shouldUrlEncode = true;
                break;
            }
        }

        for (String pathPart : pathParts) {
            if (pathPart.isEmpty()) continue;

            if (shouldUrlEncode) {
                sb.append('/');
                sb.append(URLEncoder.encode(pathPart, StandardCharsets.UTF_8).replace("+", "%20"));
            } else {
                sb.append('/');
                sb.append(pathPart);
            }
        }

        if (path.endsWith("/")) {
            sb.append('/');
        }

    }

    /** Break apart the query element of a URI into its components, and then
     * urlencode any component that needs it, and recombine it into a single
     * query element again.
     */
    public static void urlencodeQuery(StringBuilder sb, String param) {
        if (param == null || param.isEmpty()) {
            return;
        }

        String[] queryParts = StringUtils.split(param, '&');

        boolean shouldUrlEncode = false;
        for (String queryPart : queryParts) {
            if (queryPart.isEmpty()) continue;

            if (needsUrlEncode(queryPart)) {
                shouldUrlEncode = true;
                break;
            }
        }

        boolean first = true;
        for (String queryPart : queryParts) {
            if (queryPart.isEmpty()) continue;

            if (first) {
                sb.append('?');
                first = false;
            } else {
                sb.append('&');
            }

            if (shouldUrlEncode) {
                int idx = queryPart.indexOf('=');
                if (idx < 0) {
                    sb.append(URLEncoder.encode(queryPart, StandardCharsets.UTF_8));
                } else {
                    sb.append(URLEncoder.encode(queryPart.substring(0, idx), StandardCharsets.UTF_8));
                    sb.append('=');
                    sb.append(URLEncoder.encode(queryPart.substring(idx + 1), StandardCharsets.UTF_8));
                }
            } else {
                sb.append(queryPart);
            }
        }
    }

    /** Test if the url element needs URL encoding.
     * <p></p>
     * Note we may have been given an already encoded path element,
     * so we include % and + in the list of good characters
     */
    static boolean needsUrlEncode(String urlElement) {
        for (int i = 0; i < urlElement.length(); i++) {
            char c = urlElement.charAt(i);

            if (isUrlSafe(c)) continue;
            if ("+".indexOf(c) >= 0) continue;
            if (c == '%' && i + 2 < urlElement.length()) {
                char c1 = urlElement.charAt(i + 1);
                char c2 = urlElement.charAt(i + 2);
                if (isHexDigit(c1) && isHexDigit(c2)) {
                    i += 2;
                    continue;
                }
            }

            return true;
        }

        return false;
    }

    static boolean isUrlSafe(int c) {
        if (c >= 'a' && c <= 'z') return true;
        if (c >= 'A' && c <= 'Z') return true;
        if (c >= '0' && c <= '9') return true;
        if (c == '-' || c == '_' || c == '.' || c == '~') return true;

        return false;
    }

    /** Test if the URL is a valid URL that does not need to be
     * urlencoded.
     * <p></p>
     * This is a very simple heuristic test that does not guarantee
     * that the URL is valid, but it will identify cases where we
     * are fairly certain that the URL does not need encoding,
     * so we can skip a bunch of allocations and string operations
     * that would otherwise be needed to fix the URL.
     */
    static boolean shouldOmitUrlencodeRepair(String url) {
        int idx = 0;
        final int len = url.length();

        // Validate the scheme
        while (idx < len - 2) {
            char c = url.charAt(idx++);
            if (c == ':') break;
            if (!isAsciiAlphabetic(c)) return false;
        }
        if (url.charAt(idx++) != '/') return false;
        if (url.charAt(idx++) != '/') return false;

        // Validate the authority
        while (idx < len) {
            char c = url.charAt(idx++);
            if (c == '/') break;
            if (c == ':') continue;
            if (c == '@') continue;
            if (!isUrlSafe(c)) return false;
        }

        // Validate the path
        if (idx >= len) return true;

        while (idx < len) {
            char c = url.charAt(idx++);
            if (c == '?') break;
            if (c == '/') continue;
            if (c == '#') return true;
            if (!isUrlSafe(c)) return false;
        }

        if (idx >= len) return true;

        // Validate the query
        while (idx < len) {
            char c = url.charAt(idx++);
            if (c == '&') continue;
            if (c == '=') continue;
            if (c == '#') return true;
            if (!isUrlSafe(c)) return false;
        }

        return true;
    }

    private static boolean isAsciiAlphabetic(int c) {
        return (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
    }

    private static boolean isHexDigit(int c) {
        return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
    }

    /** Find the index of the path element in a URL.
     * <p></p>
     * The path element starts after the scheme and authority part of the URL,
     * which is everything up to and including the first slash after the colon.
     */
    private static int findPathIdx(String url) throws URISyntaxException {
        int colonIdx = url.indexOf(':');
        if (colonIdx < 0 || colonIdx + 3 >= url.length()) {
            throw new URISyntaxException(url, "Lacking scheme");
        }
        return url.indexOf('/', colonIdx + 3);
    }

}
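EdgeUriFactory first runs shouldOmitUrlencodeRepair() as a cheap fast path, and only rebuilds the path and query when some component actually needs encoding. A minimal usage sketch, with the expected output taken from the new EdgeUrlTest assertions further down (the surrounding driver code is illustrative):

    // Fast path: every character is already URL-safe, so the string parses as-is.
    URI ok = EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/");

    // Repair path: the unencoded path segment is percent-encoded component by component.
    URI fixed = EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi");
    // fixed.toString() yields "https://en.wikipedia.org/wiki/S%C3%A1mi"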
@@ -28,6 +28,8 @@ public enum HtmlFeature {
    GA_SPAM("special:gaspam"),

    PDF("format:pdf"),

    /** For fingerprinting and ranking */
    OPENGRAPH("special:opengraph"),
    OPENGRAPH_IMAGE("special:opengraph:image"),
@@ -1,22 +0,0 @@
package nu.marginalia.model.html;

// This class really doesn't belong anywhere, but will squat here for now
public enum HtmlStandard {
    PLAIN(0, 1),
    UNKNOWN(0, 1),
    HTML123(0, 1),
    HTML4(-0.1, 1.05),
    XHTML(-0.1, 1.05),
    HTML5(0.5, 1.1);

    /** Used to tune quality score */
    public final double offset;
    /** Used to tune quality score */
    public final double scale;

    HtmlStandard(double offset, double scale) {
        this.offset = offset;
        this.scale = scale;
    }

}
@@ -9,7 +9,7 @@ public enum DocumentFlags {
    GeneratorForum,
    GeneratorWiki,
    Sideloaded,
    Unused7,
    PdfFile,
    Unused8,
    ;
@@ -1,6 +1,6 @@
|
||||
package nu.marginalia.model;
|
||||
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
@@ -21,25 +21,70 @@ class EdgeUrlTest {
|
||||
new EdgeUrl("https://memex.marginalia.nu/#here")
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParam() throws URISyntaxException {
|
||||
System.out.println(new EdgeUrl("https://memex.marginalia.nu/index.php?id=1").toString());
|
||||
System.out.println(new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
|
||||
}
|
||||
@Test
|
||||
void urlencodeFixer() throws URISyntaxException {
|
||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/#heredoc"));
|
||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign"));
|
||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%22-sign"));
|
||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/\n \"huh\""));
|
||||
void testUriFromString() throws URISyntaxException {
|
||||
// We test these URLs several times as we perform URLEncode-fixing both when parsing the URL and when
|
||||
// converting it back to a string, we want to ensure there is no changes along the way.
|
||||
|
||||
Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/").getPath());
|
||||
Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/").toString());
|
||||
Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/").toString());
|
||||
|
||||
Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").getPath());
|
||||
Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").toString());
|
||||
Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/#heredoc").toString());
|
||||
|
||||
Assertions.assertEquals("/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").getPath());
|
||||
Assertions.assertEquals("https://www.example.com/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").toString());
|
||||
Assertions.assertEquals("https://www.example.com/trailingslash/", new EdgeUrl("https://www.example.com/trailingslash/").toString());
|
||||
|
||||
Assertions.assertEquals("/%-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").getPath());
|
||||
Assertions.assertEquals("https://www.example.com/%25-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").toString());
|
||||
Assertions.assertEquals("https://www.example.com/%25-sign", new EdgeUrl("https://www.example.com/%-sign").toString());
|
||||
|
||||
Assertions.assertEquals("/%-sign/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com//%-sign/\"-sign").getPath());
|
||||
Assertions.assertEquals("https://www.example.com/%25-sign/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com//%-sign/\"-sign").toString());
|
||||
Assertions.assertEquals("https://www.example.com/%25-sign/%22-sign", new EdgeUrl("https://www.example.com//%-sign/\"-sign").toString());
|
||||
|
||||
Assertions.assertEquals("/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").getPath());
|
||||
Assertions.assertEquals("https://www.example.com/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").toString());
|
||||
Assertions.assertEquals("https://www.example.com/%22-sign", new EdgeUrl("https://www.example.com/%22-sign").toString());
|
||||
|
||||
Assertions.assertEquals("/\n \"huh\"", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").getPath());
|
||||
Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").toString());
|
||||
Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", new EdgeUrl("https://www.example.com/\n \"huh\"").toString());
|
||||
|
||||
Assertions.assertEquals("/wiki/Sámi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").getPath());
|
||||
Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").toString());
|
||||
Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", new EdgeUrl("https://en.wikipedia.org/wiki/Sámi").toString());
|
||||
|
||||
Assertions.assertEquals("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k", new EdgeUrl("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k").toString());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testParms() throws URISyntaxException {
|
||||
System.out.println(new EdgeUrl("https://search.marginalia.nu/?id=123"));
|
||||
System.out.println(new EdgeUrl("https://search.marginalia.nu/?t=123"));
|
||||
System.out.println(new EdgeUrl("https://search.marginalia.nu/?v=123"));
|
||||
System.out.println(new EdgeUrl("https://search.marginalia.nu/?m=123"));
|
||||
System.out.println(new EdgeUrl("https://search.marginalia.nu/?follow=123"));
|
||||
Assertions.assertEquals("id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").param);
|
||||
Assertions.assertEquals("https://search.marginalia.nu/?id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").toString());
|
||||
|
||||
Assertions.assertEquals("t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").param);
|
||||
Assertions.assertEquals("https://search.marginalia.nu/?t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").toString());
|
||||
|
||||
Assertions.assertEquals("v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").param);
|
||||
Assertions.assertEquals("https://search.marginalia.nu/?v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").toString());
|
||||
|
||||
Assertions.assertEquals("id=1", new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").param);
|
||||
Assertions.assertEquals("https://memex.marginalia.nu/showthread.php?id=1",
|
||||
new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
|
||||
|
||||
|
||||
Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").param);
|
||||
Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").toString());
|
||||
|
||||
Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").param);
|
||||
Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").toString());
|
||||
|
||||
Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?m=123").param);
|
||||
Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?follow=123").param);
|
||||
}
|
||||
}
|
@@ -59,17 +59,14 @@ public class ProcessAdHocTaskHeartbeatImpl implements AutoCloseable, ProcessAdHo
     */
    @Override
    public void progress(String step, int stepProgress, int stepCount) {
        int lastProgress = this.progress;
        this.step = step;

        // off by one since we calculate the progress based on the number of steps,
        // and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
        // final progress being 80% and not 100%)

        this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);

        if (this.progress / 10 != lastProgress / 10) {
            logger.info("ProcessTask {} progress: {}%", taskBase, progress);
        }
    }

    /** Wrap a collection to provide heartbeat progress updates as it's iterated through */
    @Override
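The Enum.ordinal() comment spells out the intended calling convention: progress is computed from a one-based step index, since a zero-based ordinal would make the last of five steps report 80% rather than 100%. A small caller sketch under that assumption (the heartbeat variable, step names, and loop are illustrative):

    String[] steps = {"load", "transform", "write"};
    for (int i = 0; i < steps.length; i++) {
        heartbeat.progress(steps[i], i + 1, steps.length); // last iteration reports 100%
        // ... do the work for steps[i] ...
    }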
@@ -57,16 +57,13 @@ public class ServiceAdHocTaskHeartbeatImpl implements AutoCloseable, ServiceAdHo
     */
    @Override
    public void progress(String step, int stepProgress, int stepCount) {
        int lastProgress = this.progress;
        this.step = step;

        // off by one since we calculate the progress based on the number of steps,
        // and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
        // final progress being 80% and not 100%)

        this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);

        logger.info("ServiceTask {} progress: {}%", taskBase, progress);
        if (this.progress / 10 != lastProgress / 10) {
            logger.info("ProcessTask {} progress: {}%", taskBase, progress);
        }
    }

    public void shutDown() {
@@ -122,6 +122,11 @@ public class JoobyService {
        // single digit percentage difference since HTML already compresses very well with level = 1.
        options.setCompressionLevel(1);

        // Set a cap on the number of worker threads, as Jooby's default value does not seem to consider
        // multi-tenant servers with high thread counts, and spins up an exorbitant number of threads in that
        // scenario
        options.setWorkerThreads(Math.min(128, options.getWorkerThreads()));

        jooby.setServerOptions(options);
@@ -3,11 +3,18 @@
|
||||
<Console name="Console" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ProcessConsole" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%style{P}{FG_Cyan} %msg%n"/>
|
||||
<Filters>
|
||||
<MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<JSONLayout compact="true" eventEol="true" properties="true" stacktraceAsString="true" includeTimeMillis="true"/>
|
||||
@@ -15,6 +22,7 @@
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
<SizeBasedTriggeringPolicy size="10MB" />
|
||||
</RollingFile>
|
||||
@@ -31,9 +39,11 @@
|
||||
</Appenders>
|
||||
<Loggers>
|
||||
<Logger name="org.apache.zookeeper" level="WARN" />
|
||||
|
||||
<Logger name="org.apache.pdfbox" level="ERROR" />
|
||||
<Logger name="org.apache.fontbox.ttf" level="ERROR" />
|
||||
<Root level="info">
|
||||
<AppenderRef ref="Console"/>
|
||||
<AppenderRef ref="ProcessConsole"/>
|
||||
<AppenderRef ref="LogToFile"/>
|
||||
</Root>
|
||||
</Loggers>
|
||||
|
@@ -1,13 +1,51 @@
|
||||
<Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
|
||||
<Appenders>
|
||||
<Console name="Console" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
|
||||
<Console name="ConsoleInfo" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="- %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<LevelMatchFilter level="INFO" onMatch="ALLOW" onMismatch="DENY"/>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ConsoleWarn" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="⚠ %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<LevelMatchFilter level="WARN" onMatch="ALLOW" onMismatch="DENY"/>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ConsoleError" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="🔥 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<LevelMatchFilter level="ERROR" onMatch="ALLOW" onMismatch="DENY"/>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ConsoleFatal" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="💀 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<LevelMatchFilter level="FATAL" onMatch="ALLOW" onMismatch="DENY"/>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ProcessConsole" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%style{%msg%n}{FG_Cyan}"/>
|
||||
<Filters>
|
||||
<MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<PatternLayout>
|
||||
@@ -34,9 +72,14 @@
|
||||
</Appenders>
|
||||
<Loggers>
|
||||
<Logger name="org.apache.zookeeper" level="WARN" />
|
||||
|
||||
<Logger name="org.apache.pdfbox" level="ERROR" />
|
||||
<Logger name="org.apache.fontbox.ttf" level="ERROR" />
|
||||
<Root level="info">
|
||||
<AppenderRef ref="Console"/>
|
||||
<AppenderRef ref="ConsoleInfo"/>
|
||||
<AppenderRef ref="ConsoleWarn"/>
|
||||
<AppenderRef ref="ConsoleError"/>
|
||||
<AppenderRef ref="ConsoleFatal"/>
|
||||
<AppenderRef ref="ProcessConsole"/>
|
||||
<AppenderRef ref="LogToFile"/>
|
||||
</Root>
|
||||
</Loggers>
|
||||
|
@@ -1,15 +1,50 @@
|
||||
<Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
|
||||
<Appenders>
|
||||
<Console name="Console" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
|
||||
<Console name="ConsoleInfo" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="- %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<LevelMatchFilter level="INFO" onMatch="ALLOW" onMismatch="DENY"/>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ConsoleWarn" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="⚠ %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<LevelMatchFilter level="WARN" onMatch="ALLOW" onMismatch="DENY"/>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ConsoleError" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="🔥 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<LevelMatchFilter level="ERROR" onMatch="ALLOW" onMismatch="DENY"/>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ConsoleFatal" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="💀 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<LevelMatchFilter level="FATAL" onMatch="ALLOW" onMismatch="DENY"/>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ProcessConsole" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%style{%msg%n}{FG_Cyan}"/>
|
||||
<Filters>
|
||||
<MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</Console>
|
||||
</Appenders>
|
||||
<Loggers>
|
||||
<Logger name="org.apache.zookeeper" level="WARN" />
|
||||
|
||||
<Logger name="org.apache.pdfbox" level="ERROR" />
|
||||
<Logger name="org.apache.fontbox.ttf" level="ERROR" />
|
||||
<Root level="info">
|
||||
<AppenderRef ref="Console"/>
|
||||
<AppenderRef ref="LogToFile"/>
|
||||
<AppenderRef ref="ConsoleInfo"/>
|
||||
<AppenderRef ref="ConsoleWarn"/>
|
||||
<AppenderRef ref="ConsoleError"/>
|
||||
<AppenderRef ref="ConsoleFatal"/>
|
||||
<AppenderRef ref="ProcessConsole"/>
|
||||
</Root>
|
||||
</Loggers>
|
||||
</Configuration>
|
@@ -48,12 +48,13 @@ public class ExecutorExportClient {
        return msgId;
    }

    public void exportSampleData(int node, FileStorageId fid, int size, String name) {
    public void exportSampleData(int node, FileStorageId fid, int size, String ctFilter, String name) {
        channelPool.call(ExecutorExportApiBlockingStub::exportSampleData)
                .forNode(node)
                .run(RpcExportSampleData.newBuilder()
                        .setFileStorageId(fid.id())
                        .setSize(size)
                        .setCtFilter(ctFilter)
                        .setName(name)
                        .build());
    }
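With the extra parameter, callers now pass a content-type filter alongside the sample size. An illustrative call (the node id, storage id, "text/html" filter value, and export name are made-up placeholders):

    exportClient.exportSampleData(1, FileStorageId.of(42), 1000, "text/html", "html-only-sample");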
@@ -100,6 +100,7 @@ message RpcExportSampleData {
    int64 fileStorageId = 1;
    int32 size = 2;
    string name = 3;
    string ctFilter = 4;
}
message RpcDownloadSampleData {
    string sampleSet = 1;
@@ -8,6 +8,7 @@ import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.actor.state.Resume;
|
||||
import nu.marginalia.service.control.ServiceEventLog;
|
||||
import nu.marginalia.service.control.ServiceHeartbeat;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorage;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
@@ -19,6 +20,7 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.*;
|
||||
import java.net.HttpURLConnection;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URI;
|
||||
import java.net.URL;
|
||||
@@ -32,6 +34,7 @@ public class DownloadSampleActor extends RecordActorPrototype {
|
||||
|
||||
private final FileStorageService storageService;
|
||||
private final ServiceEventLog eventLog;
|
||||
private final ServiceHeartbeat heartbeat;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@Resume(behavior = ActorResumeBehavior.ERROR)
|
||||
@@ -66,15 +69,39 @@ public class DownloadSampleActor extends RecordActorPrototype {
|
||||
|
||||
Files.deleteIfExists(Path.of(tarFileName));
|
||||
|
||||
try (var is = new BufferedInputStream(new URI(downloadURI).toURL().openStream());
|
||||
HttpURLConnection urlConnection = (HttpURLConnection) new URI(downloadURI).toURL().openConnection();
|
||||
|
||||
try (var hb = heartbeat.createServiceAdHocTaskHeartbeat("Downloading sample")) {
|
||||
long size = urlConnection.getContentLengthLong();
|
||||
byte[] buffer = new byte[8192];
|
||||
|
||||
try (var is = new BufferedInputStream(urlConnection.getInputStream());
|
||||
var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
|
||||
is.transferTo(os);
|
||||
long copiedSize = 0;
|
||||
|
||||
while (copiedSize < size) {
|
||||
int read = is.read(buffer);
|
||||
|
||||
if (read < 0) // We've been promised a file of length 'size'
|
||||
throw new IOException("Unexpected end of stream");
|
||||
|
||||
os.write(buffer, 0, read);
|
||||
copiedSize += read;
|
||||
|
||||
// Update progress bar
|
||||
hb.progress(String.format("%d MB", copiedSize / 1024 / 1024), (int) (copiedSize / 1024), (int) (size / 1024));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
catch (Exception ex) {
|
||||
eventLog.logEvent(DownloadSampleActor.class, "Error downloading sample");
|
||||
logger.error("Error downloading sample", ex);
|
||||
yield new Error();
|
||||
}
|
||||
finally {
|
||||
urlConnection.disconnect();
|
||||
}
|
||||
|
||||
eventLog.logEvent(DownloadSampleActor.class, "Download complete");
|
||||
yield new Extract(fileStorageId, tarFileName);
|
||||
@@ -170,11 +197,12 @@ public class DownloadSampleActor extends RecordActorPrototype {
|
||||
@Inject
|
||||
public DownloadSampleActor(Gson gson,
|
||||
FileStorageService storageService,
|
||||
ServiceEventLog eventLog)
|
||||
ServiceEventLog eventLog, ServiceHeartbeat heartbeat)
|
||||
{
|
||||
super(gson);
|
||||
this.storageService = storageService;
|
||||
this.eventLog = eventLog;
|
||||
this.heartbeat = heartbeat;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -26,32 +26,32 @@ public class ExportSampleDataActor extends RecordActorPrototype {
|
||||
private final MqOutbox exportTasksOutbox;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
public record Export(FileStorageId crawlId, int size, String name) implements ActorStep {}
|
||||
public record Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) implements ActorStep {
|
||||
public Run(FileStorageId crawlId, FileStorageId destId, int size, String name) {
|
||||
this(crawlId, destId, size, name, -1);
|
||||
public record Export(FileStorageId crawlId, int size, String ctFilter, String name) implements ActorStep {}
|
||||
public record Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name, long msgId) implements ActorStep {
|
||||
public Run(FileStorageId crawlId, FileStorageId destId, int size, String name, String ctFilter) {
|
||||
this(crawlId, destId, size, name, ctFilter,-1);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch(self) {
|
||||
case Export(FileStorageId crawlId, int size, String name) -> {
|
||||
case Export(FileStorageId crawlId, int size, String ctFilter, String name) -> {
|
||||
var storage = storageService.allocateStorage(FileStorageType.EXPORT,
|
||||
"crawl-sample-export",
|
||||
"Crawl Data Sample " + name + "/" + size + " " + LocalDateTime.now()
|
||||
);
|
||||
|
||||
if (storage == null) yield new Error("Bad storage id");
|
||||
yield new Run(crawlId, storage.id(), size, name);
|
||||
yield new Run(crawlId, storage.id(), size, ctFilter, name);
|
||||
}
|
||||
case Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) when msgId < 0 -> {
|
||||
case Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name, long msgId) when msgId < 0 -> {
|
||||
storageService.setFileStorageState(destId, FileStorageState.NEW);
|
||||
|
||||
long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, size, name));
|
||||
yield new Run(crawlId, destId, size, name, newMsgId);
|
||||
long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, ctFilter, size, name));
|
||||
yield new Run(crawlId, destId, size, ctFilter, name, newMsgId);
|
||||
}
|
||||
case Run(_, FileStorageId destId, _, _, long msgId) -> {
|
||||
case Run(_, FileStorageId destId, _, _, _, long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
@@ -70,7 +70,7 @@ public class ExportSampleDataActor extends RecordActorPrototype {
|
||||
|
||||
@Override
|
||||
public String describe() {
|
||||
return "Export RSS/Atom feeds from crawl data";
|
||||
return "Export sample crawl data";
|
||||
}
|
||||
|
||||
@Inject
|
||||
|
@@ -49,6 +49,7 @@ public class ExecutorExportGrpcService
        new ExportSampleDataActor.Export(
                FileStorageId.of(request.getFileStorageId()),
                request.getSize(),
                request.getCtFilter(),
                request.getName()
        )
);
@@ -229,13 +229,15 @@ public class FeedFetcherService {
                .timeout(Duration.ofSeconds(15))
                ;

        if (ifModifiedSinceDate != null) {
        // Set the If-Modified-Since or If-None-Match headers if we have them
        // though since there are certain idiosyncrasies in server implementations,
        // we avoid setting both at the same time as that may turn a 304 into a 200.
        if (ifNoneMatchTag != null) {
            requestBuilder.header("If-None-Match", ifNoneMatchTag);
        } else if (ifModifiedSinceDate != null) {
            requestBuilder.header("If-Modified-Since", ifModifiedSinceDate);
        }

        if (ifNoneMatchTag != null) {
            requestBuilder.header("If-None-Match", ifNoneMatchTag);
        }

        HttpRequest getRequest = requestBuilder.build();
@@ -79,9 +79,17 @@ public class SimpleFeedParser {
        if (!link.isBlank())
            break;
        var tag = element.getElementsByTag(attr).first();

        if (tag != null) {
            link = tag.text();
            String linkText = tag.text();

            if (linkText.isBlank()) {
                linkText = tag.attr("href");
            }

            link = linkText;
        }

    }

    ret.add(new ItemData(title, description, link, pubDate));
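The fallback matters because RSS 2.0 carries an item's URL as element text (<link>https://...</link>) while Atom carries it in an attribute (<link href="https://..."/>), so tag.text() comes back blank for Atom entries. A small jsoup sketch of the Atom case (classes from org.jsoup; the feed snippet is made up):

    Document atom = Jsoup.parse("<entry><link href=\"https://example.com/post\"/></entry>", "", Parser.xmlParser());
    Element link = atom.getElementsByTag("link").first();
    link.text();       // "" for Atom, so the parser now falls back to...
    link.attr("href"); // "https://example.com/post"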
@@ -1,6 +1,7 @@
|
||||
package nu.marginalia.api.searchquery.model.results;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
|
||||
import nu.marginalia.model.DocumentFormat;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
@@ -161,4 +162,14 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
|
||||
public String toString() {
|
||||
return "DecoratedSearchResultItem(rawIndexResult=" + this.getRawIndexResult() + ", url=" + this.getUrl() + ", title=" + this.getTitle() + ", description=" + this.getDescription() + ", urlQuality=" + this.getUrlQuality() + ", format=" + this.getFormat() + ", features=" + this.getFeatures() + ", pubYear=" + this.getPubYear() + ", dataHash=" + this.getDataHash() + ", wordsTotal=" + this.getWordsTotal() + ", bestPositions=" + this.getBestPositions() + ", rankingScore=" + this.getRankingScore() + ", resultsFromDomain=" + this.getResultsFromDomain() + ", rankingDetails=" + this.getRankingDetails() + ")";
|
||||
}
|
||||
|
||||
public String getShortFormat() {
|
||||
try {
|
||||
var df = DocumentFormat.valueOf(format);
|
||||
return df.shortFormat;
|
||||
}
|
||||
catch (IllegalArgumentException e) {
|
||||
return DocumentFormat.UNKNOWN.shortFormat;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -62,6 +62,7 @@ dependencies {
    implementation libs.jwarc

    implementation libs.jsoup
    implementation libs.pdfbox

    implementation libs.guava
    implementation dependencies.create(libs.guice.get()) {
@@ -1,8 +1,8 @@
|
||||
package nu.marginalia.converting.model;
|
||||
|
||||
import nu.marginalia.model.DocumentFormat;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.idx.DocumentMetadata;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
@@ -21,7 +21,7 @@ public class ProcessedDocumentDetails {
|
||||
public long hashCode;
|
||||
|
||||
public Set<HtmlFeature> features;
|
||||
public HtmlStandard standard;
|
||||
public DocumentFormat format;
|
||||
|
||||
public List<EdgeUrl> linksInternal;
|
||||
public List<EdgeUrl> linksExternal;
|
||||
@@ -30,6 +30,6 @@ public class ProcessedDocumentDetails {
|
||||
public GeneratorType generator;
|
||||
|
||||
public String toString() {
|
||||
return "ProcessedDocumentDetails(title=" + this.title + ", description=" + this.description + ", pubYear=" + this.pubYear + ", length=" + this.length + ", quality=" + this.quality + ", hashCode=" + this.hashCode + ", features=" + this.features + ", standard=" + this.standard + ", linksInternal=" + this.linksInternal + ", linksExternal=" + this.linksExternal + ", metadata=" + this.metadata + ", generator=" + this.generator + ")";
|
||||
return "ProcessedDocumentDetails(title=" + this.title + ", description=" + this.description + ", pubYear=" + this.pubYear + ", length=" + this.length + ", quality=" + this.quality + ", hashCode=" + this.hashCode + ", features=" + this.features + ", standard=" + this.format + ", linksInternal=" + this.linksInternal + ", linksExternal=" + this.linksExternal + ", metadata=" + this.metadata + ", generator=" + this.generator + ")";
|
||||
}
|
||||
}
|
||||
|
@@ -7,6 +7,7 @@ import nu.marginalia.converting.model.DisqualifiedException;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.processor.plugin.AbstractDocumentProcessorPlugin;
import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
import nu.marginalia.converting.processor.plugin.PdfDocumentProcessorPlugin;
import nu.marginalia.converting.processor.plugin.PlainTextDocumentProcessorPlugin;
import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.model.EdgeDomain;
@@ -33,7 +34,8 @@ public class DocumentProcessor {
    private static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
            "application/xhtml",
            "text/html",
            "text/plain");
            "text/plain",
            "application/pdf");

    private final List<AbstractDocumentProcessorPlugin> processorPlugins = new ArrayList<>();
@@ -42,12 +44,14 @@ public class DocumentProcessor {
    @Inject
    public DocumentProcessor(HtmlDocumentProcessorPlugin htmlDocumentProcessorPlugin,
                             PlainTextDocumentProcessorPlugin plainTextDocumentProcessorPlugin,
                             PdfDocumentProcessorPlugin pdfDocumentProcessorPlugin,
                             AnchorTextKeywords anchorTextKeywords)
    {
        this.anchorTextKeywords = anchorTextKeywords;

        processorPlugins.add(htmlDocumentProcessorPlugin);
        processorPlugins.add(plainTextDocumentProcessorPlugin);
        processorPlugins.add(pdfDocumentProcessorPlugin);
    }

    public ProcessedDocument process(CrawledDocument crawledDocument,
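The new plugin only takes effect when the content type passes acceptedContentTypes and one of the registered plugins claims the document. The selection step itself is not shown in this hunk; a plausible dispatch loop, inferred from the isApplicable/createDetails methods the plugins expose, would look roughly like the sketch below (an assumption, not the project's actual code):

    for (AbstractDocumentProcessorPlugin plugin : processorPlugins) {
        if (plugin.isApplicable(crawledDocument)) {
            return plugin.createDetails(crawledDocument, linkTexts, documentClass);
        }
    }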
@@ -2,9 +2,9 @@ package nu.marginalia.converting.processor.logic;

import crawlercommons.utils.Strings;
import nu.marginalia.converting.model.DisqualifiedException;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.html.HtmlStandard;
import org.jetbrains.annotations.NotNull;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@@ -17,7 +17,7 @@ import java.util.Set;
public class DocumentValuator {

    public double getQuality(CrawledDocument crawledDocument,
                             HtmlStandard htmlStandard,
                             DocumentFormat htmlStandard,
                             Document parsedDocument,
                             int textLength) throws DisqualifiedException {
@@ -1,7 +1,7 @@
|
||||
package nu.marginalia.converting.processor.logic;
|
||||
|
||||
import com.google.common.base.Strings;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.DocumentFormat;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.DocumentType;
|
||||
import org.slf4j.Logger;
|
||||
@@ -12,54 +12,54 @@ public class HtmlStandardExtractor {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(HtmlStandardExtractor.class);
|
||||
|
||||
public static HtmlStandard parseDocType(DocumentType docType) {
|
||||
public static DocumentFormat parseDocType(DocumentType docType) {
|
||||
if (null == docType) {
|
||||
return HtmlStandard.UNKNOWN;
|
||||
return DocumentFormat.UNKNOWN;
|
||||
}
|
||||
|
||||
String publicId = docType.publicId();
|
||||
if (Strings.isNullOrEmpty(publicId))
|
||||
return HtmlStandard.HTML5;
|
||||
return DocumentFormat.HTML5;
|
||||
|
||||
publicId = publicId.toUpperCase();
|
||||
if (publicId.startsWith("-//SOFTQUAD SOFTWARE//DTD") && publicId.contains("HTML 4")) {
|
||||
return HtmlStandard.HTML4;
|
||||
return DocumentFormat.HTML4;
|
||||
}
|
||||
if (publicId.startsWith("-//SOFTQUAD SOFTWARE//DTD") && publicId.contains("HTML 3")) {
|
||||
return HtmlStandard.HTML123;
|
||||
return DocumentFormat.HTML123;
|
||||
}
|
||||
if (publicId.startsWith("-//INTERNET/RFC XXXX//EN"))
|
||||
return HtmlStandard.HTML123;
|
||||
return DocumentFormat.HTML123;
|
||||
if (publicId.startsWith("-//NETSCAPE COMM. CORP"))
|
||||
return HtmlStandard.HTML123;
|
||||
return DocumentFormat.HTML123;
|
||||
if (publicId.startsWith("-//SQ//DTD HTML 2"))
|
||||
return HtmlStandard.HTML123;
|
||||
return DocumentFormat.HTML123;
|
||||
if (publicId.startsWith("-//SOFTQUAD//DTD HTML 2"))
|
||||
return HtmlStandard.HTML123;
|
||||
return DocumentFormat.HTML123;
|
||||
if (publicId.startsWith("-//W3O//DTD W3 HTML 2"))
|
||||
return HtmlStandard.HTML123;
|
||||
return DocumentFormat.HTML123;
|
||||
if (publicId.startsWith("-//IETF//DTD HTML 2"))
|
||||
return HtmlStandard.HTML123;
|
||||
return DocumentFormat.HTML123;
|
||||
if (publicId.startsWith("-//IETF//DTD HTML//EN"))
|
||||
return HtmlStandard.HTML123;
|
||||
return DocumentFormat.HTML123;
|
||||
if (publicId.startsWith("-/W3C//DTD HTML 3"))
|
||||
return HtmlStandard.HTML123;
|
||||
return DocumentFormat.HTML123;
|
||||
if (publicId.startsWith("-/W3C/DTD HTML 3"))
|
||||
return HtmlStandard.HTML123;
|
||||
return DocumentFormat.HTML123;
|
||||
if (publicId.startsWith("-//IETF//DTD HTML 3"))
|
||||
return HtmlStandard.HTML123;
|
||||
return DocumentFormat.HTML123;
|
||||
if (publicId.startsWith("-//W3C//DTD XHTML"))
|
||||
return HtmlStandard.XHTML;
|
||||
return DocumentFormat.XHTML;
|
||||
if (publicId.startsWith("ISO/IEC 15445:2000//DTD"))
|
||||
return HtmlStandard.XHTML;
|
||||
return DocumentFormat.XHTML;
|
||||
if (publicId.startsWith("-//W3C//DTD HTML"))
|
||||
return HtmlStandard.HTML4;
|
||||
return DocumentFormat.HTML4;
|
||||
|
||||
logger.debug("Unknown publicID standard {}", publicId);
|
||||
return HtmlStandard.UNKNOWN;
|
||||
return DocumentFormat.UNKNOWN;
|
||||
}
|
||||
|
||||
public static HtmlStandard sniffHtmlStandard(Document parsed) {
|
||||
public static DocumentFormat sniffHtmlStandard(Document parsed) {
|
||||
int html4Attributes = 0;
|
||||
int html5Attributes = 0;
|
||||
|
||||
@@ -73,11 +73,11 @@ public class HtmlStandardExtractor {
|
||||
html4Attributes++;
|
||||
}
|
||||
if (html5Attributes > 0) {
|
||||
return HtmlStandard.HTML5;
|
||||
return DocumentFormat.HTML5;
|
||||
}
|
||||
if (html4Attributes > 0) {
|
||||
return HtmlStandard.HTML4;
|
||||
return DocumentFormat.HTML4;
|
||||
}
|
||||
return HtmlStandard.HTML123;
|
||||
return DocumentFormat.HTML123;
|
||||
}
|
||||
}
|
||||
|
@@ -7,11 +7,11 @@ import nu.marginalia.keyword.LinkTexts;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
import nu.marginalia.language.filter.LanguageFilter;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.model.DocumentFormat;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.io.IOException;
|
||||
@@ -73,7 +73,7 @@ public abstract class AbstractDocumentProcessorPlugin {
|
||||
return this;
|
||||
}
|
||||
|
||||
public MetaTagsBuilder addFormat(HtmlStandard standard) {
|
||||
public MetaTagsBuilder addFormat(DocumentFormat standard) {
|
||||
|
||||
add("format", standard);
|
||||
|
||||
|
@@ -25,12 +25,12 @@ import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
||||
import nu.marginalia.link_parser.FeedExtractor;
|
||||
import nu.marginalia.link_parser.LinkParser;
|
||||
import nu.marginalia.model.DocumentFormat;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.idx.DocumentFlags;
|
||||
import nu.marginalia.model.idx.DocumentMetadata;
|
||||
import org.jsoup.nodes.Document;
|
||||
@@ -137,8 +137,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
|
||||
|
||||
final int length = getLength(doc);
|
||||
final HtmlStandard standard = getHtmlStandard(doc);
|
||||
final double quality = documentValuator.getQuality(crawledDocument, standard, doc, length);
|
||||
final DocumentFormat format = getDocumentFormat(doc);
|
||||
final double quality = documentValuator.getQuality(crawledDocument, format, doc, length);
|
||||
|
||||
if (isDisqualified(documentClass, url, quality, doc.title())) {
|
||||
throw new DisqualifiedException(DisqualificationReason.QUALITY);
|
||||
@@ -152,7 +152,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
var ret = new ProcessedDocumentDetails();
|
||||
|
||||
ret.length = length;
|
||||
ret.standard = standard;
|
||||
ret.format = format;
|
||||
ret.title = specialization.getTitle(doc, dld, crawledDocument.url);
|
||||
|
||||
final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, documentHeaders, dld);
|
||||
@@ -161,7 +161,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
ret.quality = documentValuator.adjustQuality(quality, features);
|
||||
ret.hashCode = dld.localitySensitiveHashCode();
|
||||
|
||||
PubDate pubDate = pubDateSniffer.getPubDate(documentHeaders, url, doc, standard, true);
|
||||
PubDate pubDate = pubDateSniffer.getPubDate(documentHeaders, url, doc, format, true);
|
||||
|
||||
EnumSet<DocumentFlags> documentFlags = documentFlags(features, generatorParts.type());
|
||||
|
||||
@@ -180,7 +180,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
.addPubDate(pubDate)
|
||||
.addUrl(url)
|
||||
.addFeatures(features)
|
||||
.addFormat(standard)
|
||||
.addFormat(format)
|
||||
.addGenerator(generatorParts.keywords())
|
||||
.build();
|
||||
|
||||
@@ -316,12 +316,12 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
return linkTerms;
|
||||
}
|
||||
|
||||
private HtmlStandard getHtmlStandard(Document doc) {
|
||||
HtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(doc.documentType());
|
||||
if (HtmlStandard.UNKNOWN.equals(htmlStandard)) {
|
||||
private DocumentFormat getDocumentFormat(Document doc) {
|
||||
DocumentFormat format = HtmlStandardExtractor.parseDocType(doc.documentType());
|
||||
if (DocumentFormat.UNKNOWN.equals(format)) {
|
||||
return HtmlStandardExtractor.sniffHtmlStandard(doc);
|
||||
}
|
||||
return htmlStandard;
|
||||
return format;
|
||||
}
|
||||
|
||||
private int getLength(Document doc) {
|
||||
|
@@ -0,0 +1,286 @@
package nu.marginalia.converting.processor.plugin;

import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.converting.model.DisqualifiedException;
import nu.marginalia.converting.model.ProcessedDocumentDetails;
import nu.marginalia.converting.processor.DocumentClass;
import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
import nu.marginalia.converting.processor.plugin.specialization.DefaultSpecialization;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.filter.LanguageFilter;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import org.apache.commons.lang3.StringUtils;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.text.HeadingAwarePDFTextStripper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.URISyntaxException;
import java.time.LocalDate;
import java.util.*;

public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {

private final int maxTitleLength;
private final DocumentKeywordExtractor keywordExtractor;
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
private final DocumentLengthLogic documentLengthLogic;
private final DefaultSpecialization defaultSpecialization;

private static final Logger logger = LoggerFactory.getLogger(PdfDocumentProcessorPlugin.class);

@Inject
public PdfDocumentProcessorPlugin(@Named("max-title-length") Integer maxTitleLength,
LanguageFilter languageFilter,
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
DocumentKeywordExtractor keywordExtractor,
DocumentLengthLogic documentLengthLogic,
DefaultSpecialization defaultSpecialization)
{
super(languageFilter);
this.sentenceExtractorProvider = sentenceExtractorProvider;
this.documentLengthLogic = documentLengthLogic;
this.maxTitleLength = maxTitleLength;
this.keywordExtractor = keywordExtractor;
this.defaultSpecialization = defaultSpecialization;
}

@Override
public boolean isApplicable(CrawledDocument doc) {
String contentType = doc.contentType.toLowerCase();

if (contentType.equals("application/pdf"))
return true;
if (contentType.startsWith("application/pdf;")) // charset=blabla
return true;

return false;
}

@Override
public DetailsWithWords createDetails(CrawledDocument crawledDocument,
LinkTexts linkTexts,
DocumentClass documentClass)
throws DisqualifiedException, URISyntaxException, IOException {

String documentBody = crawledDocument.documentBody();

if (languageFilter.isBlockedUnicodeRange(documentBody)) {
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE);
}

final EdgeUrl url = new EdgeUrl(crawledDocument.url);

Document doc;
try {
doc = convertPdfToHtml(crawledDocument.documentBodyBytes);
} catch (IOException e) {
logger.error("Failed to convert PDF file {} - {}", url, e.getMessage());
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.ERROR);
}

DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(doc);

checkDocumentLanguage(dld);

documentLengthLogic.validateLength(dld, 1.0);

var ret = new ProcessedDocumentDetails();

ret.length = documentBody.length();
ret.format = DocumentFormat.PDF;
ret.title = StringUtils.truncate(defaultSpecialization.getTitle(doc, dld, url.toString()), maxTitleLength);

ret.quality = -5;

ret.features = Set.of(HtmlFeature.PDF);
ret.description = getDescription(doc);
ret.hashCode = dld.localitySensitiveHashCode();

final PubDate pubDate = new PubDate(LocalDate.ofYearDay(1993, 1));

EnumSet<DocumentFlags> documentFlags = EnumSet.of(DocumentFlags.PdfFile);

ret.metadata = new DocumentMetadata(
documentLengthLogic.getEncodedAverageLength(dld),
pubDate.yearByte(),
(int) -ret.quality,
documentFlags);

DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, linkTexts, url);

var tagWords = new MetaTagsBuilder()
.addPubDate(pubDate)
.addUrl(url)
.addFeatures(ret.features)
.addFormat(ret.format)
.build();

words.addAllSyntheticTerms(tagWords);

if (pubDate.hasYear()) {
ret.pubYear = pubDate.year();
}

/* These are assumed to be populated */
ret.linksInternal = new ArrayList<>();
ret.linksExternal = new ArrayList<>();

return new DetailsWithWords(ret, words);
}

private String getDescription(Document doc) {
int cnt = 0;
boolean useNext = false;
for (var ptag : doc.getElementsByTag("p")) {
String text = ptag.text();

// Many academic documents have an abstract at the start of the document,
// which makes a nice summary. Though they tend to bleed into the text,
// so we check for the word "Abstract" at the start of the paragraph.

if (text.startsWith("Abstract ")) {
return StringUtils.abbreviate(text.substring("Abstract ".length()), "...", 255);
}
else if (text.equals("Abstract")) {
useNext = true;
}
else if (useNext) {
return StringUtils.abbreviate(text, "...", 255);
}

if (++cnt > 15) { // Don't scan the entire document
break;
}
}

// Fall back to the default specialization
return defaultSpecialization.getSummary(doc, Set.of());
}

/** Convert the provided PDF bytes into a HTML rendering that can be fed
* to the HTML processor.
*/
Document convertPdfToHtml(byte[] pdfBytes) throws IOException {
try (var doc = Loader.loadPDF(pdfBytes)) {
String docMetaTitle = Objects.requireNonNullElse(doc.getDocumentInformation().getTitle(), "");

var stripper = new HeadingAwarePDFTextStripper();
stripper.setStartPage(1);
stripper.setSortByPosition(true);
stripper.setWordSeparator(" ");

// Increase the tolerance for line spacing to deal better with paragraphs.
stripper.setDropThreshold(5f);

stripper.setPageStart("<div>");
stripper.setParagraphStart("<p>");
stripper.setParagraphEnd("</p>\n");
stripper.setPageEnd("</div>\n");
stripper.setHeadingStart("<h1>");
stripper.setHeadingEnd("</h1>\n");
stripper.setLineSeparator("\n");

String text = stripper.getText(doc);

StringBuilder htmlBuilder = new StringBuilder(text.length() + 1024);
htmlBuilder.append("<html><body>")
.append(text)
.append("</body></html>");

var parsed = Jsoup.parse(htmlBuilder.toString());

repairDOM(parsed);

for (var heading : parsed.getElementsByTag("h1")) {
String headingText = heading.text();
if (headingText.length() > 2) {
parsed.title(headingText);
break;
}
}

if (parsed.title().isEmpty()) {
// Prefer setting the title to the first paragraph in the
// document, as this is almost always correct. Otherwise,
// we fall back on the metadata title, which is almost always
// useless

var firstP = parsed.getElementsByTag("p").first();
if (firstP != null) parsed.title(firstP.text());
else parsed.title(docMetaTitle);
}
return parsed;
}

}

/** Repair the DOM to remove some common issues with PDF conversion,
* including empty paragraphs, and multiline headers that are split into multiple
* consecutive h1 tags.
*/
private void repairDOM(Document parsed) {

// <p><h1>...</h1></p> -> <h1>...</h1>
parsed.getElementsByTag("h1").forEach(h1 -> {
var parent = h1.parent();
if (parent == null || !"p".equals(parent.tagName())) {
return;
}

if (parent.childrenSize() == 1) {
parent.replaceWith(h1);
}
});

// Remove empty <p> tags
parsed.getElementsByTag("p").forEach(p -> {
if (p.childrenSize() == 0 && !p.hasText()) {
p.remove();
}
});

// <h1>...</h1><h1>...</h1> -> <h1>...</h1>
parsed.getElementsByTag("h1").forEach(h1 -> {
var nextSibling = h1.nextElementSibling();
if (nextSibling == null || !"h1".equals(nextSibling.tagName())) {
return; // Short-circuit to avoid unnecessary work
}

StringJoiner joiner = new StringJoiner(" ");
joiner.add(h1.text());

for (var sibling : h1.nextElementSiblings()) {
if (!"h1".equals(sibling.tagName()))
break;
joiner.add(sibling.text());
sibling.remove();
}

h1.text(joiner.toString());
});

}

}
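As a quick way to eyeball what the stripper settings above produce, the text extraction can be approximated with stock PDFBox classes. This is a standalone sketch, not part of this changeset: the real plugin relies on the custom HeadingAwarePDFTextStripper for the <h1>/<p> markup, and the class name and sample invocation here are purely illustrative.

// Standalone sketch using plain PDFBox (assumed available on the classpath).
// It only dumps raw text; heading/paragraph markup as produced by the plugin
// requires the HeadingAwarePDFTextStripper used above.
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.text.PDFTextStripper;

import java.nio.file.Files;
import java.nio.file.Path;

class PdfTextDump {
    public static void main(String[] args) throws Exception {
        byte[] pdfBytes = Files.readAllBytes(Path.of(args[0])); // caller supplies the PDF path
        try (var doc = Loader.loadPDF(pdfBytes)) {
            var stripper = new PDFTextStripper();
            stripper.setStartPage(1);
            stripper.setSortByPosition(true);
            System.out.println(stripper.getText(doc));
        }
    }
}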
@@ -13,10 +13,10 @@ import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.filter.LanguageFilter;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import org.apache.commons.lang3.StringUtils;
@@ -91,7 +91,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP

ret.length = documentBody.length();

ret.standard = HtmlStandard.PLAIN;
ret.format = DocumentFormat.PLAIN;
ret.title = StringUtils.truncate(plainTextLogic.getTitle(url, firstFewLines), maxTitleLength);

ret.quality = -1;
@@ -113,7 +113,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
.addPubDate(pubDate)
.addUrl(url)
.addFeatures(ret.features)
.addFormat(ret.standard)
.addFormat(ret.format)
.build();

words.addAllSyntheticTerms(tagWords);
@@ -1,12 +1,13 @@
package nu.marginalia.converting.processor.pubdate;

import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.DocumentFormat;

public class PubDateFromHtmlStandard {
/** Used to bias pub date heuristics */
public static int blindGuess(HtmlStandard standard) {
return switch (standard) {
public static int blindGuess(DocumentFormat format) {
return switch (format) {
case PLAIN -> 1993;
case PDF -> 2010;
case HTML123 -> 1997;
case HTML4, XHTML -> 2006;
case HTML5 -> 2018;
@@ -21,8 +22,8 @@ public class PubDateFromHtmlStandard {
* Discovering publication year involves a lot of guesswork, this helps
* keep the guesses relatively sane.
*/
public static boolean isGuessPlausible(HtmlStandard standard, int year) {
switch (standard) {
public static boolean isGuessPlausible(DocumentFormat format, int year) {
switch (format) {
case HTML123:
return year <= 2000;
case XHTML:
@@ -1,14 +1,14 @@
package nu.marginalia.converting.processor.pubdate;

import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;

public interface PubDateHeuristic {

Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard);
Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard);
}
@@ -1,7 +1,7 @@
package nu.marginalia.converting.processor.pubdate;

import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;

import java.time.DateTimeException;
import java.time.LocalDate;
@@ -26,7 +26,7 @@ public class PubDateParser {
.filter(PubDateParser::validateDate);
}

public static Optional<PubDate> attemptParseDate(String date, HtmlStandard standard) {
public static Optional<PubDate> attemptParseDate(String date, DocumentFormat standard) {
return Optional.ofNullable(date)
.filter(str -> str.length() >= 4 && str.length() < 32)
.flatMap(str ->
@@ -81,7 +81,7 @@ public class PubDateParser {
}

public static Optional<PubDate> dateFromHighestYearLookingSubstringWithGuess(String maybe, HtmlStandard standard) {
public static Optional<PubDate> dateFromHighestYearLookingSubstringWithGuess(String maybe, DocumentFormat standard) {
int guess = PubDateFromHtmlStandard.blindGuess(standard);

var matcher = yearPattern.matcher(maybe);
@@ -135,7 +135,7 @@ public class PubDateParser {
return (max + min) / 2;
}

public static int guessYear(HtmlStandard standard) {
public static int guessYear(DocumentFormat standard) {
// Create some jitter to avoid having documents piling up in the same four years
// as this would make searching in those years disproportionately useless
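The jitter mentioned in the comment above is cut off by the hunk boundary; conceptually it amounts to spreading guesses around the format-based blind guess, roughly as in this sketch. The spread and the RNG choice here are assumptions, not the actual implementation.

// Hypothetical sketch only: jitter a blind year guess so that undated documents
// do not all pile up on the exact same year. The real guessYear() may differ.
static int guessYearSketch(DocumentFormat format) {
    int base = PubDateFromHtmlStandard.blindGuess(format);
    int jitter = java.util.concurrent.ThreadLocalRandom.current().nextInt(-2, 3); // assumed +/- 2 year spread
    return base + jitter;
}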
@@ -2,9 +2,9 @@ package nu.marginalia.converting.processor.pubdate;

import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.heuristic.*;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.ArrayList;
@@ -38,7 +38,7 @@ public class PubDateSniffer {
heuristics.add(new PubDateHeuristicGuessFromHtmlStandard());
}

public PubDate getPubDate(DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard, boolean runExpensive) {
public PubDate getPubDate(DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard, boolean runExpensive) {
final PubDateEffortLevel effortLevel = runExpensive ? PubDateEffortLevel.HIGH : PubDateEffortLevel.LOW;

for (var heuristic : heuristics) {
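The loop above is truncated by the hunk boundary; presumably it walks the heuristics in registration order and stops at the first hit, roughly like the following sketch. The empty-date fallback value is an assumption.

// Sketch of the assumed heuristic cascade behind PubDateSniffer.getPubDate():
// return the first date any heuristic can produce, otherwise an empty PubDate.
for (var heuristic : heuristics) {
    Optional<PubDate> maybeDate = heuristic.apply(effortLevel, headers, url, document, htmlStandard);
    if (maybeDate.isPresent()) {
        return maybeDate.get();
    }
}
return new PubDate(); // assumed "no date found" fallback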
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jetbrains.annotations.NotNull;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@@ -19,7 +19,7 @@ import java.util.Optional;
public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {

@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
if (effortLevel == PubDateEffortLevel.LOW)
return Optional.empty();

@@ -33,9 +33,9 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {

private static class DateExtractingNodeVisitorPass implements NodeFilter {
public PubDate pubDate;
private final HtmlStandard htmlStandard;
private final DocumentFormat htmlStandard;

private DateExtractingNodeVisitorPass(HtmlStandard htmlStandard) {
private DateExtractingNodeVisitorPass(DocumentFormat htmlStandard) {
this.htmlStandard = htmlStandard;
}

@@ -135,7 +135,7 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
}

private void parse(String text) {
if (htmlStandard == HtmlStandard.UNKNOWN) {
if (htmlStandard == DocumentFormat.UNKNOWN) {
PubDateParser
.dateFromHighestYearLookingSubstring(text)
.ifPresent(this::setPubDate);
@@ -5,9 +5,9 @@ import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateFromHtmlStandard;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jetbrains.annotations.NotNull;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
@@ -19,7 +19,7 @@ import java.util.Optional;
public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {

@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
if (effortLevel == PubDateEffortLevel.LOW)
return Optional.empty();

@@ -33,9 +33,9 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {

private static class DateExtractingNodeVisitor implements NodeFilter {
public PubDate pubDate;
private final HtmlStandard htmlStandard;
private final DocumentFormat htmlStandard;

private DateExtractingNodeVisitor(HtmlStandard htmlStandard) {
private DateExtractingNodeVisitor(DocumentFormat htmlStandard) {
this.htmlStandard = htmlStandard;
}

@@ -73,7 +73,7 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
}

private void parse(String text) {
if (htmlStandard == HtmlStandard.UNKNOWN) {
if (htmlStandard == DocumentFormat.UNKNOWN) {
PubDateParser
.dateFromHighestYearLookingSubstring(text)
.ifPresent(this::setPubDate);
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -14,8 +14,8 @@ import java.util.Optional;
public class PubDateHeuristicGuessFromHtmlStandard implements PubDateHeuristic {

@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
if (htmlStandard == HtmlStandard.UNKNOWN)
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
if (htmlStandard == DocumentFormat.UNKNOWN)
return Optional.empty();

return Optional.of(new PubDate(null, PubDateParser.guessYear(htmlStandard)));
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
public class PubDateHeuristicHtml5AnyTimeTag implements PubDateHeuristic {

@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
// HTML5, alternative approach
for (var tag : document.select("time")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
public class PubDateHeuristicHtml5ArticleDateTag implements PubDateHeuristic {

@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
// HTML5
for (var tag : document.select("time[pubdate=\"pubdate\"]")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
public class PubDateHeuristicHtml5ItempropDateTag implements PubDateHeuristic {

@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
for (var tag : document.select("time[itemprop=\"datePublished\"]")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
if (maybeDate.isPresent()) {
@@ -8,9 +8,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Collections;
@@ -21,7 +21,7 @@ import java.util.Optional;
public class PubDateHeuristicJSONLD implements PubDateHeuristic {

@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
for (var tag : document.select("script[type=\"application/ld+json\"]")) {
var maybeDate = parseLdJson(tag.data())
.flatMap(PubDateParser::attemptParseDate);
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.List;
@@ -15,7 +15,7 @@ import java.util.Optional;
public class PubDateHeuristicLastModified implements PubDateHeuristic {

@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
List<String> lastModified = headers.get("last-modified");
if (lastModified.isEmpty())
return Optional.empty();
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
public class PubDateHeuristicMicrodata implements PubDateHeuristic {

@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {

for (var tag : document.select("meta[itemprop=\"datePublished\"]")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
public class PubDateHeuristicOpenGraph implements PubDateHeuristic {

@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
// OG
for (var tag : document.select("meta[property=\"article:published_time\"]")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
public class PubDateHeuristicRDFaTag implements PubDateHeuristic {

@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
for (var tag : document.select("meta[property=\"datePublished\"]")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
if (maybeDate.isPresent()) {
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -21,7 +21,7 @@ public class PubDateHeuristicUrlPatternPass1 implements PubDateHeuristic {
private static final int MIN_URL_PATTERN_YEAR = 2000;

@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
final String urlString = url.path;

var matcher = yearUrlPattern.matcher(urlString);
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -19,7 +19,7 @@ public class PubDateHeuristicUrlPatternPass2 implements PubDateHeuristic {

@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url,
Document document, HtmlStandard htmlStandard) {
Document document, DocumentFormat htmlStandard) {
final String urlString = url.path;

var matcher = yearUrlPattern.matcher(urlString);
@@ -8,12 +8,12 @@ import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.processor.DocumentClass;
import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.crawl.UrlIndexingState;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags;
@@ -83,7 +83,7 @@ public class SideloaderProcessing {
// that we can't get from the sideloaded data since it's
// so stripped down

ret.details.standard = HtmlStandard.HTML5;
ret.details.format = DocumentFormat.HTML5;
ret.details.pubYear = pubYear;
ret.details.features.add(HtmlFeature.JS);
ret.details.features.add(HtmlFeature.TRACKING);
@@ -9,13 +9,13 @@ import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.crawl.UrlIndexingState;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags;
@@ -165,7 +165,7 @@ public class StackexchangeSideloader implements SideloadSource {
ret.details.description = StringUtils.truncate(doc.body().text(), 255);
ret.details.length = 128;

ret.details.standard = HtmlStandard.HTML5;
ret.details.format = DocumentFormat.HTML5;
ret.details.linksExternal = List.of();
ret.details.linksInternal = List.of();
ret.state = UrlIndexingState.OK;

@@ -124,7 +124,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
document.details.title,
document.details.description,
HtmlFeature.encode(document.details.features),
document.details.standard.name(),
document.details.format.name(),
document.details.length,
document.details.hashCode,
(float) document.details.quality,
File diff suppressed because it is too large
@@ -6,6 +6,7 @@ import com.google.inject.Injector;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.crawl.PubDate;
@@ -13,7 +14,6 @@ import nu.marginalia.model.crawl.UrlIndexingState;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.crawldata.CrawledDomain;
import nu.marginalia.model.crawldata.SerializableCrawlData;
import nu.marginalia.model.html.HtmlStandard;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
@@ -91,7 +91,7 @@ public class ConvertingIntegrationTest {

assertTrue(details.title.length() > 4);
assertTrue(details.description.length() > 4);
assertEquals(HtmlStandard.HTML5, details.standard);
assertEquals(DocumentFormat.HTML5, details.format);

}
}
@@ -125,7 +125,7 @@ public class ConvertingIntegrationTest {
assertTrue(details.metadata.size() > 0);
assertTrue(details.title.length() > 4);
assertTrue(details.description.length() > 4);
assertEquals(HtmlStandard.HTML5, details.standard);
assertEquals(DocumentFormat.HTML5, details.format);
}
}
@@ -0,0 +1,95 @@
package nu.marginalia.converting.processor.plugin;

import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.DocumentClass;
import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
import nu.marginalia.converting.processor.logic.TitleExtractor;
import nu.marginalia.converting.processor.plugin.specialization.DefaultSpecialization;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.converting.processor.summary.heuristic.*;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.language.filter.LanguageFilter;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Instant;

@Tag("flaky")
class PdfDocumentProcessorPluginTest {
static PdfDocumentProcessorPlugin plugin;

@BeforeAll
static void setUpBeforeClass() throws Exception {
var lm = WmsaHome.getLanguageModels();
plugin = new PdfDocumentProcessorPlugin(255,
new LanguageFilter(lm),
new ThreadLocalSentenceExtractorProvider(lm),
new DocumentKeywordExtractor(new TermFrequencyDict(lm)),
new DocumentLengthLogic(100),
new DefaultSpecialization(new SummaryExtractor(
255,
new DomFilterHeuristic(255),
new TagDensityHeuristic(255),
new OpenGraphDescriptionHeuristic(),
new MetaDescriptionHeuristic(),
new FallbackHeuristic()
),
new TitleExtractor(255)
));
}
public AbstractDocumentProcessorPlugin.DetailsWithWords testPdfFile(byte[] pdfBytes) throws Exception {
var doc = new CrawledDocument("test", "https://www.example.com/sample.pdf", "application/pdf", Instant.now().toString(), 200, "OK", "OK", "", pdfBytes, false, null, null);
return plugin.createDetails(doc, new LinkTexts(), DocumentClass.NORMAL);
}

public AbstractDocumentProcessorPlugin.DetailsWithWords testPdfFile(Path file) throws Exception {
return testPdfFile(Files.readAllBytes(file));
}

private byte[] downloadPDF(String url) throws IOException, URISyntaxException {
HttpURLConnection conn = (HttpURLConnection) new URI(url).toURL().openConnection();
try {
return conn.getInputStream().readAllBytes();
} catch (IOException e) {
throw new RuntimeException(e);
} finally {
conn.disconnect();
}
}

@Disabled
@Test
void testingTool() throws Exception {
System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample.pdf")).details().title);
System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample2.pdf")).details().title);
System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample3.pdf")).details().title);
System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample4.pdf")).details().title);
System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample5.pdf")).details().title);
System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample6.pdf")).details().title);
}

@Disabled
@Test
void testingTool2() throws Exception {
System.out.println(plugin.convertPdfToHtml(Files.readAllBytes(Path.of("/home/st_work/Work/sample6.pdf"))));
}

@Test
void testMarginaliaSample() throws Exception {
var doc = plugin.convertPdfToHtml(downloadPDF("https://www.marginalia.nu/junk/test.pdf"));
System.out.println(doc.html());
}
}
@@ -3,8 +3,8 @@ package nu.marginalia.converting.processor.pubdate;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.heuristic.PubDateHeuristicDOMParsingPass2;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Test;

@@ -74,7 +74,7 @@ class PubDateSnifferTest {
<time pubdate="pubdate" datetime="2022-08-24">time</time>
Wow, sure lor 'em boss
</article>
"""), HtmlStandard.UNKNOWN, true);
"""), DocumentFormat.UNKNOWN, true);

assertFalse(ret.isEmpty());
assertEquals("2022-08-24", ret.dateIso8601());
@@ -90,7 +90,7 @@ class PubDateSnifferTest {
<time>2022-08-24</time>
Wow, sure lor 'em boss
</article>
"""), HtmlStandard.UNKNOWN, true);
"""), DocumentFormat.UNKNOWN, true);

assertFalse(ret.isEmpty());
assertEquals("2022-08-24", ret.dateIso8601());
@@ -106,7 +106,7 @@ class PubDateSnifferTest {
<time class="published" datetime="July 13, 2006">July 13, 2006</time>
Wow, sure lor 'em boss
</article>
"""), HtmlStandard.UNKNOWN, true);
"""), DocumentFormat.UNKNOWN, true);

assertFalse(ret.isEmpty());
assertEquals(2006, ret.year());
@@ -116,14 +116,14 @@ class PubDateSnifferTest {
public void testProblemCases() throws IOException, URISyntaxException {
var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
new EdgeUrl("https://www.example.com/"),
Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/The Switch to Linux Begins .html"))), HtmlStandard.HTML5, true);
Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/The Switch to Linux Begins .html"))), DocumentFormat.HTML5, true);

assertFalse(ret.isEmpty());
assertEquals(2006, ret.year());

ret = dateSniffer.getPubDate(new DocumentHeaders(""),
new EdgeUrl("https://www.example.com/"),
Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), HtmlStandard.XHTML, true);
Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), DocumentFormat.XHTML, true);

assertFalse(ret.isEmpty());
assertEquals(2010, ret.year());
@@ -146,7 +146,7 @@ class PubDateSnifferTest {
<!doctype html>
<html>
<meta itemprop="datePublished" content="2022-08-24" />
"""), HtmlStandard.UNKNOWN, true);
"""), DocumentFormat.UNKNOWN, true);

assertFalse(ret.isEmpty());
assertEquals("2022-08-24", ret.dateIso8601());
@@ -160,7 +160,7 @@ class PubDateSnifferTest {
<!doctype html>
<html>
<meta property="datePublished" content="2022-08-24" />
"""), HtmlStandard.UNKNOWN, true);
"""), DocumentFormat.UNKNOWN, true);

assertFalse(ret.isEmpty());
assertEquals("2022-08-24", ret.dateIso8601());
@@ -174,7 +174,7 @@ class PubDateSnifferTest {
<!doctype html>
<html>
<script type="application/ld+json">{"@context":"https:\\/\\/schema.org","@type":"Article","name":"In the Year 2525","url":"https:\\/\\/en.wikipedia.org\\/wiki\\/In_the_Year_2525","sameAs":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","mainEntity":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2004-08-24T14:39:14Z","dateModified":"2022-10-20T11:54:37Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/4\\/4a\\/In_the_Year_2525_by_Zager_and_Evans_US_vinyl_Side-A_RCA_release.png","headline":"song written and compsoed by Rick Evans, originally recorded by Zager and Evans and released in 1969"}</script><script type="application/ld+json">{"@context":"https:\\/\\/schema.org","@type":"Article","name":"In the Year 2525","url":"https:\\/\\/en.wikipedia.org\\/wiki\\/In_the_Year_2525","sameAs":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","mainEntity":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2004-08-24T14:39:14Z","dateModified":"2022-10-20T11:54:37Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/4\\/4a\\/In_the_Year_2525_by_Zager_and_Evans_US_vinyl_Side-A_RCA_release.png","headline":"song written and compsoed by Rick Evans, originally recorded by Zager and Evans and released in 1969"}</script>
"""), HtmlStandard.UNKNOWN, true);
"""), DocumentFormat.UNKNOWN, true);

assertFalse(ret.isEmpty());
assertEquals("2004-08-24", ret.dateIso8601());
@@ -188,7 +188,7 @@ class PubDateSnifferTest {
<!doctype html>
<html>
<script type="application/ld+json" class="aioseop-schema">{"@context":"https://schema.org","@graph":[{"@type":"Organization","@id":"https://socialnomics.net/#organization","url":"https://socialnomics.net/","name":"Socialnomics","sameAs":[]},{"@type":"WebSite","@id":"https://socialnomics.net/#website","url":"https://socialnomics.net/","name":"Socialnomics","publisher":{"@id":"https://socialnomics.net/#organization"}},{"@type":"WebPage","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage","url":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","inLanguage":"en-US","name":"3 Reasons Why You Should Adopt Java-based Technology For Your Business","isPartOf":{"@id":"https://socialnomics.net/#website"},"breadcrumb":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#breadcrumblist"},"datePublished":"2016-12-27T21:01:36-06:00","dateModified":"2016-12-22T21:02:32-06:00"},{"@type":"Article","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#article","isPartOf":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage"},"author":{"@id":"https://socialnomics.net/author/rahis-saifi/#author"},"headline":"3 Reasons Why You Should Adopt Java-based Technology For Your Business","datePublished":"2016-12-27T21:01:36-06:00","dateModified":"2016-12-22T21:02:32-06:00","commentCount":0,"mainEntityOfPage":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage"},"publisher":{"@id":"https://socialnomics.net/#organization"},"articleSection":"Business, business, java, Java Developers, programming languages"},{"@type":"Person","@id":"https://socialnomics.net/author/rahis-saifi/#author","name":"Rahis Saifi","sameAs":["https://www.facebook.com/RahisSaifiOfficial","https://www.twitter.com/57rahis"],"image":{"@type":"ImageObject","@id":"https://socialnomics.net/#personlogo","url":"https://secure.gravatar.com/avatar/e67f630f0b8bc87e59e111d5e955961d?s=96&d=mm&r=g","width":96,"height":96,"caption":"Rahis Saifi"}},{"@type":"BreadcrumbList","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#breadcrumblist","itemListElement":[{"@type":"ListItem","position":1,"item":{"@type":"WebPage","@id":"https://socialnomics.net/","url":"https://socialnomics.net/","name":"Socialnomics Blog"}},{"@type":"ListItem","position":2,"item":{"@type":"WebPage","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","url":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","name":"3 Reasons Why You Should Adopt Java-based Technology For Your Business"}}]}]}</script>
"""), HtmlStandard.UNKNOWN, true);
"""), DocumentFormat.UNKNOWN, true);

assertFalse(ret.isEmpty());
assertEquals("2016-12-27", ret.dateIso8601());
@@ -202,7 +202,7 @@ class PubDateSnifferTest {
<!doctype html>
<html>
<title>No date in the HTML</title>
"""), HtmlStandard.UNKNOWN, true);
"""), DocumentFormat.UNKNOWN, true);

assertFalse(ret.isEmpty());
assertNull(ret.dateIso8601());
@@ -217,7 +217,7 @@ class PubDateSnifferTest {
<!doctype html>
<html>
<title>No date in the HTML</title>
"""), HtmlStandard.UNKNOWN, true);
"""), DocumentFormat.UNKNOWN, true);

assertFalse(ret.isEmpty());
assertEquals("2022-02-03", ret.dateIso8601());
@@ -232,7 +232,7 @@ class PubDateSnifferTest {
<!doctype html>
<html>
<p>Published 2003, updated 2022</p>
"""), HtmlStandard.HTML5, true);
"""), DocumentFormat.HTML5, true);

assertFalse(ret.isEmpty());
assertNull(ret.dateIso8601());
@@ -258,7 +258,7 @@ class PubDateSnifferTest {
<!doctype html>
<html>
<div style="float: left;"> <b>Post subject:</b> Keyboards.</div><div style="float: right;"><span class="postdetails"><b><img src="./styles/subsilver2/imageset/icon_post_target.gif" width="12" height="9" alt="Post" title="Post" /> <a href="./viewtopic.php?p=34580&sid=cf0c13dedebb4fea1f03fa73e510cd9f#p34580">#1</a></b></span> <b>Posted:</b> Sun Oct 03, 2010 5:37 pm </div>
"""), HtmlStandard.UNKNOWN, true);
"""), DocumentFormat.UNKNOWN, true);

assertFalse(ret.isEmpty());
assertNull(ret.dateIso8601());
@@ -67,8 +67,6 @@ dependencies {
testImplementation libs.mockito
testImplementation libs.wiremock

testImplementation project(':code:processes:test-data')
}
@@ -66,6 +66,7 @@ public class CrawlerMain extends ProcessMainClass {
private final DomainLocks domainLocks = new DomainLocks();

private final Map<String, CrawlTask> pendingCrawlTasks = new ConcurrentHashMap<>();

private final LinkedBlockingQueue<CrawlTask> retryQueue = new LinkedBlockingQueue<>();

private final AtomicInteger tasksDone = new AtomicInteger(0);
@@ -263,17 +264,16 @@ public class CrawlerMain extends ProcessMainClass {
if (workLog.isJobFinished(crawlSpec.domain))
continue;

var task = new CrawlTask(
crawlSpec,
anchorTagsSource,
outputDir,
warcArchiver,
domainStateDb,
workLog);
var task = new CrawlTask(crawlSpec, anchorTagsSource, outputDir, warcArchiver, domainStateDb, workLog);

// Try to run immediately, to avoid unnecessarily keeping the entire work set in RAM
if (!trySubmitDeferredTask(task)) {
// Otherwise add to the taskList for deferred execution

// Drain the retry queue to the taskList, and try to submit any tasks that are in the retry queue
retryQueue.drainTo(taskList);
taskList.removeIf(this::trySubmitDeferredTask);

// Then add this new task to the retry queue
taskList.add(task);
}
}
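trySubmitDeferredTask itself is outside this hunk; judging from its call sites here and in the scheduling loop below, it plausibly behaves like the following sketch. The executor field and its submission method are assumed names, not part of this changeset.

// Hypothetical sketch of trySubmitDeferredTask as implied by the call sites:
// only hand the task to the executor when its domain lock looks obtainable.
private boolean trySubmitDeferredTask(CrawlTask task) {
    if (!task.canRun()) {
        return false; // domain lock likely contended; keep the task deferred
    }
    try {
        pool.submitQuietly(task); // "pool" and submitQuietly(...) are assumed names
        return true;
    } catch (java.util.concurrent.RejectedExecutionException e) {
        return false;
    }
}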
@@ -289,10 +289,13 @@ public class CrawlerMain extends ProcessMainClass {

if (hasTasks || hasRetryTasks || hasRunningTasks) {
retryQueue.drainTo(taskList);

// Try to submit any tasks that are in the retry queue (this will block if the pool is full)
taskList.removeIf(this::trySubmitDeferredTask);

// Add a small pause here to avoid busy looping toward the end of the execution cycle when
// we might have no new viable tasks to run for hours on end
TimeUnit.MILLISECONDS.sleep(50);
TimeUnit.MILLISECONDS.sleep(5);
} else {
// We have no tasks to run, and no tasks in the retry queue
// but we wait a bit to see if any new tasks come in via the retry queue
@@ -430,7 +433,7 @@ public class CrawlerMain extends ProcessMainClass {
/** Best effort indicator whether we could start this now without getting stuck in
* DomainLocks purgatory */
public boolean canRun() {
return domainLocks.canLock(new EdgeDomain(domain));
return domainLocks.isLockableHint(new EdgeDomain(domain));
}

@Override
@@ -445,12 +448,15 @@ public class CrawlerMain extends ProcessMainClass {
// We don't have a lock, so we can't run this task
// we return to avoid blocking the pool for too long
if (lock.isEmpty()) {
retryQueue.add(this);
pendingCrawlTasks.remove(domain);
retryQueue.put(this);
return;
}
DomainLocks.DomainLock domainLock = lock.get();

try (domainLock) {
Thread.currentThread().setName("crawling:" + domain);

Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
@@ -482,7 +488,7 @@ public class CrawlerMain extends ProcessMainClass {
// (mostly a case when migrating from legacy->warc)
reference.delete();

// Convert the WARC file to Parquet
// Convert the WARC file to Slop
SlopCrawlDataRecord
.convertWarc(domain, userAgent, newWarcFile, slopFile);
@@ -19,11 +19,13 @@ public record ContentTags(String etag, String lastMod) {
/** Paints the tags onto the request builder. */
public void paint(HttpGet request) {

// Paint the ETag header if present,
// otherwise paint the Last-Modified header
// (but not both at the same time due to some servers not liking it)

if (etag != null) {
request.addHeader("If-None-Match", etag);
}

if (lastMod != null) {
} else if (lastMod != null) {
request.addHeader("If-Modified-Since", lastMod);
}
}
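With the change above, a revisit where both validators are on record sends only the ETag. A small illustration of the resulting request follows; the header values are made up.

// Illustrative only: when both an ETag and a Last-Modified value are known,
// the new paint() emits a single conditional header rather than both.
HttpGet request = new HttpGet("https://www.example.com/page");
new ContentTags("\"abc123\"", "Wed, 01 Jan 2025 00:00:00 GMT").paint(request);
// Resulting header: If-None-Match: "abc123"
// If-Modified-Since would only be sent if no ETag were available.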
@@ -51,8 +51,10 @@ import javax.net.ssl.SSLException;
import java.io.IOException;
import java.net.SocketTimeoutException;
import java.net.URISyntaxException;
import java.net.UnknownHostException;
import java.security.NoSuchAlgorithmException;
import java.time.Duration;
import java.time.Instant;
import java.util.*;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
@@ -393,25 +395,31 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
if (probeType == HttpFetcher.ProbeType.FULL) {
try {
var probeResult = probeContentType(url, cookies, timer, contentTags);
logger.info(crawlerAuditMarker, "Probe result {} for {}", probeResult.getClass().getSimpleName(), url);

switch (probeResult) {
case HttpFetcher.ContentTypeProbeResult.NoOp():
break; //
case HttpFetcher.ContentTypeProbeResult.Ok(EdgeUrl resolvedUrl):
logger.info(crawlerAuditMarker, "Probe result OK for {}", url);
url = resolvedUrl; // If we were redirected while probing, use the final URL for fetching
break;
case ContentTypeProbeResult.BadContentType badContentType:
warcRecorder.flagAsFailedContentTypeProbe(url, badContentType.contentType(), badContentType.statusCode());
logger.info(crawlerAuditMarker, "Probe result Bad ContenType ({}) for {}", badContentType.contentType(), url);
return new HttpFetchResult.ResultNone();
case ContentTypeProbeResult.BadContentType.Timeout(Exception ex):
logger.info(crawlerAuditMarker, "Probe result Timeout for {}", url);
warcRecorder.flagAsTimeout(url);
return new HttpFetchResult.ResultException(ex);
case ContentTypeProbeResult.Exception(Exception ex):
logger.info(crawlerAuditMarker, "Probe result Exception({}) for {}", ex.getClass().getSimpleName(), url);
warcRecorder.flagAsError(url, ex);
return new HttpFetchResult.ResultException(ex);
case ContentTypeProbeResult.HttpError httpError:
logger.info(crawlerAuditMarker, "Probe result HTTP Error ({}) for {}", httpError.statusCode(), url);
return new HttpFetchResult.ResultException(new HttpException("HTTP status code " + httpError.statusCode() + ": " + httpError.message()));
case ContentTypeProbeResult.Redirect redirect:
logger.info(crawlerAuditMarker, "Probe result redirect for {} -> {}", url, redirect.location());
return new HttpFetchResult.ResultRedirect(redirect.location());
}
} catch (Exception ex) {
@@ -430,27 +438,32 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
contentTags.paint(request);

try (var sl = new SendLock()) {
Instant start = Instant.now();
HttpFetchResult result = warcRecorder.fetch(client, cookies, request);

Duration fetchDuration = Duration.between(start, Instant.now());

if (result instanceof HttpFetchResult.ResultOk ok) {
if (ok.statusCode() == 304) {
return new HttpFetchResult.Result304Raw();
result = new HttpFetchResult.Result304Raw();
}
}

switch (result) {
case HttpFetchResult.ResultOk ok -> logger.info(crawlerAuditMarker, "Fetch result OK {} for {}", ok.statusCode(), url);
case HttpFetchResult.ResultOk ok -> logger.info(crawlerAuditMarker, "Fetch result OK {} for {} ({} ms)", ok.statusCode(), url, fetchDuration.toMillis());
case HttpFetchResult.ResultRedirect redirect -> logger.info(crawlerAuditMarker, "Fetch result redirect: {} for {}", redirect.url(), url);
case HttpFetchResult.ResultNone none -> logger.info(crawlerAuditMarker, "Fetch result none for {}", url);
case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception for " + url + ": {}", ex.ex());
case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception for {}", url, ex.ex());
case HttpFetchResult.Result304Raw raw -> logger.info(crawlerAuditMarker, "Fetch result: 304 Raw for {}", url);
case HttpFetchResult.Result304ReplacedWithReference ref -> logger.info(crawlerAuditMarker, "Fetch result: 304 With reference for {}", url);
}

return result;
}
}
catch (Exception ex) {
ex.printStackTrace();
logger.error(crawlerAuditMarker, "Fetch result exception for {}", url, ex);

return new HttpFetchResult.ResultException(ex);
}

@@ -623,14 +636,12 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {

@Override
public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
if (exception instanceof SocketTimeoutException) { // Timeouts are not recoverable
return false;
}
if (exception instanceof SSLException) { // SSL exceptions are unlikely to be recoverable
return false;
}

return executionCount <= 3;
return switch (exception) {
case SocketTimeoutException ste -> false;
case SSLException ssle -> false;
case UnknownHostException uhe -> false;
default -> executionCount <= 3;
};
}

@Override
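Note: the rewritten retryRequest collapses the per-exception checks into a type switch and adds UnknownHostException to the non-retryable set. A standalone sketch of just that decision (not the full HttpRequestRetryStrategy interface):

import javax.net.ssl.SSLException;
import java.io.IOException;
import java.net.SocketTimeoutException;
import java.net.UnknownHostException;

class RetryDecisionSketch {
    /** True if a failed request is worth retrying. */
    static boolean shouldRetry(IOException exception, int executionCount) {
        return switch (exception) {
            case SocketTimeoutException ste -> false; // timeouts are unlikely to recover
            case SSLException ssle -> false;          // broken TLS stays broken
            case UnknownHostException uhe -> false;   // DNS failures stay failed
            default -> executionCount <= 3;           // otherwise allow a few attempts
        };
    }
}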
@@ -57,6 +57,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
return new ErrorBuffer();
}

Instant start = Instant.now();
InputStream is = null;
try {
is = entity.getContent();
@@ -71,8 +72,25 @@ public abstract class WarcInputBuffer implements AutoCloseable {
}
}
finally {
// We're required to consume the stream to avoid leaking connections,
// but we also don't want to get stuck on slow or malicious connections
// forever, so we set a time limit on this phase and call abort() if it's exceeded.
try {
is.skip(Long.MAX_VALUE);
while (is != null) {
// Consume some data
if (is.skip(65536) == 0) {
// Note that skip may return 0 if the stream is empty
// or for other unspecified reasons, so we need to check
// with read() as well to determine if the stream is done
if (is.read() == -1)
is = null;
}
// Check if the time limit has been exceeded
else if (Duration.between(start, Instant.now()).compareTo(timeLimit) > 0) {
request.abort();
is = null;
}
}
}
catch (IOException e) {
// Ignore the exception
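Note: the finally-block above replaces an unbounded skip(Long.MAX_VALUE) with an incremental drain that gives up, and aborts the request, once a time budget is exceeded. A self-contained sketch of the same idea with the abort hook abstracted away:

import java.io.IOException;
import java.io.InputStream;
import java.time.Duration;
import java.time.Instant;

class BoundedDrainSketch {
    /** Consume the remainder of a stream, but only for as long as the time budget allows. */
    static void drainWithTimeLimit(InputStream is, Instant start, Duration timeLimit, Runnable abort) {
        try {
            while (is != null) {
                if (is.skip(65536) == 0) {
                    // skip() may legally return 0; fall back to read() to detect end of stream
                    if (is.read() == -1)
                        is = null;
                }
                else if (Duration.between(start, Instant.now()).compareTo(timeLimit) > 0) {
                    abort.run(); // e.g. request.abort() in the crawler
                    is = null;
                }
            }
        }
        catch (IOException e) {
            // best effort: ignore and let the caller close the connection
        }
    }
}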
@@ -41,7 +41,7 @@ public class WarcRecorder implements AutoCloseable {
static final int MAX_TIME = 30_000;

/** Maximum (decompressed) size we'll save */
static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 10 * 1024 * 1024);
static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 32 * 1024 * 1024);

private final WarcWriter writer;
private final Path warcFile;
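Note: MAX_SIZE is read through Integer.getInteger, so the new 32 MiB default can still be overridden per process with -Dcrawler.maxFetchSize=<bytes>. The lookup is equivalent to:

// Returns the value of -Dcrawler.maxFetchSize=... if set, otherwise 32 MiB
int maxFetchSize = Integer.getInteger("crawler.maxFetchSize", 32 * 1024 * 1024);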
@@ -20,16 +20,17 @@ public class DomainLocks {
* and may be held by another thread. The caller is responsible for locking and releasing the lock.
*/
public DomainLock lockDomain(EdgeDomain domain) throws InterruptedException {
var ret = new DomainLock(domain.toString(),
locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits));
ret.lock();
return ret;
var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);

sem.acquire();

return new DomainLock(sem);
}

public Optional<DomainLock> tryLockDomain(EdgeDomain domain) {
var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
if (sem.tryAcquire(1)) {
return Optional.of(new DomainLock(domain.toString(), sem));
return Optional.of(new DomainLock(sem));
}
else {
// We don't have a lock, so we return an empty optional
@@ -42,23 +43,27 @@ public class DomainLocks {
return new Semaphore(16);
if (topDomain.equals("blogspot.com"))
return new Semaphore(8);

if (topDomain.equals("tumblr.com"))
return new Semaphore(8);
if (topDomain.equals("neocities.org"))
return new Semaphore(4);
return new Semaphore(8);
if (topDomain.equals("github.io"))
return new Semaphore(4);
return new Semaphore(8);

// Substack really dislikes broad-scale crawlers, so we need to be careful
// to not get blocked.
if (topDomain.equals("substack.com")) {
return new Semaphore(1);
}
if (topDomain.endsWith(".edu")) {
return new Semaphore(1);
}

return new Semaphore(2);
}

public boolean canLock(EdgeDomain domain) {
/** Returns true if the domain is lockable, i.e. if it is not already locked by another thread.
* (this is just a hint, and does not guarantee that the domain is actually lockable any time
* after this method returns true)
*/
public boolean isLockableHint(EdgeDomain domain) {
Semaphore sem = locks.get(domain.topDomain.toLowerCase());
if (null == sem)
return true;
@@ -67,25 +72,16 @@ public class DomainLocks {
}

public static class DomainLock implements AutoCloseable {
private final String domainName;
private final Semaphore semaphore;

DomainLock(String domainName, Semaphore semaphore) {
this.domainName = domainName;
DomainLock(Semaphore semaphore) {
this.semaphore = semaphore;
}

// This method is called to lock the domain. It will block until the lock is available.
private void lock() throws InterruptedException {
Thread.currentThread().setName("crawling:" + domainName + " [await domain lock]");
semaphore.acquire();
Thread.currentThread().setName("crawling:" + domainName);
}

@Override
public void close() throws Exception {
semaphore.release();
Thread.currentThread().setName("crawling:" + domainName + " [wrapping up]");
Thread.currentThread().setName("[idle]");
}
}
}
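Note: DomainLock is now a thin AutoCloseable wrapper around a Semaphore permit; acquisition happens in lockDomain/tryLockDomain and thread naming has moved to the caller. A hedged usage sketch (DomainLocks and EdgeDomain as in this diff, the crawl work itself is a placeholder):

import java.util.Optional;

class DomainLockUsageSketch {
    void crawlBlocking(DomainLocks domainLocks, EdgeDomain domain) throws Exception {
        // Blocking variant: waits for a permit, releases it when the try-block exits
        try (DomainLocks.DomainLock lock = domainLocks.lockDomain(domain)) {
            Thread.currentThread().setName("crawling:" + domain);
            // ... fetch documents for this domain ...
        }
    }

    boolean crawlIfIdle(DomainLocks domainLocks, EdgeDomain domain) throws Exception {
        // Non-blocking variant: back off if the domain is saturated
        Optional<DomainLocks.DomainLock> maybeLock = domainLocks.tryLockDomain(domain);
        if (maybeLock.isEmpty())
            return false; // caller can park the task on a retry queue instead
        try (DomainLocks.DomainLock lock = maybeLock.get()) {
            // ... crawl ...
            return true;
        }
    }
}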
@@ -74,7 +74,7 @@ public class CrawlerRevisitor {

// If the reference document is empty or the HTTP status is not 200, we'll skip it since it's
// unlikely to produce anything meaningful for us.
if (doc.httpStatus != 200)
if (doc.httpStatus != 200 && doc.httpStatus != 206)
continue;
if (!doc.hasBody())
continue;
@@ -58,7 +58,7 @@ public record DocumentWithReference(
if (null == doc)
return ContentTags.empty();

if (doc.documentBodyBytes.length == 0 || doc.httpStatus != 200)
if (doc.documentBodyBytes.length == 0 || (doc.httpStatus != 200 && doc.httpStatus != 206))
return ContentTags.empty();

String lastmod = doc.getLastModified();
@@ -1,22 +1,32 @@
package nu.marginalia;

import org.apache.commons.lang3.StringUtils;

import java.util.Set;

public class ContentTypes {
public static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
"application/xhtml",
"text/html",
"text/markdown",
"text/x-markdown",
"application/pdf",
"image/x-icon",
"text/plain");

public static boolean isAccepted(String contentTypeHeader) {
String lcHeader = contentTypeHeader.toLowerCase();
String lcHeader = StringUtils.substringBefore(contentTypeHeader.toLowerCase(), ';');
for (var type : acceptedContentTypes) {
if (lcHeader.startsWith(type)) {
if (lcHeader.equals(type)) {
return true;
}
}
return false;
}

public static boolean isBinary(String contentTypeHeader) {
String lcHeader = StringUtils.substringBefore(contentTypeHeader.toLowerCase(), ';');
return lcHeader.startsWith("application/pdf");
}

}
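Note: matching changed from prefix matching on the raw header to exact matching on the media type with any parameters stripped, so a charset suffix no longer defeats the allow-list while near-matches no longer slip through. Expected behaviour, roughly:

class ContentTypesExamples {
    public static void main(String[] args) {
        // Parameters after ';' are ignored; the media type itself must match exactly
        System.out.println(ContentTypes.isAccepted("text/html; charset=UTF-8"));  // true
        System.out.println(ContentTypes.isAccepted("application/pdf"));           // true
        System.out.println(ContentTypes.isAccepted("text/html-sandboxed"));       // false: no longer a prefix match
        System.out.println(ContentTypes.isBinary("application/pdf; foo=bar"));    // true
    }
}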
@@ -37,8 +37,12 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
public boolean filter(String url, int status, String contentType) {
String ctLc = contentType.toLowerCase();

// Permit all plain text content types
if (ctLc.startsWith("text/"))
return true;
// PDF
else if (ctLc.startsWith("application/pdf"))
return true;
else if (ctLc.startsWith("x-marginalia/"))
return true;

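Note: this override widens the stream's filter to plain-text and PDF records (the x-marginalia/ prefix covers the crawler's synthetic metadata rows). A hypothetical sketch of a narrower reader built the same way, assuming the FilteringReader constructor and filter signature shown in this diff:

import java.io.IOException;
import java.nio.file.Path;

class PdfOnlyReaderSketch {
    static SlopCrawlDataRecord.FilteringReader open(Path crawlDataPath) throws IOException {
        // Keep only PDF documents plus the crawler's own metadata records
        return new SlopCrawlDataRecord.FilteringReader(crawlDataPath) {
            @Override
            public boolean filter(String url, int status, String contentType) {
                return contentType.startsWith("application/pdf")
                        || contentType.startsWith("x-marginalia/");
            }
        };
    }
}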
@@ -10,7 +10,7 @@ import java.util.regex.Pattern;

public class ContentTypeLogic {

private static final Predicate<String> probableHtmlPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md)$").asMatchPredicate();
private static final Predicate<String> probableGoodPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md|pdf)$").asMatchPredicate();
private static final Predicate<String> probableBinaryPattern = Pattern.compile("^.*\\.[a-z]+$").asMatchPredicate();
private static final Set<String> blockedContentTypes = Set.of("text/css", "text/javascript");
private static final List<String> acceptedContentTypePrefixes = List.of(
@@ -22,6 +22,7 @@ public class ContentTypeLogic {
"application/rss+xml",
"application/x-rss+xml",
"application/rdf+xml",
"application/pdf",
"x-rss+xml"
);
private boolean allowAllContentTypes = false;
@@ -34,7 +35,7 @@ public class ContentTypeLogic {
public boolean isUrlLikeBinary(EdgeUrl url) {
String pathLowerCase = url.path.toLowerCase();

if (probableHtmlPattern.test(pathLowerCase))
if (probableGoodPattern.test(pathLowerCase))
return false;

return probableBinaryPattern.test(pathLowerCase);
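Note: with the pattern renamed to probableGoodPattern and pdf added to it, URLs ending in .pdf are no longer pre-classified as binary on extension alone. A self-contained sketch of the same classification using the two regexes above:

import java.util.function.Predicate;
import java.util.regex.Pattern;

class PathClassificationSketch {
    // Same patterns as above, applied to a lower-cased URL path
    static final Predicate<String> probableGood =
            Pattern.compile("^.*\\.(htm|html|php|txt|md|pdf)$").asMatchPredicate();
    static final Predicate<String> probableBinary =
            Pattern.compile("^.*\\.[a-z]+$").asMatchPredicate();

    static boolean looksBinary(String pathLowerCase) {
        if (probableGood.test(pathLowerCase))
            return false;
        return probableBinary.test(pathLowerCase);
    }

    public static void main(String[] args) {
        System.out.println(looksBinary("/papers/report.pdf")); // false after this change, so the URL is fetched
        System.out.println(looksBinary("/media/clip.mp4"));    // true, still skipped
        System.out.println(looksBinary("/index"));             // false, no extension to judge by
    }
}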
@@ -158,11 +158,12 @@ public record SlopCrawlDataRecord(String domain,
// and is used to store old responses from previous crawls; in this part of the logic
// we treat them the same as a normal response

if (!filterResponse(uaString, response)) {
var filterStatus = filterResponse(uaString, response);
if (filterStatus.isRejected()) {
continue;
}

slopWriter.write(domain, response);
slopWriter.write(domain, filterStatus, response);
} else if (record instanceof WarcXEntityRefused refused) {
slopWriter.write(domain, refused);
} else if (record instanceof Warcinfo warcinfo) {
@@ -187,25 +188,35 @@ public record SlopCrawlDataRecord(String domain,
}
}


sealed interface ResponseFilterResult {
default boolean isRejected() { return false; }
record Accept() implements ResponseFilterResult {}
record AcceptWithContentType(String contentType) implements ResponseFilterResult {}
record AcceptIfPlainText(String contentType) implements ResponseFilterResult {}
record Reject() implements ResponseFilterResult {
@Override
public boolean isRejected() { return true; }
}
}

/** Return true if the WarcResponse should be excluded from conversion */
private static boolean filterResponse(String uaString, WarcResponse response) throws IOException {
private static ResponseFilterResult filterResponse(String uaString, WarcResponse response) throws IOException {

// We don't want to store robots.txt files, as they are not
// interesting for the analysis we want to do. This is important
// since txt-files in general are interesting, and we don't want to
// exclude them as a class.

if (response.targetURI().getPath().equals("/robots.txt")) {
return false;
String uriPath = response.targetURI().getPath();
if (uriPath.equals("/robots.txt")) {
return new ResponseFilterResult.Reject();
}

var headers = response.http().headers();
var robotsTags = headers.all("X-Robots-Tag");

if (!isXRobotsTagsPermitted(robotsTags, uaString)) {
return false;
return new ResponseFilterResult.Reject();
}

// Strip out responses with content types we aren't interested in
@@ -213,10 +224,29 @@ public record SlopCrawlDataRecord(String domain,
String contentType = headers.first("Content-Type").orElse("text/plain").toLowerCase();

if (!ContentTypes.isAccepted(contentType)) {
return false;
String contentTypeWithoutParams = StringUtils.substringBefore(contentType, ";");

// Some servers don't understand what a markdown file is
if (contentTypeWithoutParams.equals("application/octet-stream")) {
if (uriPath.endsWith(".md")) {
// This is a markdown file, which we want to keep
return new ResponseFilterResult.AcceptIfPlainText("text/markdown");
}
else if (uriPath.endsWith(".pdf")) {
// This is a text file, which we want to keep
return new ResponseFilterResult.AcceptWithContentType("application/pdf");
}
}

return true;
return new ResponseFilterResult.Reject();
}

// If the format is binary, we don't want to translate it if the response is truncated
if (response.truncated() != WarcTruncationReason.NOT_TRUNCATED && ContentTypes.isBinary(contentType)) {
return new ResponseFilterResult.Reject();
}

return new ResponseFilterResult.Accept();
}

/** Check X-Robots-Tag header tag to see if we are allowed to index this page.
@@ -272,7 +302,8 @@ public record SlopCrawlDataRecord(String domain,
try (var table = new SlopTable(path)) {
ShortColumn.Reader statusReader = statusColumn.open(table);
while (statusReader.hasRemaining()) {
if (statusReader.get() == 200) {
int status = statusReader.get();
if (status == 200 || status == 206) {
cnt++;
}
}
@@ -318,7 +349,7 @@ public record SlopCrawlDataRecord(String domain,
headerColumnWriter.put(record.headers);
}

public void write(String domain, WarcResponse response) throws IOException {
public void write(String domain, ResponseFilterResult filterStatus, WarcResponse response) throws IOException {

HttpFetchResult result = HttpFetchResult.importWarc(response);
if (!(result instanceof HttpFetchResult.ResultOk fetchOk)) {
@@ -341,6 +372,21 @@ public record SlopCrawlDataRecord(String domain,
contentType = "";
}

switch (filterStatus) {
case ResponseFilterResult.AcceptWithContentType(String ct) -> contentType = ct;
case ResponseFilterResult.AcceptIfPlainText(String ct) -> {
try {
// Parse the body as UTF-8
new String(bodyBytes, StandardCharsets.UTF_8);
contentType = ct;
}
catch (RuntimeException ex) { // UTF-8 decoding failed
return;
}
}
default -> {}
}

boolean hasCookies = false;

String headersStr;
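Note: the writer now consults the filter verdict before committing a record: AcceptWithContentType overrides the stored content type outright, while AcceptIfPlainText only does so when the body passes a UTF-8 check. A compact sketch of that dispatch against a sealed result type of my own naming (Verdict here, not the actual ResponseFilterResult), using a stricter CharsetDecoder-based check that really does fail on malformed input:

import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.StandardCharsets;

class FilterVerdictSketch {
    sealed interface Verdict {
        record Accept() implements Verdict {}
        record AcceptWithContentType(String contentType) implements Verdict {}
        record AcceptIfPlainText(String contentType) implements Verdict {}
        record Reject() implements Verdict {}
    }

    /** Returns the content type to store, or null if the record should be dropped. */
    static String resolveContentType(Verdict verdict, String original, byte[] bodyBytes) {
        return switch (verdict) {
            case Verdict.AcceptWithContentType(String ct) -> ct;
            case Verdict.AcceptIfPlainText(String ct) -> decodesAsUtf8(bodyBytes) ? ct : null;
            case Verdict.Accept a -> original;
            case Verdict.Reject r -> null;
        };
    }

    // Stricter than new String(bytes, UTF_8): actually throws on malformed input
    static boolean decodesAsUtf8(byte[] bodyBytes) {
        try {
            StandardCharsets.UTF_8.newDecoder().decode(ByteBuffer.wrap(bodyBytes));
            return true;
        } catch (CharacterCodingException e) {
            return false;
        }
    }
}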
@@ -40,6 +40,8 @@ class HttpFetcherImplFetchTest {
|
||||
private static EdgeUrl badHttpStatusUrl;
|
||||
private static EdgeUrl keepAliveUrl;
|
||||
|
||||
private static EdgeUrl pdfUrl;
|
||||
|
||||
@BeforeAll
|
||||
public static void setupAll() throws URISyntaxException {
|
||||
wireMockServer =
|
||||
@@ -133,6 +135,13 @@ class HttpFetcherImplFetchTest {
|
||||
));
|
||||
|
||||
|
||||
pdfUrl = new EdgeUrl("http://localhost:18089/test.pdf");
|
||||
wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(pdfUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "application/pdf")
|
||||
.withStatus(200)
|
||||
.withBody("Hello World")));
|
||||
|
||||
wireMockServer.start();
|
||||
|
||||
}
|
||||
@@ -352,6 +361,14 @@ class HttpFetcherImplFetchTest {
|
||||
Assertions.assertTrue(result.isOk());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPdf() {
|
||||
var result = fetcher.fetchContent(pdfUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
|
||||
Assertions.assertTrue(result.isOk());
|
||||
}
|
||||
|
||||
|
||||
private List<WarcRecord> getWarcRecords() throws IOException {
|
||||
List<WarcRecord> records = new ArrayList<>();
|
||||
|
@@ -4,9 +4,9 @@ import nu.marginalia.UserAgent;
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.fetcher.DomainCookies;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.io.SerializableCrawlDataStream;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
|
||||
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
|
||||
import nu.marginalia.slop.SlopCrawlDataRecord;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
||||
@@ -24,13 +24,14 @@ import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
class WarcRecorderTest {
|
||||
Path fileNameWarc;
|
||||
Path fileNameParquet;
|
||||
Path fileNameSlop;
|
||||
WarcRecorder client;
|
||||
|
||||
HttpClient httpClient;
|
||||
@@ -39,7 +40,7 @@ class WarcRecorderTest {
|
||||
httpClient = HttpClients.createDefault();
|
||||
|
||||
fileNameWarc = Files.createTempFile("test", ".warc");
|
||||
fileNameParquet = Files.createTempFile("test", ".parquet");
|
||||
fileNameSlop = Files.createTempFile("test", ".slop.zip");
|
||||
|
||||
client = new WarcRecorder(fileNameWarc);
|
||||
}
|
||||
@@ -159,17 +160,28 @@ class WarcRecorderTest {
|
||||
|
||||
client.fetch(httpClient, new DomainCookies(), request3);
|
||||
|
||||
CrawledDocumentParquetRecordFileWriter.convertWarc(
|
||||
HttpGet request4 = new HttpGet("https://downloads.marginalia.nu/test.pdf");
|
||||
request4.addHeader("User-agent", "test.marginalia.nu");
|
||||
request4.addHeader("Accept-Encoding", "gzip");
|
||||
|
||||
client.fetch(httpClient, new DomainCookies(), request4);
|
||||
|
||||
SlopCrawlDataRecord.convertWarc(
|
||||
"www.marginalia.nu",
|
||||
new UserAgent("test", "test"),
|
||||
fileNameWarc,
|
||||
fileNameParquet);
|
||||
fileNameSlop);
|
||||
|
||||
var urls = CrawledDocumentParquetRecordFileReader.stream(fileNameParquet).map(doc -> doc.url).toList();
|
||||
assertEquals(2, urls.size());
|
||||
List<String> urls;
|
||||
try (var stream = SerializableCrawlDataStream.openDataStream(fileNameSlop)) {
|
||||
urls = stream.docsAsList().stream().map(doc -> doc.url.toString()).toList();
|
||||
}
|
||||
|
||||
assertEquals(3, urls.size());
|
||||
assertEquals("https://www.marginalia.nu/", urls.get(0));
|
||||
assertEquals("https://www.marginalia.nu/log/", urls.get(1));
|
||||
// sanic.jpg gets filtered out for its bad mime type
|
||||
assertEquals("https://downloads.marginalia.nu/test.pdf", urls.get(2));
|
||||
|
||||
}
|
||||
|
||||
|
@@ -117,6 +117,100 @@ class CrawlerRetreiverTest {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void verifyFileFormatSupport() throws IOException {
|
||||
List<String> urls = List.of(
|
||||
"https://www.marginalia.nu/junk/test.pdf",
|
||||
"https://www.marginalia.nu/junk/test.md"
|
||||
);
|
||||
|
||||
var specs = CrawlerMain.CrawlSpecRecord
|
||||
.builder()
|
||||
.crawlDepth(5)
|
||||
.domain("www.marginalia.nu")
|
||||
.urls(urls)
|
||||
.build();
|
||||
Path tempFile = null;
|
||||
Path slopFile = null;
|
||||
try {
|
||||
tempFile = Files.createTempFile("crawling-process", "warc");
|
||||
slopFile = Files.createTempFile("crawling-process", ".slop.zip");
|
||||
|
||||
doCrawl(tempFile, specs);
|
||||
|
||||
Set<String> requests = new HashSet<>();
|
||||
Set<String> responses = new HashSet<>();
|
||||
|
||||
// Inspect the WARC file
|
||||
try (var reader = new WarcReader(tempFile)) {
|
||||
reader.forEach(record -> {
|
||||
if (record instanceof WarcRequest req) {
|
||||
requests.add(req.target());
|
||||
System.out.println(req.type() + ":" + req.target());
|
||||
}
|
||||
else if (record instanceof WarcResponse rsp) {
|
||||
responses.add(rsp.target());
|
||||
try {
|
||||
System.out.println(rsp.type() + ":" + rsp.target() + ":" + rsp.http().contentType());
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
else {
|
||||
System.out.println(record.type());
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
for (var url : urls) {
|
||||
assertTrue(requests.contains(url), "Should have requested " + url);
|
||||
}
|
||||
assertEquals(requests, responses);
|
||||
|
||||
// Convert the WARC file to a Slop file
|
||||
SlopCrawlDataRecord
|
||||
.convertWarc("www.marginalia.nu", new UserAgent("test.marginalia.nu", "test.marginalia.nu"), tempFile, slopFile);
|
||||
|
||||
CrawledDomain domain = null;
|
||||
Map<String, CrawledDocument> documents = new HashMap<>();
|
||||
|
||||
// Extract the contents of the Slop file
|
||||
try (var stream = SerializableCrawlDataStream.openDataStream(slopFile)) {
|
||||
while (stream.hasNext()) {
|
||||
var doc = stream.next();
|
||||
if (doc instanceof CrawledDomain dr) {
|
||||
assertNull(domain);
|
||||
domain = dr;
|
||||
}
|
||||
else if (doc instanceof CrawledDocument dc) {
|
||||
System.out.println(dc.url + "\t" + dc.crawlerStatus + "\t" + dc.httpStatus);
|
||||
documents.put(dc.url, dc);
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
for (var url : urls) {
|
||||
// Verify we have the downloaded files in the Slop file
|
||||
assertNotNull(domain);
|
||||
var fetchedDoc = documents.get(url);
|
||||
assertNotNull(fetchedDoc, "Should have a document for " + url);
|
||||
assertEquals(url, fetchedDoc.url);
|
||||
assertTrue(fetchedDoc.httpStatus == 200 || fetchedDoc.httpStatus == 206, "Should be 200 or 206 for " + url);
|
||||
assertTrue(fetchedDoc.documentBodyBytes.length > 32, "Should have a body for " + url);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
} finally {
|
||||
if (tempFile != null)
|
||||
Files.deleteIfExists(tempFile);
|
||||
if (slopFile != null)
|
||||
Files.deleteIfExists(slopFile);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWarcOutputNoKnownUrls() throws IOException {
|
||||
var specs = CrawlerMain.CrawlSpecRecord
|
||||
|
@@ -53,6 +53,8 @@ dependencies {
|
||||
implementation libs.commons.compress
|
||||
implementation libs.commons.codec
|
||||
implementation libs.jsoup
|
||||
implementation libs.slop
|
||||
implementation libs.jwarc
|
||||
|
||||
|
||||
|
||||
|
@@ -1,13 +1,18 @@
|
||||
package nu.marginalia.extractor;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
||||
import nu.marginalia.process.log.WorkLog;
|
||||
import nu.marginalia.process.log.WorkLogEntry;
|
||||
import nu.marginalia.slop.SlopCrawlDataRecord;
|
||||
import nu.marginalia.slop.SlopTablePacker;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorage;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
|
||||
import org.apache.commons.compress.utils.IOUtils;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
@@ -16,18 +21,19 @@ import java.nio.file.StandardCopyOption;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.nio.file.attribute.PosixFilePermissions;
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
public class SampleDataExporter {
|
||||
private final FileStorageService storageService;
|
||||
private final ProcessHeartbeat processHeartbeat;
|
||||
|
||||
@Inject
|
||||
public SampleDataExporter(FileStorageService storageService) {
|
||||
public SampleDataExporter(FileStorageService storageService, ProcessHeartbeat processHeartbeat) {
|
||||
this.storageService = storageService;
|
||||
this.processHeartbeat = processHeartbeat;
|
||||
}
|
||||
public void export(FileStorageId crawlId, FileStorageId destId, int size, String name) throws SQLException, IOException {
|
||||
|
||||
public void export(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name) throws SQLException, IOException {
|
||||
FileStorage destStorage = storageService.getStorage(destId);
|
||||
Path inputDir = storageService.getStorage(crawlId).asPath();
|
||||
|
||||
@@ -54,11 +60,6 @@ public class SampleDataExporter {
|
||||
|
||||
Path newCrawlerLogFile = Files.createTempFile(destStorage.asPath(), "crawler", ".log",
|
||||
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
|
||||
try (var bw = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
|
||||
for (var item : entriesAll) {
|
||||
bw.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
|
||||
}
|
||||
}
|
||||
|
||||
Path newManifestJsonFile = Files.createTempFile(destStorage.asPath(), "manifest", ".json",
|
||||
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
|
||||
@@ -67,13 +68,37 @@ public class SampleDataExporter {
|
||||
var tmpTarFile = Files.createTempFile(destStorage.asPath(), "data", ".tar",
|
||||
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
|
||||
|
||||
try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))) {
|
||||
for (var item : entriesAll) {
|
||||
try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING));
|
||||
var logWriter = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
|
||||
var hb = processHeartbeat.createAdHocTaskHeartbeat("Generating Sample")
|
||||
) {
|
||||
for (var item : hb.wrap("Scanning", entriesAll)) {
|
||||
Path crawlDataPath = inputDir.resolve(item.relPath());
|
||||
if (!Files.exists(crawlDataPath)) continue;
|
||||
|
||||
if (StringUtils.isBlank(ctFilter)) {
|
||||
addFileToTar(stream, crawlDataPath, item.relPath());
|
||||
logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
|
||||
}
|
||||
else /* filter != null */ {
|
||||
Path filteredData = null;
|
||||
try {
|
||||
filteredData = filterEntries(crawlDataPath, ctFilter);
|
||||
addFileToTar(stream, filteredData, item.relPath());
|
||||
logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
|
||||
}
|
||||
catch (NoSuchElementException ex) {
|
||||
// Ignore
|
||||
}
|
||||
finally {
|
||||
if (filteredData != null) {
|
||||
Files.deleteIfExists(filteredData);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
logWriter.flush();
|
||||
|
||||
addFileToTar(stream, newCrawlerLogFile, "crawler.log");
|
||||
addFileToTar(stream, newManifestJsonFile, "marginalia-manifest.json");
|
||||
@@ -86,6 +111,56 @@ public class SampleDataExporter {
|
||||
Files.move(tmpTarFile, destStorage.asPath().resolve("crawl-data.tar"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
|
||||
}
|
||||
|
||||
/** Filters the entries in the crawl data file based on the content type. */
|
||||
private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException, NoSuchElementException {
|
||||
Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered");
|
||||
Path tempFile = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered.slop.zip");
|
||||
|
||||
// We may have debris from a previous run, so let's clean it up
|
||||
if (Files.isDirectory(tempDir)) {
|
||||
FileUtils.deleteDirectory(tempDir.toFile());
|
||||
}
|
||||
Files.createDirectory(tempDir);
|
||||
|
||||
boolean wroteEntry = false;
|
||||
|
||||
try (var writer = new SlopCrawlDataRecord.Writer(tempDir);
|
||||
var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) {
|
||||
@Override
|
||||
public boolean filter(String url, int status, String contentType) {
|
||||
return Objects.equals(StringUtils.substringBefore(contentType, ';'), contentTypeFilter)
|
||||
|| contentType.startsWith("x-marginalia/"); // metadata records
|
||||
}
|
||||
}
|
||||
) {
|
||||
|
||||
while (reader.hasRemaining()) {
|
||||
var entry = reader.get();
|
||||
writer.write(entry);
|
||||
|
||||
wroteEntry = wroteEntry || Objects.equals(StringUtils.substringBefore(entry.contentType(), ';'), contentTypeFilter);
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
FileUtils.deleteDirectory(tempDir.toFile());
|
||||
throw ex;
|
||||
}
|
||||
|
||||
try {
|
||||
if (!wroteEntry) {
|
||||
throw new NoSuchElementException("No relevant entries");
|
||||
}
|
||||
|
||||
SlopTablePacker.packToSlopZip(tempDir, tempFile);
|
||||
}
|
||||
finally {
|
||||
FileUtils.deleteDirectory(tempDir.toFile());
|
||||
}
|
||||
|
||||
|
||||
return tempFile;
|
||||
}
|
||||
|
||||
private void addFileToTar(TarArchiveOutputStream outputStream, Path file, String fileName) throws IOException {
|
||||
var entry = outputStream.createArchiveEntry(file.toFile(), fileName);
|
||||
entry.setSize(Files.size(file));
|
||||
|
@@ -92,7 +92,7 @@ public class ExportTasksMain extends ProcessMainClass {
|
||||
termFrequencyExporter.export(request.crawlId, request.destId);
|
||||
break;
|
||||
case SAMPLE_DATA:
|
||||
sampleDataExporter.export(request.crawlId, request.destId, request.size, request.name);
|
||||
sampleDataExporter.export(request.crawlId, request.destId, request.size, request.ctFilter, request.name);
|
||||
break;
|
||||
case ADJACENCIES:
|
||||
websiteAdjacenciesCalculator.export();
|
||||
|
@@ -16,6 +16,7 @@ public class ExportTaskRequest {
|
||||
public FileStorageId destId;
|
||||
public int size;
|
||||
public String name;
|
||||
public String ctFilter;
|
||||
|
||||
public ExportTaskRequest(Task task) {
|
||||
this.task = task;
|
||||
@@ -42,12 +43,13 @@ public class ExportTaskRequest {
|
||||
return request;
|
||||
}
|
||||
|
||||
public static ExportTaskRequest sampleData(FileStorageId crawlId, FileStorageId destId, int size, String name) {
|
||||
public static ExportTaskRequest sampleData(FileStorageId crawlId, FileStorageId destId, String ctFilter, int size, String name) {
|
||||
ExportTaskRequest request = new ExportTaskRequest(Task.SAMPLE_DATA);
|
||||
request.crawlId = crawlId;
|
||||
request.destId = destId;
|
||||
request.size = size;
|
||||
request.name = name;
|
||||
request.ctFilter = ctFilter;
|
||||
return request;
|
||||
}
|
||||
|
||||
|
@@ -3,7 +3,7 @@ plugins {
|
||||
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.4'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.5'
|
||||
}
|
||||
|
||||
java {
|
||||
|
@@ -90,6 +90,7 @@ public class ApiSearchOperator {
|
||||
url.getTitle(),
|
||||
url.getDescription(),
|
||||
sanitizeNaN(url.rankingScore, -100),
|
||||
url.getShortFormat(),
|
||||
details
|
||||
);
|
||||
}
|
||||
|
@@ -8,14 +8,16 @@ public class ApiSearchResult {
|
||||
public String title;
|
||||
public String description;
|
||||
public double quality;
|
||||
public String format; // "pdf", "html", "text", etc.
|
||||
|
||||
public List<List<ApiSearchResultQueryDetails>> details = new ArrayList<>();
|
||||
|
||||
public ApiSearchResult(String url, String title, String description, double quality, List<List<ApiSearchResultQueryDetails>> details) {
|
||||
public ApiSearchResult(String url, String title, String description, double quality, String format, List<List<ApiSearchResultQueryDetails>> details) {
|
||||
this.url = url;
|
||||
this.title = title;
|
||||
this.description = description;
|
||||
this.quality = quality;
|
||||
this.format = format;
|
||||
this.details = details;
|
||||
}
|
||||
|
||||
|
@@ -3,7 +3,7 @@ plugins {
|
||||
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.4'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.5'
|
||||
}
|
||||
|
||||
application {
|
||||
|
@@ -3,7 +3,7 @@ plugins {
|
||||
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.4'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.5'
|
||||
}
|
||||
|
||||
application {
|
||||
|
@@ -5,7 +5,7 @@ plugins {
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
|
||||
id 'com.google.cloud.tools.jib' version '3.4.4'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.5'
|
||||
}
|
||||
|
||||
application {
|
||||
|
@@ -73,6 +73,8 @@ public class UrlDetails implements Comparable<UrlDetails> {
|
||||
return "HTML 5";
|
||||
case "PLAIN":
|
||||
return "Plain Text";
|
||||
case "PDF":
|
||||
return "PDF";
|
||||
default:
|
||||
return "?";
|
||||
}
|
||||
|
@@ -3,7 +3,7 @@ plugins {
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
id 'gg.jte.gradle' version '3.1.15'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.4'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.5'
|
||||
}
|
||||
|
||||
application {
|
||||
|
@@ -112,6 +112,13 @@ public class SearchOperator {
|
||||
.selectStrategy(queryResponse)
|
||||
.clusterResults(queryResults, 25);
|
||||
|
||||
if (queryParams.humanQuery().equals("slackware linux")) {
|
||||
logger.info("Query response: {}", queryResponse.results().subList(0, 5));
|
||||
logger.info("Query results: {}", queryResults.subList(0, 5));
|
||||
logger.info("Clustered results: {}", clusteredResults.subList(0, 5));
|
||||
}
|
||||
|
||||
|
||||
// Log the query and results
|
||||
|
||||
logger.info(queryMarker, "Human terms: {}", Strings.join(queryResponse.searchTermsHuman(), ','));
|
||||
|
@@ -17,7 +17,7 @@ public class SearchQueryParamFactory {
|
||||
static final RpcQueryLimits defaultLimits = RpcQueryLimits.newBuilder()
|
||||
.setResultsTotal(100)
|
||||
.setResultsByDomain(5)
|
||||
.setTimeoutMs(200)
|
||||
.setTimeoutMs(250)
|
||||
.setFetchSize(8192)
|
||||
.build();
|
||||
|
||||
|
@@ -23,7 +23,7 @@ public class SearchResultClusterer {
|
||||
}
|
||||
|
||||
/** No clustering, just return the results as is */
|
||||
private static List<ClusteredUrlDetails> noOp(List<UrlDetails> results, int total) {
|
||||
public static List<ClusteredUrlDetails> noOp(List<UrlDetails> results, int total) {
|
||||
if (results.isEmpty())
|
||||
return List.of();
|
||||
|
||||
|
@@ -85,7 +85,6 @@ public class SearchService extends JoobyService {
|
||||
String emptySvg = "<svg xmlns=\"http://www.w3.org/2000/svg\"></svg>";
|
||||
jooby.get("/site/{domain}/favicon", ctx -> {
|
||||
String domain = ctx.path("domain").value();
|
||||
logger.info("Finding icon for domain {}", domain);
|
||||
try {
|
||||
DbDomainQueries.DomainIdWithNode domainIdWithNode = domainQueries.getDomainIdWithNode(new EdgeDomain(domain));
|
||||
var faviconMaybe = faviconClient.getFavicon(domain, domainIdWithNode.nodeAffinity());
|
||||
|
@@ -78,6 +78,8 @@ public class UrlDetails implements Comparable<UrlDetails> {
|
||||
return "HTML 5";
|
||||
case "PLAIN":
|
||||
return "Plain Text";
|
||||
case "PDF":
|
||||
return "PDF";
|
||||
default:
|
||||
return "?";
|
||||
}
|
||||
@@ -92,13 +94,24 @@ public class UrlDetails implements Comparable<UrlDetails> {
|
||||
public String displayTitle() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
buildDisplayTitle(sb, title);
|
||||
|
||||
if (sb.isEmpty()) {
|
||||
buildDisplayTitle(sb, url.toDisplayString());
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private void buildDisplayTitle(StringBuilder sb, String str) {
|
||||
|
||||
int distSinceBreak = 0;
|
||||
|
||||
char c = ' ';
|
||||
int prevC = ' ';
|
||||
for (int i = 0; i < title.length(); i++) {
|
||||
for (int i = 0; i < str.length(); i++) {
|
||||
prevC = c;
|
||||
c = title.charAt(i);
|
||||
c = str.charAt(i);
|
||||
|
||||
if (Character.isSpaceChar(c)) {
|
||||
distSinceBreak = 0;
|
||||
@@ -135,8 +148,6 @@ public class UrlDetails implements Comparable<UrlDetails> {
|
||||
sb.append(c);
|
||||
}
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/** Helper that inserts hyphenation hints and escapes
|
||||
@@ -180,7 +191,7 @@ public class UrlDetails implements Comparable<UrlDetails> {
|
||||
* semantically meaningful codepoints into entity codes */
|
||||
public String displayUrl() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
String urlStr = url.toString();
|
||||
String urlStr = url.toDisplayString();
|
||||
for (int i = 0; i < urlStr.length(); i++) {
|
||||
char c = urlStr.charAt(i);
|
||||
|
||||
|
@@ -25,13 +25,28 @@ public class UrlDeduplicator {
|
||||
}
|
||||
|
||||
public boolean shouldRemove(DecoratedSearchResultItem details) {
|
||||
if (details.url.domain.topDomain.equals("slackware.com")) {
|
||||
if (!deduplicateOnSuperficialHash(details)) {
|
||||
logger.info("Rejecting on superficial hash " + details.url);
|
||||
return true;
|
||||
}
|
||||
if (!deduplicateOnLSH(details)) {
|
||||
logger.info("Rejecting on LSH for " + details.url);
|
||||
return true;
|
||||
}
|
||||
if (!limitResultsPerDomain(details)) {
|
||||
logger.info("Rejecting on limitResultsPerDomain for " + details.url);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (!deduplicateOnSuperficialHash(details))
|
||||
return true;
|
||||
if (!deduplicateOnLSH(details))
|
||||
return true;
|
||||
if (!limitResultsPerDomain(details))
|
||||
return true;
|
||||
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@@ -27,3 +27,9 @@
|
||||
<link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Marginalia">
|
||||
|
||||
</head>
|
||||
<noscript>
|
||||
<h1>Users of text-based browsers</h1>
|
||||
<p>Consider using the old interface at <a href="https://old-search.marginalia.nu/">https://old-search.marginalia.nu/</a>,
|
||||
as it uses fewer modern CSS tricks, and should work better than the new UI. It's functionally nearly identical, but just renders it using a different layout.</p>
|
||||
<hr>
|
||||
</noscript>
|
@@ -1,9 +1,16 @@
|
||||
This is a bit of a hack!
|
||||
|
||||
This class exists to let tailwind we're using these classes even though they aren't visible in the code,
|
||||
as we sometimes generate classes from Java code!
|
||||
as we sometimes generate classes from Java code or javascript!
|
||||
|
||||
<i class="text-blue-800 bg-blue-50 dark:text-blue-200 dark:bg-blue-950"></i>
|
||||
<i class="text-green-800 bg-green-50 dark:text-green-200 dark:bg-green-950"></i>
|
||||
<i class="text-purple-800 bg-purple-50 dark:text-purple-200 dark:bg-purple-950"></i>
|
||||
<i class="text-blue-950 bg-gray-100 dark:text-blue-50 dark:bg-gray-900"></i>
|
||||
<span class="hover:bg-gray-300 "></span>
|
||||
|
||||
<label class="suggestion group block relative">
|
||||
<input type="radio" name="suggestion" class="peer hidden" checked>
|
||||
<div class="px-4 py-2 cursor-pointer dark:peer-checked:bg-gray-700 dark:hover:bg-gray-700 peer-checked:bg-gray-300 hover:bg-gray-300 w-full">
|
||||
</div>
|
||||
</label>
|
@@ -26,7 +26,7 @@
|
||||
|
||||
<!-- Main content -->
|
||||
<main class="flex-1 p-4 max-w-2xl space-y-4">
|
||||
<div class="border dark:border-gray-600 rounded bg-white text-black dark:bg-gray-800 dark:text-white text-m p-4">
|
||||
<div class="border border-gray-300 dark:border-gray-600 rounded bg-white text-black dark:bg-gray-800 dark:text-white text-m p-4">
|
||||
<div class="flex space-x-3 place-items-baseline">
|
||||
<i class="fa fa-circle-exclamation text-red-800"></i>
|
||||
<div class="grow">${model.errorTitle()}</div>
|
||||
|
@@ -80,10 +80,6 @@
|
||||
<tr><td>rank>50</td><td>The ranking of the website is at least 50 in a span of 1 - 255</td></tr>
|
||||
<tr><td>rank<50</td><td>The ranking of the website is at most 50 in a span of 1 - 255</td></tr>
|
||||
|
||||
<tr><td>count>10</td><td> The search term must appear in at least 10 results form the domain</td></tr>
|
||||
<tr><td>count<10</td><td> The search term must appear in at most 10 results from the domain</td></tr>
|
||||
|
||||
|
||||
<tr><td>format:html5</td><td>Filter documents using the HTML5 standard. This is typically modern websites.</td></tr>
|
||||
<tr><td>format:xhtml</td><td>Filter documents using the XHTML standard</td></tr>
|
||||
<tr><td>format:html123</td><td>Filter documents using the HTML standards 1, 2, and 3. This is typically very old websites. </td></tr>
|
||||
|
@@ -21,6 +21,9 @@
|
||||
</h2>
|
||||
|
||||
<div class="text-sm mt-1">
|
||||
@if ("PDF".equals(result.first.format))
|
||||
<i title="PDF" class="fas fa-file-pdf text-red-500"></i>
|
||||
@endif
|
||||
<a class="text-liteblue dark:text-blue-200 underline break-all" href="${result.first.url.toString()}"
|
||||
rel="noopener noreferrer" tabindex="-1">$unsafe{result.first.displayUrl()}</a>
|
||||
</div>
|
||||
@@ -53,10 +56,13 @@
|
||||
<div class="flex mt-2 text-sm flex flex-col space-y-2">
|
||||
<p class="text-black dark:text-white ${result.colorScheme.backgroundColor} p-1 rounded break-words hyphens-auto">Also from ${result.getDomain().toString()}:</p>
|
||||
|
||||
<ul class="pl-2 mt-2 underline text-liteblue dark:text-blue-200">
|
||||
<ul class="pl-2 mt-2 text-liteblue dark:text-blue-200">
|
||||
@for(UrlDetails item : result.rest)
|
||||
<li class="-indent-4 pl-4 mb-1 break-words hyphens-auto">
|
||||
<a href="${item.url.toString()}" rel="noopener noreferrer">$unsafe{item.displayTitle()}</a>
|
||||
@if ("PDF".equals(item.format))
|
||||
<i title="PDF" class="fas fa-file-pdf text-red-500"></i>
|
||||
@endif
|
||||
<a href="${item.url.toString()}" class="underline" rel="noopener noreferrer">$unsafe{item.displayTitle()}</a>
|
||||
</li>
|
||||
@endfor
|
||||
</ul>
|
||||
@@ -74,6 +80,9 @@
|
||||
@if (DocumentFlags.PlainText.isPresent(result.getFirst().resultItem.encodedDocMetadata))
|
||||
<span class="px-1 bg-blue-100 text-blue-700 dark:border dark:border-blue-600 dark:text-blue-400 dark:bg-black rounded">Plain text</span>
|
||||
@endif
|
||||
@if (DocumentFlags.PdfFile.isPresent(result.getFirst().resultItem.encodedDocMetadata))
|
||||
<span class="px-1 bg-blue-100 text-blue-700 dark:border dark:border-blue-600 dark:text-blue-400 dark:bg-black rounded">PDF File</span>
|
||||
@endif
|
||||
@if (DocumentFlags.GeneratorForum.isPresent(result.getFirst().resultItem.encodedDocMetadata))
|
||||
<span class="px-1 bg-blue-100 text-blue-700 dark:border dark:border-blue-600 dark:text-blue-400 dark:bg-black rounded">Forum</span>
|
||||
@endif
|
||||
|
@@ -7,13 +7,13 @@
|
||||
|
||||
<form class="flex-1 max-w-2xl" action="/search">
|
||||
<div class="flex">
|
||||
@if (query.isBlank())
|
||||
@if (query != null && query.isBlank())
|
||||
<%-- Add autofocus if the query is blank --%>
|
||||
<input type="text"
|
||||
class="shadow-inner flex-1 dark:bg-black dark:text-gray-100 bg-gray-50 border dark:border-gray-600 border-gray-300 text-gray-900 text-sm rounded-sm block w-full p-2.5"
|
||||
value="${query}"
|
||||
autofocus
|
||||
placeholder="Search..."
|
||||
placeholder="Search the web!"
|
||||
autocomplete="off"
|
||||
name="query"
|
||||
id="searchInput" />
|
||||
@@ -21,13 +21,13 @@
|
||||
<input type="text"
|
||||
class="shadow-inner flex-1 dark:bg-black dark:text-gray-100 bg-gray-50 border dark:border-gray-600 border-gray-300 text-gray-900 text-sm rounded-sm block w-full p-2.5"
|
||||
value="${query}"
|
||||
placeholder="Search..."
|
||||
placeholder="Search the web!"
|
||||
autocomplete="off"
|
||||
name="query"
|
||||
id="searchInput" />
|
||||
@endif
|
||||
|
||||
<div id="searchSuggestions" class="text-sm absolute top-2 mt-10 w-96 bg-white dark:bg-black border dark:border-gray-600 border-gray-200 rounded-lg shadow-lg hidden"></div>
|
||||
<div aria-hidden="true" id="searchSuggestions" class="text-sm absolute top-3 mt-10 w-96 bg-white dark:bg-black border dark:border-gray-600 border-gray-300 rounded-lg shadow-lg hidden"></div>
|
||||
|
||||
<button class="px-4 py-2 bg-margeblue text-white ml-2 rounded whitespace-nowrap active:text-slate-200">
|
||||
<i class="fas fa-search text-sm sm:mr-3"></i>
|
||||
|
@@ -43,12 +43,12 @@ function displaySuggestions(suggestions) {
|
||||
}
|
||||
|
||||
suggestionsContainer.innerHTML = suggestions.map((suggestion, index) => `
|
||||
<div
|
||||
class="suggestion px-4 py-2 cursor-pointer hover:bg-gray-100 ${index === selectedIndex ? 'bg-blue-50' : ''}"
|
||||
data-index="${index}"
|
||||
>
|
||||
<label class="suggestion group block relative">
|
||||
<input type="radio" name="suggestion" class="peer hidden" ${index === selectedIndex ? 'checked' : ''}>
|
||||
<div class="px-4 py-2 cursor-pointer dark:peer-checked:bg-gray-700 dark:hover:bg-gray-700 peer-checked:bg-gray-300 hover:bg-gray-300 w-full" data-index="${index}">
|
||||
${suggestion}
|
||||
</div>
|
||||
</label>
|
||||
`).join('');
|
||||
|
||||
suggestionsContainer.classList.remove('hidden');
|
||||
|
@@ -2,7 +2,7 @@ plugins {
|
||||
id 'java'
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.4'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.5'
|
||||
}
|
||||
|
||||
java {
|
||||
|
@@ -20,6 +20,6 @@ public class StatusModule extends AbstractModule {
|
||||
bind(String.class)
|
||||
.annotatedWith(Names.named("searchEngineTestQuery"))
|
||||
.toInstance(System.getProperty("status-service.public-query",
|
||||
"https://search.marginalia.nu/search?query=plato&ref=marginalia-automatic-metrics"));
|
||||
"https://marginalia-search.com/search?query=plato&ref=marginalia-automatic-metrics"));
|
||||
}
|
||||
}
|
||||
|
@@ -3,7 +3,7 @@ plugins {
|
||||
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.4'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.5'
|
||||
}
|
||||
|
||||
application {
|
||||
|
@@ -10,7 +10,8 @@ import static com.google.inject.name.Names.named;
|
||||
|
||||
public class AssistantModule extends AbstractModule {
|
||||
public void configure() {
|
||||
bind(Path.class).annotatedWith(named("suggestions-file")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions.txt"));
|
||||
bind(Path.class).annotatedWith(named("suggestions-file1")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions2.txt.gz"));
|
||||
bind(Path.class).annotatedWith(named("suggestions-file2")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions3.txt.gz"));
|
||||
|
||||
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
|
||||
}
|
||||
|
@@ -0,0 +1,465 @@
|
||||
package nu.marginalia.assistant.suggest;
|
||||
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/** Unhinged data structure for fast prefix searching.
|
||||
*/
|
||||
public class PrefixSearchStructure {
|
||||
// Core data structures
|
||||
private final HashMap<String, TIntArrayList> prefixIndex; // Short prefix index (up to 8 chars)
|
||||
private final HashMap<String, TIntArrayList> longPrefixIndex; // Long prefix index (9-16 chars)
|
||||
private final ArrayList<String> words; // All words by ID
|
||||
private final TIntArrayList wordScores; // Scores for all words
|
||||
|
||||
// Configuration
|
||||
private static final int SHORT_PREFIX_LENGTH = 8;
|
||||
private static final int MAX_INDEXED_PREFIX_LENGTH = 16;
|
||||
|
||||
public int size() {
|
||||
return words.size();
|
||||
}
|
||||
|
||||
// For sorting efficiency
|
||||
private static class WordScorePair {
|
||||
final String word;
|
||||
final int score;
|
||||
|
||||
WordScorePair(String word, int score) {
|
||||
this.word = word;
|
||||
this.score = score;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new PrefixTrie for typeahead search.
|
||||
*/
|
||||
public PrefixSearchStructure() {
|
||||
prefixIndex = new HashMap<>(1024);
|
||||
longPrefixIndex = new HashMap<>(1024);
|
||||
words = new ArrayList<>(1024);
|
||||
wordScores = new TIntArrayList(1024);
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a prefix to the index.
|
||||
*/
|
||||
private void indexPrefix(String word, int wordId) {
|
||||
// Index short prefixes
|
||||
for (int i = 1; i <= Math.min(word.length(), SHORT_PREFIX_LENGTH); i++) {
|
||||
String prefix = word.substring(0, i);
|
||||
TIntArrayList wordIds = prefixIndex.computeIfAbsent(
|
||||
                    prefix, k -> new TIntArrayList(16));
            wordIds.add(wordId);
        }

        // Index longer prefixes
        for (int i = SHORT_PREFIX_LENGTH + 1; i <= Math.min(word.length(), MAX_INDEXED_PREFIX_LENGTH); i++) {
            String prefix = word.substring(0, i);
            TIntArrayList wordIds = longPrefixIndex.computeIfAbsent(
                    prefix, k -> new TIntArrayList(8));
            wordIds.add(wordId);
        }

        // If the word contains spaces, also index by each term for multi-word queries
        if (word.contains(" ")) {
            String[] terms = word.split("\\s+");
            for (String term : terms) {
                if (term.length() >= 2) {
                    for (int i = 1; i <= Math.min(term.length(), SHORT_PREFIX_LENGTH); i++) {
                        String termPrefix = "t:" + term.substring(0, i);
                        TIntArrayList wordIds = prefixIndex.computeIfAbsent(
                                termPrefix, k -> new TIntArrayList(16));
                        wordIds.add(wordId);
                    }
                }
            }
        }
    }

    /**
     * Inserts a word with its associated score.
     */
    public void insert(String word, int score) {
        if (word == null || word.isEmpty()) {
            return;
        }

        // Add to the word list and index
        int wordId = words.size();
        words.add(word);
        wordScores.add(score);
        indexPrefix(word, wordId);
    }

    /**
     * Returns the top k completions for a given prefix.
     */
    public List<ScoredSuggestion> getTopCompletions(String prefix, int k) {
        if (prefix == null || prefix.isEmpty()) {
            // Return top k words by score
            return getTopKWords(k);
        }

        // Check if this is a term search (t:) - for searching within multi-word items
        boolean isTermSearch = false;
        if (prefix.startsWith("t:") && prefix.length() > 2) {
            isTermSearch = true;
            prefix = prefix.substring(2);
        }

        // 1. Fast path for short prefixes
        if (prefix.length() <= SHORT_PREFIX_LENGTH) {
            String lookupPrefix = isTermSearch ? "t:" + prefix : prefix;
            TIntArrayList wordIds = prefixIndex.get(lookupPrefix);
            if (wordIds != null) {
                return getTopKFromWordIds(wordIds, k);
            }
        }

        // 2. Fast path for long prefixes (truncate to MAX_INDEXED_PREFIX_LENGTH)
        if (prefix.length() > SHORT_PREFIX_LENGTH) {
            // Try exact match in longPrefixIndex first
            if (prefix.length() <= MAX_INDEXED_PREFIX_LENGTH) {
                TIntArrayList wordIds = longPrefixIndex.get(prefix);
                if (wordIds != null) {
                    return getTopKFromWordIds(wordIds, k);
                }
            }

            // If prefix is longer than MAX_INDEXED_PREFIX_LENGTH, truncate and filter
            if (prefix.length() > MAX_INDEXED_PREFIX_LENGTH) {
                String truncatedPrefix = prefix.substring(0, MAX_INDEXED_PREFIX_LENGTH);
                TIntArrayList candidateIds = longPrefixIndex.get(truncatedPrefix);
                if (candidateIds != null) {
                    // Filter candidates by the full prefix
                    return getFilteredTopKFromWordIds(candidateIds, prefix, k);
                }
            }
        }

        // 3. Optimized fallback for long prefixes - use prefix tree for segments
        List<ScoredSuggestion> results = new ArrayList<>();

        // Handle multi-segment queries by finding candidates from first 8 chars
        if (prefix.length() > SHORT_PREFIX_LENGTH) {
            String shortPrefix = prefix.substring(0, Math.min(prefix.length(), SHORT_PREFIX_LENGTH));
            TIntArrayList candidates = prefixIndex.get(shortPrefix);

            if (candidates != null) {
                return getFilteredTopKFromWordIds(candidates, prefix, k);
            }
        }

        // 4. Last resort - optimized binary search in sorted segments
        return findByBinarySearchPrefix(prefix, k);
    }

    /**
     * Helper to get the top k words by score.
     */
    private List<ScoredSuggestion> getTopKWords(int k) {
        // Create pairs of (score, wordId)
        int[][] pairs = new int[words.size()][2];
        for (int i = 0; i < words.size(); i++) {
            pairs[i][0] = wordScores.get(i);
            pairs[i][1] = i;
        }

        // Sort by score (descending)
        Arrays.sort(pairs, (a, b) -> Integer.compare(b[0], a[0]));

        // Take top k
        List<ScoredSuggestion> results = new ArrayList<>();
        for (int i = 0; i < Math.min(k, pairs.length); i++) {
            String word = words.get(pairs[i][1]);
            int score = pairs[i][0];
            results.add(new ScoredSuggestion(word, score));
        }

        return results;
    }

    /**
     * Helper to get the top k words from a list of word IDs.
     */
    private List<ScoredSuggestion> getTopKFromWordIds(TIntArrayList wordIds, int k) {
        if (wordIds == null || wordIds.isEmpty()) {
            return Collections.emptyList();
        }

        // For small lists, avoid sorting
        if (wordIds.size() <= k) {
            List<ScoredSuggestion> results = new ArrayList<>(wordIds.size());
            int[] ids = wordIds.toArray();
            for (int wordId : ids) {
                if (wordId >= 0 && wordId < words.size()) {
                    results.add(new ScoredSuggestion(words.get(wordId), wordScores.get(wordId)));
                }
            }
            results.sort((a, b) -> Integer.compare(b.getScore(), a.getScore()));
            return results;
        }

        // For larger lists, use an array-based approach for better performance
        // Find top k without full sorting
        int[] topScores = new int[k];
        int[] topWordIds = new int[k];
        int[] ids = wordIds.toArray();

        // Initialize with first k elements
        int filledCount = Math.min(k, ids.length);
        for (int i = 0; i < filledCount; i++) {
            int wordId = ids[i];
            if (wordId >= 0 && wordId < words.size()) {
                topWordIds[i] = wordId;
                topScores[i] = wordScores.get(wordId);
            }
        }

        // Sort initial elements
        for (int i = 0; i < filledCount; i++) {
            for (int j = i + 1; j < filledCount; j++) {
                if (topScores[j] > topScores[i]) {
                    // Swap scores
                    int tempScore = topScores[i];
                    topScores[i] = topScores[j];
                    topScores[j] = tempScore;

                    // Swap word IDs
                    int tempId = topWordIds[i];
                    topWordIds[i] = topWordIds[j];
                    topWordIds[j] = tempId;
                }
            }
        }

        // Process remaining elements
        int minScore = filledCount > 0 ? topScores[filledCount - 1] : Integer.MIN_VALUE;

        for (int i = k; i < ids.length; i++) {
            int wordId = ids[i];
            if (wordId >= 0 && wordId < words.size()) {
                int score = wordScores.get(wordId);

                if (score > minScore) {
                    // Replace the lowest element
                    topScores[filledCount - 1] = score;
                    topWordIds[filledCount - 1] = wordId;

                    // Bubble up the new element
                    for (int j = filledCount - 1; j > 0; j--) {
                        if (topScores[j] > topScores[j - 1]) {
                            // Swap scores
                            int tempScore = topScores[j];
                            topScores[j] = topScores[j - 1];
                            topScores[j - 1] = tempScore;

                            // Swap word IDs
                            int tempId = topWordIds[j];
                            topWordIds[j] = topWordIds[j - 1];
                            topWordIds[j - 1] = tempId;
                        } else {
                            break;
                        }
                    }

                    // Update min score
                    minScore = topScores[filledCount - 1];
                }
            }
        }

        // Create result list
        List<ScoredSuggestion> results = new ArrayList<>(filledCount);
        for (int i = 0; i < filledCount; i++) {
            results.add(new ScoredSuggestion(words.get(topWordIds[i]), topScores[i]));
        }

        return results;
    }

    /**
     * Use binary search on sorted word segments to efficiently find matches.
     */
    private List<ScoredSuggestion> findByBinarySearchPrefix(String prefix, int k) {
        // If we have a lot of words, use an optimized segment approach
        if (words.size() > 1000) {
            // Divide words into segments for better locality
            int segmentSize = 1000;
            int numSegments = (words.size() + segmentSize - 1) / segmentSize;

            // Find matches using binary search within each segment
            List<WordScorePair> allMatches = new ArrayList<>();
            for (int segment = 0; segment < numSegments; segment++) {
                int start = segment * segmentSize;
                int end = Math.min(start + segmentSize, words.size());

                // Binary search for first potential match
                int pos = Collections.binarySearch(
                        words.subList(start, end),
                        prefix,
                        (a, b) -> a.compareTo(b)
                );

                if (pos < 0) {
                    pos = -pos - 1;
                }

                // Collect all matches
                for (int i = start + pos; i < end && i < words.size(); i++) {
                    String word = words.get(i);
                    if (word.startsWith(prefix)) {
                        allMatches.add(new WordScorePair(word, wordScores.get(i)));
                    } else if (word.compareTo(prefix) > 0) {
                        break; // Past potential matches
                    }
                }
            }

            // Sort by score and take top k
            allMatches.sort((a, b) -> Integer.compare(b.score, a.score));
            List<ScoredSuggestion> results = new ArrayList<>(Math.min(k, allMatches.size()));
            for (int i = 0; i < Math.min(k, allMatches.size()); i++) {
                WordScorePair pair = allMatches.get(i);
                results.add(new ScoredSuggestion(pair.word, pair.score));
            }
            return results;
        }

        // Fallback for small dictionaries - linear scan but optimized
        return simpleSearchFallback(prefix, k);
    }

    /**
     * Optimized linear scan - only used for small dictionaries.
     */
    private List<ScoredSuggestion> simpleSearchFallback(String prefix, int k) {
        // Use primitive arrays for better cache locality
        int[] matchScores = new int[Math.min(words.size(), 100)]; // Assume we won't find more than 100 matches
        String[] matchWords = new String[matchScores.length];
        int matchCount = 0;

        for (int i = 0; i < words.size() && matchCount < matchScores.length; i++) {
            String word = words.get(i);
            if (word.startsWith(prefix)) {
                matchWords[matchCount] = word;
                matchScores[matchCount] = wordScores.get(i);
                matchCount++;
            }
        }

        // Sort matches by score (in-place for small arrays)
        for (int i = 0; i < matchCount; i++) {
            for (int j = i + 1; j < matchCount; j++) {
                if (matchScores[j] > matchScores[i]) {
                    // Swap scores
                    int tempScore = matchScores[i];
                    matchScores[i] = matchScores[j];
                    matchScores[j] = tempScore;

                    // Swap words
                    String tempWord = matchWords[i];
                    matchWords[i] = matchWords[j];
                    matchWords[j] = tempWord;
                }
            }
        }

        // Create results
        List<ScoredSuggestion> results = new ArrayList<>(Math.min(k, matchCount));
        for (int i = 0; i < Math.min(k, matchCount); i++) {
            results.add(new ScoredSuggestion(matchWords[i], matchScores[i]));
        }

        return results;
    }

    /**
     * Get top k words from candidate IDs, filtering by the full prefix.
     */
    private List<ScoredSuggestion> getFilteredTopKFromWordIds(TIntArrayList wordIds, String fullPrefix, int k) {
        if (wordIds == null || wordIds.isEmpty()) {
            return Collections.emptyList();
        }

        // Make primitive arrays for better performance
        String[] matchWords = new String[Math.min(wordIds.size(), 1000)];
        int[] matchScores = new int[matchWords.length];
        int matchCount = 0;

        int[] ids = wordIds.toArray();
        for (int i = 0; i < ids.length && matchCount < matchWords.length; i++) {
            int wordId = ids[i];
            if (wordId >= 0 && wordId < words.size()) {
                String word = words.get(wordId);
                if (word.startsWith(fullPrefix)) {
                    matchWords[matchCount] = word;
                    matchScores[matchCount] = wordScores.get(wordId);
                    matchCount++;
                }
            }
        }

        // Sort by score (efficient insertion sort for small k)
        for (int i = 0; i < Math.min(matchCount, k); i++) {
            int maxPos = i;
            for (int j = i + 1; j < matchCount; j++) {
                if (matchScores[j] > matchScores[maxPos]) {
                    maxPos = j;
                }
            }
            if (maxPos != i) {
                // Swap
                int tempScore = matchScores[i];
                matchScores[i] = matchScores[maxPos];
                matchScores[maxPos] = tempScore;

                String tempWord = matchWords[i];
                matchWords[i] = matchWords[maxPos];
                matchWords[maxPos] = tempWord;
            }
        }

        // Create result list (only up to k elements)
        List<ScoredSuggestion> results = new ArrayList<>(Math.min(k, matchCount));
        for (int i = 0; i < Math.min(k, matchCount); i++) {
            results.add(new ScoredSuggestion(matchWords[i], matchScores[i]));
        }

        return results;
    }

    /**
     * Class representing a suggested completion.
     */
    public static class ScoredSuggestion implements Comparable<ScoredSuggestion> {
        private final String word;
        private final int score;

        public ScoredSuggestion(String word, int score) {
            this.word = word;
            this.score = score;
        }

        public String getWord() {
            return word;
        }

        public int getScore() {
            return score;
        }

        @Override
        public String toString() {
            return word + " (" + score + ")";
        }

        @Override
        public int compareTo(@NotNull PrefixSearchStructure.ScoredSuggestion o) {
            return Integer.compare(this.score, o.score);
        }
    }
}
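The public surface of the class above is just insert(String, int) and getTopCompletions(String, int). The following is a minimal usage sketch, not part of the diff: it assumes PrefixSearchStructure has a no-argument constructor and that SHORT_PREFIX_LENGTH is at least 4 so the "marg" lookup takes the short-prefix fast path (neither detail is visible in this excerpt), and the scores are purely illustrative.

import java.util.List;

class PrefixSearchStructureExample {
    public static void main(String[] args) {
        // Assumed no-argument constructor (not shown in the diff above)
        PrefixSearchStructure suggestions = new PrefixSearchStructure();

        // Higher scores should rank earlier in the returned list
        suggestions.insert("marginalia", 100);
        suggestions.insert("marginal cost", 80);
        suggestions.insert("margin of error", 60);

        // Plain prefix lookup: expected to yield "marginalia (100)" then "marginal cost (80)"
        List<PrefixSearchStructure.ScoredSuggestion> top = suggestions.getTopCompletions("marg", 2);
        top.forEach(System.out::println);

        // A "t:" query matches individual terms inside multi-word entries,
        // so "margin of error" can be reached via its last word
        List<PrefixSearchStructure.ScoredSuggestion> byTerm = suggestions.getTopCompletions("t:err", 5);
        byTerm.forEach(System.out::println);
    }
}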
Some files were not shown because too many files have changed in this diff.