(deploy) Correct deploy script for headlesschrome

(deploy) Deploy assistant and browserless
Merge pull request #201 from MarginaliaSearch/website-capture
2025-10-05 21:22:39 +02:00 · 2025-05-28 15:56:05 +02:00 · 2025-05-28 15:50:26 +02:00 · 2025-05-28 15:49:03 +02:00 · 2025-05-28 15:48:32 +02:00 · 2025-05-28 15:40:09 +02:00
145 changed files with 6407 additions and 864 deletions
--- a/build.gradle
+++ b/build.gradle
@@ -5,7 +5,7 @@ plugins {

    // This is a workaround for a bug in the Jib plugin that causes it to stall randomly
    // https://github.com/GoogleContainerTools/jib/issues/3347
-    id 'com.google.cloud.tools.jib' version '3.4.4' apply(false)
+    id 'com.google.cloud.tools.jib' version '3.4.5' apply(false)
 }

 group 'marginalia'
@@ -47,7 +47,7 @@ ext {
    dockerImageBase='container-registry.oracle.com/graalvm/jdk:24'
    dockerImageTag='latest'
    dockerImageRegistry='marginalia'
-    jibVersion = '3.4.4'
+    jibVersion = '3.4.5'
 }

 idea {
--- a/code/common/model/java/nu/marginalia/model/DocumentFormat.java
+++ b/code/common/model/java/nu/marginalia/model/DocumentFormat.java
@@ -0,0 +1,24 @@
+package nu.marginalia.model;
+
+public enum DocumentFormat {
+    PLAIN(0, 1, "text"),
+    PDF(0, 1, "pdf"),
+    UNKNOWN(0, 1, "???"),
+    HTML123(0, 1, "html"),
+    HTML4(-0.1, 1.05, "html"),
+    XHTML(-0.1, 1.05, "html"),
+    HTML5(0.5, 1.1, "html");
+
+    /** Used to tune quality score */
+    public final double offset;
+    /** Used to tune quality score */
+    public final double scale;
+    public final String shortFormat;
+
+    DocumentFormat(double offset, double scale, String shortFormat) {
+        this.offset = offset;
+        this.scale = scale;
+        this.shortFormat = shortFormat;
+    }
+
+}
--- a/code/common/model/java/nu/marginalia/model/EdgeDomain.java
+++ b/code/common/model/java/nu/marginalia/model/EdgeDomain.java
@@ -112,14 +112,6 @@ public class EdgeDomain implements Serializable {
        return topDomain;
    }

-    public String getDomainKey() {
-        int cutPoint = topDomain.indexOf('.');
-        if (cutPoint < 0) {
-            return topDomain;
-        }
-        return topDomain.substring(0, cutPoint).toLowerCase();
-    }
-
    /** If possible, try to provide an alias domain,
     * i.e. a domain name that is very likely to link to this one
     * */
--- a/code/common/model/java/nu/marginalia/model/EdgeUrl.java
+++ b/code/common/model/java/nu/marginalia/model/EdgeUrl.java
@@ -1,16 +1,14 @@
 package nu.marginalia.model;

 import nu.marginalia.util.QueryParams;
+import org.apache.commons.lang3.StringUtils;

 import javax.annotation.Nullable;
 import java.io.Serializable;
-import java.net.MalformedURLException;
-import java.net.URI;
-import java.net.URISyntaxException;
-import java.net.URL;
+import java.net.*;
+import java.nio.charset.StandardCharsets;
 import java.util.Objects;
 import java.util.Optional;
-import java.util.regex.Pattern;

 public class EdgeUrl implements Serializable {
    public final String proto;
@@ -33,7 +31,7 @@ public class EdgeUrl implements Serializable {

    private static URI parseURI(String url) throws URISyntaxException {
        try {
-            return new URI(urlencodeFixer(url));
+            return EdgeUriFactory.parseURILenient(url);
        } catch (URISyntaxException ex) {
            throw new URISyntaxException("Failed to parse URI '" + url + "'", ex.getMessage());
        }
@@ -51,58 +49,6 @@ public class EdgeUrl implements Serializable {
        }
    }

-    private static Pattern badCharPattern = Pattern.compile("[ \t\n\"<>\\[\\]()',|]");
-
-    /* Java's URI parser is a bit too strict in throwing exceptions when there's an error.
-
-       Here on the Internet, standards are like the picture on the box of the frozen pizza,
-       and what you get is more like what's on the inside, we try to patch things instead,
-       just give it a best-effort attempt att cleaning out broken or unnecessary constructions
-       like bad or missing URLEncoding
-     */
-    public static String urlencodeFixer(String url) throws URISyntaxException {
-        var s = new StringBuilder();
-        String goodChars = "&.?:/-;+$#";
-        String hexChars = "0123456789abcdefABCDEF";
-
-        int pathIdx = findPathIdx(url);
-        if (pathIdx < 0) { // url looks like http://marginalia.nu
-            return url + "/";
-        }
-        s.append(url, 0, pathIdx);
-
-        // We don't want the fragment, and multiple fragments breaks the Java URIParser for some reason
-        int end = url.indexOf("#");
-        if (end < 0) end = url.length();
-
-        for (int i = pathIdx; i < end; i++) {
-            int c = url.charAt(i);
-
-            if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
-                s.appendCodePoint(c);
-            } else if (c == '%' && i + 2 < end) {
-                int cn = url.charAt(i + 1);
-                int cnn = url.charAt(i + 2);
-                if (hexChars.indexOf(cn) >= 0 && hexChars.indexOf(cnn) >= 0) {
-                    s.appendCodePoint(c);
-                } else {
-                    s.append("%25");
-                }
-            } else {
-                s.append(String.format("%%%02X", c));
-            }
-        }
-
-        return s.toString();
-    }
-
-    private static int findPathIdx(String url) throws URISyntaxException {
-        int colonIdx = url.indexOf(':');
-        if (colonIdx < 0 || colonIdx + 2 >= url.length()) {
-            throw new URISyntaxException(url, "Lacking protocol");
-        }
-        return url.indexOf('/', colonIdx + 2);
-    }

    public EdgeUrl(URI URI) {
        try {
@@ -166,11 +112,32 @@ public class EdgeUrl implements Serializable {
            sb.append(port);
        }

+        EdgeUriFactory.urlencodePath(sb, path);
+
+        if (param != null) {
+            EdgeUriFactory.urlencodeQuery(sb, param);
+        }
+
+        return sb.toString();
+    }
+
+
+    public String toDisplayString() {
+        StringBuilder sb = new StringBuilder(256);
+
+        sb.append(proto);
+        sb.append("://");
+        sb.append(domain);
+
+        if (port != null) {
+            sb.append(':');
+            sb.append(port);
+        }
+
        sb.append(path);

        if (param != null) {
-            sb.append('?');
-            sb.append(param);
+            sb.append('?').append(param);
        }

        return sb.toString();
@@ -247,3 +214,280 @@ public class EdgeUrl implements Serializable {
    }

 }
+
+class EdgeUriFactory {
+    /** Parse a URL into a URI, repairing missing or broken url-encoding in the path
+     * and query along the way.  The fragment, if any, is always discarded.
+     *
+     * @param url an absolute URL, e.g. https://www.example.com/path?query#fragment
+     * @return the parsed URI, without fragment
+     * @throws URISyntaxException if the URL lacks a scheme, or can not be parsed even after repair
+     */
+    public static URI parseURILenient(String url) throws URISyntaxException {
+
+        if (shouldOmitUrlencodeRepair(url)) {
+            try {
+                return new URI(url);
+            }
+            catch (URISyntaxException ex) {
+                // ignore and run the lenient parser
+            }
+        }
+
+        // We don't want the fragment, and multiple fragments breaks the Java URIParser for some reason.
+        // Cut it off up front, so that a '/' or '?' inside the fragment is never mistaken for the
+        // start of the path or query.
+        int fragmentIdx = url.indexOf('#');
+        String stripped = fragmentIdx < 0 ? url : url.substring(0, fragmentIdx);
+
+        int pathIdx = findPathIdx(stripped);
+        if (pathIdx < 0) { // url looks like http://marginalia.nu
+            return new URI(stripped + "/");
+        }
+
+        // Search from pathIdx; a '?' ahead of the path is part of the prefix that is copied verbatim
+        int queryIdx = stripped.indexOf('?', pathIdx);
+        if (queryIdx < 0) queryIdx = stripped.length();
+
+        var s = new StringBuilder(stripped.length() + 8);
+        s.append(stripped, 0, pathIdx);
+        urlencodePath(s, stripped.substring(pathIdx, queryIdx));
+        if (queryIdx < stripped.length()) {
+            urlencodeQuery(s, stripped.substring(queryIdx + 1));
+        }
+        return new URI(s.toString());
+    }
+
+    /** Break apart the path element of an URI into its components, urlencode them
+     * if needed, and recombine them into a single path element again.
+     * <p></p>
+     * The decision is made for the path as a whole: if any component needs encoding,
+     * every component is encoded.  Empty components (repeated slashes) are dropped.
+     */
+    public static void urlencodePath(StringBuilder sb, String path) {
+        if (path == null || path.isEmpty()) {
+            return;
+        }
+
+        String[] pathParts = StringUtils.split(path, '/');
+        if (pathParts.length == 0) {
+            sb.append('/');
+            return;
+        }
+
+        boolean shouldUrlEncode = false;
+        for (String pathPart : pathParts) {
+            if (pathPart.isEmpty()) continue;
+
+            if (needsUrlEncode(pathPart)) {
+                shouldUrlEncode = true;
+                break;
+            }
+        }
+
+        for (String pathPart : pathParts) {
+            if (pathPart.isEmpty()) continue;
+
+            if (shouldUrlEncode) {
+                sb.append('/');
+                sb.append(URLEncoder.encode(pathPart, StandardCharsets.UTF_8).replace("+", "%20"));
+            } else {
+                sb.append('/');
+                sb.append(pathPart);
+            }
+        }
+
+        if (path.endsWith("/")) {
+            sb.append('/');
+        }
+
+    }
+
+    /** Break apart the query element of a URI into its components, urlencode them
+     * if needed, and recombine them into a single query element again.
+     * <p></p>
+     * As with the path, if any component needs encoding, every component is encoded;
+     * keys and values are encoded separately, split at the first '='.  Empty components
+     * are dropped, and the leading '?' is only emitted if at least one component remains.
+     */
+    public static void urlencodeQuery(StringBuilder sb, String param) {
+        if (param == null || param.isEmpty()) {
+            return;
+        }
+
+        String[] queryParts = StringUtils.split(param, '&');
+
+        boolean shouldUrlEncode = false;
+        for (String queryPart : queryParts) {
+            if (queryPart.isEmpty()) continue;
+
+            if (needsUrlEncode(queryPart)) {
+                shouldUrlEncode = true;
+                break;
+            }
+        }
+
+        boolean first = true;
+        for (String queryPart : queryParts) {
+            if (queryPart.isEmpty()) continue;
+
+            if (first) {
+                sb.append('?');
+                first = false;
+            } else {
+                sb.append('&');
+            }
+
+            if (shouldUrlEncode) {
+                int idx = queryPart.indexOf('=');
+                if (idx < 0) {
+                    sb.append(URLEncoder.encode(queryPart, StandardCharsets.UTF_8));
+                } else {
+                    sb.append(URLEncoder.encode(queryPart.substring(0, idx), StandardCharsets.UTF_8));
+                    sb.append('=');
+                    sb.append(URLEncoder.encode(queryPart.substring(idx + 1), StandardCharsets.UTF_8));
+                }
+            } else {
+                sb.append(queryPart);
+            }
+        }
+    }
+
+    /** Test if the url element needs URL encoding.
+     * <p></p>
+     * Note we may have been given an already encoded path element,
+     * so '+' and well-formed %XX escapes count as good characters;
+     * a '%' that is not followed by two hex digits does not.
+     */
+    static boolean needsUrlEncode(String urlElement) {
+        for (int i = 0; i < urlElement.length(); i++) {
+            char c = urlElement.charAt(i);
+
+            if (isUrlSafe(c)) continue;
+            if (c == '+') continue;
+            if (c == '%' && i + 2 < urlElement.length()) {
+                char c1 = urlElement.charAt(i + 1);
+                char c2 = urlElement.charAt(i + 2);
+                if (isHexDigit(c1) && isHexDigit(c2)) {
+                    i += 2;
+                    continue;
+                }
+            }
+
+            return true;
+        }
+
+        return false;
+    }
+
+
+    /** Test if c is an RFC 3986 unreserved character: an ASCII letter or digit, or one of - _ . ~ */
+    static boolean isUrlSafe(int c) {
+        if (c >= 'a' && c <= 'z') return true;
+        if (c >= 'A' && c <= 'Z') return true;
+        if (c >= '0' && c <= '9') return true;
+        if (c == '-' || c == '_' || c == '.' || c == '~') return true;
+
+        return false;
+    }
+
+    /** Test if the URL can be handed to the URI constructor as-is, because the lenient
+     * repair in parseURILenient would not alter it.
+     * <p></p>
+     * This is a very simple heuristic test that does not guarantee
+     * that the URL is valid, but it will identify cases where we
+     * are fairly certain that the URL does not need encoding,
+     * so we can skip a bunch of allocations and string operations
+     * that would otherwise be needed to fix the URL.
+     * <p></p>
+     * Anything the lenient parser would rewrite is rejected, so that both code paths yield
+     * the same URI: a fragment, a missing path, repeated slashes, empty query components,
+     * more than one '=' in a query component, or a '~' in the query.
+     */
+    static boolean shouldOmitUrlencodeRepair(String url) {
+        final int len = url.length();
+        int idx = 0;
+
+        // Validate the scheme: one or more ASCII letters followed by "://"
+        while (idx < len && isAsciiAlphabetic(url.charAt(idx))) {
+            idx++;
+        }
+        if (idx == 0 || !url.startsWith("://", idx)) return false;
+        idx += 3;
+
+        // Validate the authority; it must be followed by a path, as the
+        // lenient parser appends a '/' to URLs like http://marginalia.nu
+        boolean hasPath = false;
+        while (idx < len && !hasPath) {
+            char c = url.charAt(idx++);
+            if (c == '/') hasPath = true;
+            else if (c != ':' && c != '@' && !isUrlSafe(c)) return false;
+        }
+        if (!hasPath) return false;
+
+        // Validate the path
+        boolean hasQuery = false;
+        char prev = '/';
+        while (idx < len && !hasQuery) {
+            char c = url.charAt(idx++);
+            if (c == '?') hasQuery = true;
+            else if (c == '/' && prev == '/') return false; // the lenient parser collapses repeated slashes
+            else if (c != '/' && !isUrlSafe(c)) return false; // this includes '#'; fragments must be stripped
+            prev = c;
+        }
+        if (!hasQuery) return true;
+
+        // Validate the query
+        int partLength = 0;
+        boolean hasEquals = false;
+        while (idx < len) {
+            char c = url.charAt(idx++);
+            if (c == '&') {
+                if (partLength == 0) return false; // the lenient parser drops empty components
+                partLength = 0;
+                hasEquals = false;
+                continue;
+            }
+
+            partLength++;
+            if (c == '=') {
+                if (hasEquals) return false; // the lenient parser encodes a second '=' as %3D
+                hasEquals = true;
+            }
+            else if (c == '~' || !isUrlSafe(c)) {
+                return false; // URLEncoder may rewrite '~' as %7E
+            }
+        }
+
+        // An empty query, or a trailing '&', is dropped by the lenient parser
+        return partLength > 0;
+    }
+
+
+    /** Test if c is an ASCII letter, a-z or A-Z */
+    private static boolean isAsciiAlphabetic(int c) {
+        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+    }
+
+    private static boolean isHexDigit(int c) {
+        return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
+    }
+
+    /** Find the index of the path element in a URL.
+     * <p></p>
+     * The path element starts at the first slash found after the "://"
+     * that is expected to follow the first colon of the URL.
+     *
+     * @return the index of the slash that starts the path, or -1 if there is none
+     * @throws URISyntaxException if the URL has no colon, or ends within three characters of it
+     */
+    private static int findPathIdx(String url) throws URISyntaxException {
+        int colonIdx = url.indexOf(':');
+        if (colonIdx < 0 || colonIdx + 3 >= url.length()) {
+            throw new URISyntaxException(url, "Lacking scheme");
+        }
+        return url.indexOf('/', colonIdx + 3);
+    }
+
+
+}
--- a/code/common/model/java/nu/marginalia/model/crawl/HtmlFeature.java
+++ b/code/common/model/java/nu/marginalia/model/crawl/HtmlFeature.java
@@ -28,6 +28,8 @@ public enum HtmlFeature {

    GA_SPAM("special:gaspam"),

+    PDF("format:pdf"),
+
    /** For fingerprinting and ranking */
    OPENGRAPH("special:opengraph"),
    OPENGRAPH_IMAGE("special:opengraph:image"),
--- a/code/common/model/java/nu/marginalia/model/html/HtmlStandard.java
+++ b/code/common/model/java/nu/marginalia/model/html/HtmlStandard.java
@@ -1,22 +0,0 @@
-package nu.marginalia.model.html;
-
-// This class really doesn't belong anywhere, but will squat here for now
-public enum HtmlStandard {
-    PLAIN(0, 1),
-    UNKNOWN(0, 1),
-    HTML123(0, 1),
-    HTML4(-0.1, 1.05),
-    XHTML(-0.1, 1.05),
-    HTML5(0.5, 1.1);
-
-    /** Used to tune quality score */
-    public final double offset;
-    /** Used to tune quality score */
-    public final double scale;
-
-    HtmlStandard(double offset, double scale) {
-        this.offset = offset;
-        this.scale = scale;
-    }
-
-}
--- a/code/common/model/java/nu/marginalia/model/idx/DocumentFlags.java
+++ b/code/common/model/java/nu/marginalia/model/idx/DocumentFlags.java
@@ -9,7 +9,7 @@ public enum DocumentFlags {
    GeneratorForum,
    GeneratorWiki,
    Sideloaded,
-    Unused7,
+    PdfFile,
    Unused8,
    ;

--- a/code/common/model/test/nu/marginalia/model/EdgeDomainTest.java
+++ b/code/common/model/test/nu/marginalia/model/EdgeDomainTest.java
@@ -8,14 +8,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;

 class EdgeDomainTest {

-    @Test
-    public void testSkepdic() throws URISyntaxException {
-        var domain = new EdgeUrl("http://www.skepdic.com/astrology.html");
-        assertEquals("skepdic", domain.getDomain().getDomainKey());
-        var domain2 = new EdgeUrl("http://skepdic.com/astrology.html");
-        assertEquals("skepdic", domain2.getDomain().getDomainKey());
-    }
-
    @Test
    public void testHkDomain() throws URISyntaxException {
        var domain = new EdgeUrl("http://l7072i3.l7c.net");
--- a/code/common/model/test/nu/marginalia/model/EdgeUrlTest.java
+++ b/code/common/model/test/nu/marginalia/model/EdgeUrlTest.java
@@ -1,6 +1,6 @@
 package nu.marginalia.model;

-import nu.marginalia.model.EdgeUrl;
+import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;

 import java.net.URISyntaxException;
@@ -21,25 +21,70 @@ class EdgeUrlTest {
                new EdgeUrl("https://memex.marginalia.nu/#here")
        );
    }
+
    @Test
-    public void testParam() throws URISyntaxException {
-        System.out.println(new EdgeUrl("https://memex.marginalia.nu/index.php?id=1").toString());
-        System.out.println(new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
-    }
-    @Test
-    void urlencodeFixer() throws URISyntaxException {
-        System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/#heredoc"));
-        System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign"));
-        System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%22-sign"));
-        System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/\n \"huh\""));
+    void testUriFromString() throws URISyntaxException {
+        // We test these URLs several times, as we perform URLEncode-fixing both when parsing the URL and when
+        // converting it back to a string, and we want to ensure there are no changes along the way.
+
+        Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/").getPath());
+        Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/").toString());
+        Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/").toString());
+
+        Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").getPath());
+        Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").toString());
+        Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/#heredoc").toString());
+
+        Assertions.assertEquals("/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").getPath());
+        Assertions.assertEquals("https://www.example.com/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").toString());
+        Assertions.assertEquals("https://www.example.com/trailingslash/", new EdgeUrl("https://www.example.com/trailingslash/").toString());
+
+        Assertions.assertEquals("/%-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").getPath());
+        Assertions.assertEquals("https://www.example.com/%25-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").toString());
+        Assertions.assertEquals("https://www.example.com/%25-sign", new EdgeUrl("https://www.example.com/%-sign").toString());
+
+        Assertions.assertEquals("/%-sign/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com//%-sign/\"-sign").getPath());
+        Assertions.assertEquals("https://www.example.com/%25-sign/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com//%-sign/\"-sign").toString());
+        Assertions.assertEquals("https://www.example.com/%25-sign/%22-sign", new EdgeUrl("https://www.example.com//%-sign/\"-sign").toString());
+
+        Assertions.assertEquals("/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").getPath());
+        Assertions.assertEquals("https://www.example.com/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").toString());
+        Assertions.assertEquals("https://www.example.com/%22-sign", new EdgeUrl("https://www.example.com/%22-sign").toString());
+
+        Assertions.assertEquals("/\n \"huh\"", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").getPath());
+        Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").toString());
+        Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", new EdgeUrl("https://www.example.com/\n \"huh\"").toString());
+
+        Assertions.assertEquals("/wiki/Sámi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").getPath());
+        Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").toString());
+        Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", new EdgeUrl("https://en.wikipedia.org/wiki/Sámi").toString());
+
+        Assertions.assertEquals("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k", new EdgeUrl("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k").toString());
    }

    @Test
    void testParms() throws URISyntaxException {
-        System.out.println(new EdgeUrl("https://search.marginalia.nu/?id=123"));
-        System.out.println(new EdgeUrl("https://search.marginalia.nu/?t=123"));
-        System.out.println(new EdgeUrl("https://search.marginalia.nu/?v=123"));
-        System.out.println(new EdgeUrl("https://search.marginalia.nu/?m=123"));
-        System.out.println(new EdgeUrl("https://search.marginalia.nu/?follow=123"));
+        Assertions.assertEquals("id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").param);
+        Assertions.assertEquals("https://search.marginalia.nu/?id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").toString());
+
+        Assertions.assertEquals("t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").param);
+        Assertions.assertEquals("https://search.marginalia.nu/?t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").toString());
+
+        Assertions.assertEquals("v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").param);
+        Assertions.assertEquals("https://search.marginalia.nu/?v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").toString());
+
+        Assertions.assertEquals("id=1", new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").param);
+        Assertions.assertEquals("https://memex.marginalia.nu/showthread.php?id=1",
+                new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
+
+
+        Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").param);
+        Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").toString());
+
+        Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").param);
+        Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").toString());
+
+        Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?m=123").param);
+        Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?follow=123").param);
    }
 }
--- a/code/common/service/java/nu/marginalia/process/control/ProcessAdHocTaskHeartbeatImpl.java
+++ b/code/common/service/java/nu/marginalia/process/control/ProcessAdHocTaskHeartbeatImpl.java
@@ -59,16 +59,13 @@ public class ProcessAdHocTaskHeartbeatImpl implements AutoCloseable, ProcessAdHo
     */
    @Override
    public void progress(String step, int stepProgress, int stepCount) {
+        int lastProgress = this.progress;
        this.step = step;
-
-
-        // off by one since we calculate the progress based on the number of steps,
-        // and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
-        // final progress being 80% and not 100%)
-
        this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);

-        logger.info("ProcessTask {} progress: {}%", taskBase, progress);
+        if (this.progress / 10 != lastProgress / 10) {
+            logger.info("ProcessTask {} progress: {}%", taskBase, progress);
+        }
    }

    /** Wrap a collection to provide heartbeat progress updates as it's iterated through */
--- a/code/common/service/java/nu/marginalia/service/control/ServiceAdHocTaskHeartbeatImpl.java
+++ b/code/common/service/java/nu/marginalia/service/control/ServiceAdHocTaskHeartbeatImpl.java
@@ -57,16 +57,13 @@ public class ServiceAdHocTaskHeartbeatImpl implements AutoCloseable, ServiceAdHo
     */
    @Override
    public void progress(String step, int stepProgress, int stepCount) {
+        int lastProgress = this.progress;
        this.step = step;
-
-
-        // off by one since we calculate the progress based on the number of steps,
-        // and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
-        // final progress being 80% and not 100%)
-
        this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);

-        logger.info("ServiceTask {} progress: {}%", taskBase, progress);
+        if (this.progress / 10 != lastProgress / 10) { // only log when entering a new 10% bracket
+            logger.info("ServiceTask {} progress: {}%", taskBase, progress);
+        }
    }

    public void shutDown() {
--- a/code/common/service/java/nu/marginalia/service/server/JoobyService.java
+++ b/code/common/service/java/nu/marginalia/service/server/JoobyService.java
@@ -122,6 +122,11 @@ public class JoobyService {
        // single digit percentage difference since HTML already compresses very well with level = 1.
        options.setCompressionLevel(1);

+        // Set a cap on the number of worker threads, as Jooby's default value does not seem to consider
+        // multi-tenant servers with high thread counts, and spins up an exorbitant number of threads in that
+        // scenario
+        options.setWorkerThreads(Math.min(128, options.getWorkerThreads()));
+

        jooby.setServerOptions(options);

--- a/code/common/service/resources/log4j2-json.xml
+++ b/code/common/service/resources/log4j2-json.xml
@@ -3,11 +3,18 @@
        <Console name="Console" target="SYSTEM_OUT">
            <PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1}  --  %msg%n"/>
            <Filters>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
            </Filters>
        </Console>
+        <Console name="ProcessConsole" target="SYSTEM_OUT">
+            <PatternLayout pattern="%style{P}{FG_Cyan} %msg%n"/>
+            <Filters>
+                <MarkerFilter marker="PROCESS" onMatch="ACCEPT" onMismatch="DENY" />
+            </Filters>
+        </Console>
        <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
                     ignoreExceptions="false">
            <JSONLayout compact="true" eventEol="true" properties="true" stacktraceAsString="true" includeTimeMillis="true"/>
@@ -15,6 +22,7 @@
                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
            </Filters>
            <SizeBasedTriggeringPolicy size="10MB" />
        </RollingFile>
@@ -31,9 +39,11 @@
    </Appenders>
    <Loggers>
        <Logger name="org.apache.zookeeper" level="WARN" />
-
+        <Logger name="org.apache.pdfbox" level="ERROR" />
+        <Logger name="org.apache.fontbox.ttf" level="ERROR" />
        <Root level="info">
            <AppenderRef ref="Console"/>
+            <AppenderRef ref="ProcessConsole"/>
            <AppenderRef ref="LogToFile"/>
        </Root>
    </Loggers>
--- a/code/common/service/resources/log4j2-prod.xml
+++ b/code/common/service/resources/log4j2-prod.xml
@@ -1,13 +1,51 @@
 <Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
    <Appenders>
-        <Console name="Console" target="SYSTEM_OUT">
-            <PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1}  --  %msg%n"/>
+        <Console name="ConsoleInfo" target="SYSTEM_OUT">
+            <PatternLayout pattern="- %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
            <Filters>
+                <LevelMatchFilter level="INFO" onMatch="NEUTRAL" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
            </Filters>
        </Console>
+        <Console name="ConsoleWarn" target="SYSTEM_OUT">
+            <PatternLayout pattern="⚠ %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="WARN" onMatch="NEUTRAL" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleError" target="SYSTEM_OUT">
+            <PatternLayout pattern="🔥 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="ERROR" onMatch="NEUTRAL" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleFatal" target="SYSTEM_OUT">
+            <PatternLayout pattern="💀 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="FATAL" onMatch="NEUTRAL" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ProcessConsole" target="SYSTEM_OUT">
+            <PatternLayout pattern="%style{%msg%n}{FG_Cyan}"/>
+            <Filters>
+                <MarkerFilter marker="PROCESS" onMatch="ACCEPT" onMismatch="DENY" />
+            </Filters>
+        </Console>
        <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
                     ignoreExceptions="false">
            <PatternLayout>
@@ -34,9 +72,14 @@
    </Appenders>
    <Loggers>
        <Logger name="org.apache.zookeeper" level="WARN" />
-
+        <Logger name="org.apache.pdfbox" level="ERROR" />
+        <Logger name="org.apache.fontbox.ttf" level="ERROR" />
        <Root level="info">
-            <AppenderRef ref="Console"/>
+            <AppenderRef ref="ConsoleInfo"/>
+            <AppenderRef ref="ConsoleWarn"/>
+            <AppenderRef ref="ConsoleError"/>
+            <AppenderRef ref="ConsoleFatal"/>
+            <AppenderRef ref="ProcessConsole"/>
            <AppenderRef ref="LogToFile"/>
        </Root>
    </Loggers>
--- a/code/common/service/resources/log4j2-test.xml
+++ b/code/common/service/resources/log4j2-test.xml
@@ -1,15 +1,50 @@
 <Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
    <Appenders>
-        <Console name="Console" target="SYSTEM_OUT">
-            <PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1}  --  %msg%n"/>
+        <Console name="ConsoleInfo" target="SYSTEM_OUT">
+            <PatternLayout pattern="- %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="INFO" onMatch="NEUTRAL" onMismatch="DENY"/> <!-- NEUTRAL (ALLOW is not a valid filter result): a level match must still pass the marker filter below -->
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleWarn" target="SYSTEM_OUT">
+            <PatternLayout pattern="⚠ %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="WARN" onMatch="NEUTRAL" onMismatch="DENY"/> <!-- NEUTRAL (ALLOW is not a valid filter result): a level match must still pass the marker filter below -->
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleError" target="SYSTEM_OUT">
+            <PatternLayout pattern="🔥 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="ERROR" onMatch="NEUTRAL" onMismatch="DENY"/> <!-- NEUTRAL (ALLOW is not a valid filter result): a level match must still pass the marker filter below -->
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleFatal" target="SYSTEM_OUT">
+            <PatternLayout pattern="💀 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="FATAL" onMatch="NEUTRAL" onMismatch="DENY"/> <!-- NEUTRAL (ALLOW is not a valid filter result): a level match must still pass the marker filter below -->
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ProcessConsole" target="SYSTEM_OUT">
+            <PatternLayout pattern="%style{%msg%n}{FG_Cyan}"/>
+            <Filters>
+                <MarkerFilter marker="PROCESS" onMatch="ACCEPT" onMismatch="DENY" /> <!-- ACCEPT is the valid Log4j2 result name; ALLOW is not one -->
+            </Filters>
        </Console>
    </Appenders>
    <Loggers>
        <Logger name="org.apache.zookeeper" level="WARN" />
-
+        <Logger name="org.apache.pdfbox" level="ERROR" />
+        <Logger name="org.apache.fontbox.ttf" level="ERROR" />
        <Root level="info">
-            <AppenderRef ref="Console"/>
-            <AppenderRef ref="LogToFile"/>
+            <AppenderRef ref="ConsoleInfo"/>
+            <AppenderRef ref="ConsoleWarn"/>
+            <AppenderRef ref="ConsoleError"/>
+            <AppenderRef ref="ConsoleFatal"/>
+            <AppenderRef ref="ProcessConsole"/>
        </Root>
    </Loggers>
 </Configuration>
--- a/code/execution/api/java/nu/marginalia/executor/client/ExecutorExportClient.java
+++ b/code/execution/api/java/nu/marginalia/executor/client/ExecutorExportClient.java
@@ -48,12 +48,13 @@ public class ExecutorExportClient {
        return msgId;
    }

-    public void exportSampleData(int node, FileStorageId fid, int size, String name) {
+    public void exportSampleData(int node, FileStorageId fid, int size, String ctFilter, String name) {
        channelPool.call(ExecutorExportApiBlockingStub::exportSampleData)
                .forNode(node)
                .run(RpcExportSampleData.newBuilder()
                        .setFileStorageId(fid.id())
                        .setSize(size)
+                        .setCtFilter(ctFilter)
                        .setName(name)
                        .build());
    }
--- a/code/execution/api/src/main/protobuf/executor-api.proto
+++ b/code/execution/api/src/main/protobuf/executor-api.proto
@@ -100,6 +100,7 @@ message RpcExportSampleData {
  int64 fileStorageId = 1;
  int32 size = 2;
  string name = 3;
+  string ctFilter = 4;
 }
 message RpcDownloadSampleData {
  string sampleSet = 1;
--- a/code/execution/java/nu/marginalia/actor/task/DownloadSampleActor.java
+++ b/code/execution/java/nu/marginalia/actor/task/DownloadSampleActor.java
@@ -8,6 +8,7 @@ import nu.marginalia.actor.state.ActorResumeBehavior;
 import nu.marginalia.actor.state.ActorStep;
 import nu.marginalia.actor.state.Resume;
 import nu.marginalia.service.control.ServiceEventLog;
+import nu.marginalia.service.control.ServiceHeartbeat;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorage;
 import nu.marginalia.storage.model.FileStorageId;
@@ -19,6 +20,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.io.*;
+import java.net.HttpURLConnection;
 import java.net.MalformedURLException;
 import java.net.URI;
 import java.net.URL;
@@ -32,6 +34,7 @@ public class DownloadSampleActor extends RecordActorPrototype {

    private final FileStorageService storageService;
    private final ServiceEventLog eventLog;
+    private final ServiceHeartbeat heartbeat;
    private final Logger logger = LoggerFactory.getLogger(getClass());

    @Resume(behavior = ActorResumeBehavior.ERROR)
@@ -66,15 +69,39 @@ public class DownloadSampleActor extends RecordActorPrototype {

                Files.deleteIfExists(Path.of(tarFileName));

-                try (var is = new BufferedInputStream(new URI(downloadURI).toURL().openStream());
-                     var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
-                    is.transferTo(os);
+                HttpURLConnection urlConnection = (HttpURLConnection) new URI(downloadURI).toURL().openConnection();
+
+                try (var hb = heartbeat.createServiceAdHocTaskHeartbeat("Downloading sample")) {
+                    // Content-Length is -1 when the server does not announce it (e.g. chunked transfer)
+                    long size = urlConnection.getContentLengthLong();
+                    byte[] buffer = new byte[8192];
+
+                    try (var is = new BufferedInputStream(urlConnection.getInputStream());
+                         var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
+                        long copiedSize = 0;
+
+                        // With an unknown length we copy until EOF instead of silently producing an empty file
+                        while (size < 0 || copiedSize < size) {
+                            int read = is.read(buffer);
+                            if (read < 0 && size < 0) break; // EOF is the regular end of an unsized stream
+                            if (read < 0) // We've been promised a file of length 'size'
+                                throw new IOException("Unexpected end of stream");
+
+                            os.write(buffer, 0, read);
+                            copiedSize += read;
+                            // Update progress bar; for unsized streams the total simply tracks what was copied so far
+                            hb.progress(String.format("%d MB", copiedSize / 1024 / 1024), (int) (copiedSize / 1024), (int) (Math.max(size, copiedSize) / 1024));
+                        }
+                    }
                }
                catch (Exception ex) {
                    eventLog.logEvent(DownloadSampleActor.class, "Error downloading sample");
                    logger.error("Error downloading sample", ex);
                    yield new Error();
                }
+                finally {
+                    urlConnection.disconnect();
+                }

                eventLog.logEvent(DownloadSampleActor.class, "Download complete");
                yield new Extract(fileStorageId, tarFileName);
@@ -170,11 +197,12 @@ public class DownloadSampleActor extends RecordActorPrototype {
    @Inject
    public DownloadSampleActor(Gson gson,
                               FileStorageService storageService,
-                               ServiceEventLog eventLog)
+                               ServiceEventLog eventLog, ServiceHeartbeat heartbeat)
    {
        super(gson);
        this.storageService = storageService;
        this.eventLog = eventLog;
+        this.heartbeat = heartbeat;
    }

 }
--- a/code/execution/java/nu/marginalia/actor/task/ExportSampleDataActor.java
+++ b/code/execution/java/nu/marginalia/actor/task/ExportSampleDataActor.java
@@ -26,32 +26,32 @@ public class ExportSampleDataActor extends RecordActorPrototype {
    private final MqOutbox exportTasksOutbox;
    private final Logger logger = LoggerFactory.getLogger(getClass());

-    public record Export(FileStorageId crawlId, int size, String name) implements ActorStep {}
-    public record Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) implements ActorStep {
-        public Run(FileStorageId crawlId, FileStorageId destId, int size, String name) {
-            this(crawlId, destId, size, name, -1);
+    public record Export(FileStorageId crawlId, int size, String ctFilter, String name) implements ActorStep {}
+    public record Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name, long msgId) implements ActorStep {
+        public Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name) {
+            this(crawlId, destId, size, ctFilter, name, -1); // a negative msgId marks the request as not yet sent
        }
    }

    @Override
    public ActorStep transition(ActorStep self) throws Exception {
        return switch(self) {
-            case Export(FileStorageId crawlId, int size, String name) -> {
+            case Export(FileStorageId crawlId, int size, String ctFilter, String name) -> {
                var storage = storageService.allocateStorage(FileStorageType.EXPORT,
                        "crawl-sample-export",
                        "Crawl Data Sample " + name + "/" + size + " " + LocalDateTime.now()
                );

                if (storage == null) yield new Error("Bad storage id");
-                yield new Run(crawlId, storage.id(), size, name);
+                yield new Run(crawlId, storage.id(), size, ctFilter, name);
            }
-            case Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) when msgId < 0 -> {
+            case Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name, long msgId) when msgId < 0 -> {
                storageService.setFileStorageState(destId, FileStorageState.NEW);

-                long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, size, name));
-                yield new Run(crawlId, destId, size, name, newMsgId);
+                long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, ctFilter, size, name));
+                yield new Run(crawlId, destId, size, ctFilter, name, newMsgId);
            }
-            case Run(_, FileStorageId destId, _, _, long msgId) -> {
+            case Run(_, FileStorageId destId, _, _, _, long msgId) -> {
                var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);

                if (rsp.state() != MqMessageState.OK) {
@@ -70,7 +70,7 @@ public class ExportSampleDataActor extends RecordActorPrototype {

    @Override
    public String describe() {
-        return "Export RSS/Atom feeds from crawl data";
+        return "Export sample crawl data";
    }

    @Inject
--- a/code/execution/java/nu/marginalia/execution/ExecutorExportGrpcService.java
+++ b/code/execution/java/nu/marginalia/execution/ExecutorExportGrpcService.java
@@ -49,6 +49,7 @@ public class ExecutorExportGrpcService
                    new ExportSampleDataActor.Export(
                            FileStorageId.of(request.getFileStorageId()),
                            request.getSize(),
+                            request.getCtFilter(),
                            request.getName()
                    )
            );
--- a/code/functions/live-capture/build.gradle
+++ b/code/functions/live-capture/build.gradle
@@ -25,9 +25,9 @@ dependencies {

    implementation project(':code:execution:api')
    implementation project(':code:processes:crawling-process:ft-content-type')
+    implementation project(':third-party:rssreader')

    implementation libs.jsoup
-    implementation project(':third-party:rssreader')
    implementation libs.opencsv
    implementation libs.slop
    implementation libs.sqlite
@@ -57,8 +57,6 @@ dependencies {
    implementation libs.bundles.gson
    implementation libs.bundles.mariadb

-
-
    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
--- a/code/functions/live-capture/java/nu/marginalia/domsample/DomSampleService.java
+++ b/code/functions/live-capture/java/nu/marginalia/domsample/DomSampleService.java
@@ -0,0 +1,119 @@
+package nu.marginalia.domsample;
+
+import com.google.inject.Inject;
+import com.zaxxer.hikari.HikariDataSource;
+import jakarta.inject.Named;
+import nu.marginalia.domsample.db.DomSampleDb;
+import nu.marginalia.livecapture.BrowserlessClient;
+import nu.marginalia.service.module.ServiceConfiguration;
+import org.apache.commons.lang3.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+public class DomSampleService {
+    private final DomSampleDb db;
+    private final HikariDataSource mariadbDataSource;
+    private final URI browserlessURI;
+
+    private static final Logger logger = LoggerFactory.getLogger(DomSampleService.class);
+
+    @Inject
+    public DomSampleService(DomSampleDb db,
+                            HikariDataSource mariadbDataSource,
+                            @Named("browserless-uri") String browserlessAddress,
+                            ServiceConfiguration serviceConfiguration)
+            throws URISyntaxException
+    {
+        this.db = db;
+        this.mariadbDataSource = mariadbDataSource;
+        // Sampling only runs when a browserless address is configured and node() is at most 1
+        if (StringUtils.isEmpty(browserlessAddress) || serviceConfiguration.node() > 1) {
+            logger.warn("Live capture service will not run");
+            browserlessURI = null; // satisfy final
+        }
+        else {
+            browserlessURI = new URI(browserlessAddress);
+            // Every field is assigned at this point, before the background thread starts reading them
+            Thread.ofPlatform().daemon().start(this::run);
+        }
+    }
+
+    /** Mirrors the EC_DOMAIN rows with NODE_AFFINITY>0 into the sampling schedule held by the DomSampleDb. */
+    public void syncDomains() {
+        logger.info("Fetching domains from database...");
+
+        final Set<String> assignedDomains = new HashSet<>();
+        final String query = """
+                SELECT DOMAIN_NAME
+                FROM EC_DOMAIN
+                WHERE NODE_AFFINITY>0
+                """;
+
+        try (var conn = mariadbDataSource.getConnection();
+             var stmt = conn.prepareStatement(query);
+             var rs = stmt.executeQuery())
+        {
+            while (rs.next()) {
+                assignedDomains.add(rs.getString("DOMAIN_NAME"));
+            }
+        } catch (Exception e) {
+            throw new RuntimeException("Failed to sync domains", e);
+        }
+
+        logger.info("Found {} domains in database", assignedDomains.size());
+        db.syncDomains(assignedDomains);
+        logger.info("Synced domains to sqlite");
+    }
+
+    public void run() {
+        // Background loop: refresh the schedule, then sample every scheduled domain, until interrupted
+        try (var client = new BrowserlessClient(browserlessURI)) {
+
+            while (!Thread.currentThread().isInterrupted()) {
+
+                try {
+                    // Grace sleep in case we're operating on an empty domain list
+                    TimeUnit.SECONDS.sleep(15);
+                    // Re-sync on every pass so the schedule follows the domain database
+                    syncDomains();
+                    var domains = db.getScheduledDomains();
+
+                    for (var domain : domains) {
+                        updateDomain(client, domain);
+                    }
+                } catch (InterruptedException e) {
+                    Thread.currentThread().interrupt();
+                    logger.info("DomSampleService interrupted, stopping...");
+                    return;
+                } catch (Exception e) {
+                    logger.error("Error in DomSampleService run loop", e); // log, then try again on the next pass
+                }
+            }
+
+        }
+    }
+
+    /** Samples the domain's https front page via browserless; the domain is flagged as fetched even on failure. */
+    private void updateDomain(BrowserlessClient client, String domain) throws InterruptedException {
+        var rootUrl = "https://" + domain + "/";
+        try {
+            var content = client.annotatedContent(rootUrl, BrowserlessClient.GotoOptions.defaultValues());
+            if (content.isPresent()) {
+                db.saveSample(domain, rootUrl, content.get());
+            }
+        } catch (InterruptedException e) {
+            throw e; // stop request: must not be swallowed below, run() restores the flag and exits
+        } catch (Exception e) {
+            logger.error("Failed to process domain: {}", domain, e);
+        } finally {
+            db.flagDomainAsFetched(domain);
+        }
+    }
+
+}
--- a/code/functions/live-capture/java/nu/marginalia/domsample/db/DomSampleDb.java
+++ b/code/functions/live-capture/java/nu/marginalia/domsample/db/DomSampleDb.java
@@ -0,0 +1,174 @@
+package nu.marginalia.domsample.db;
+
+import nu.marginalia.WmsaHome;
+import org.jsoup.Jsoup;
+
+import java.nio.file.Path;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.SQLException;
+import java.util.*;
+
+public class DomSampleDb implements AutoCloseable {
+    private static final String dbFileName = "dom-sample.db";
+    private final Connection connection;
+
+    public DomSampleDb() throws SQLException{
+        this(WmsaHome.getDataPath().resolve(dbFileName));
+    }
+
+    public DomSampleDb(Path dbPath) throws SQLException {
+        String dbUrl = "jdbc:sqlite:" + dbPath.toAbsolutePath();
+        // A single connection is opened here and kept until close() is called
+        connection = DriverManager.getConnection(dbUrl);
+        // Schema setup is safe to repeat on every start: each statement uses IF NOT EXISTS
+        try (var stmt = connection.createStatement()) {
+            stmt.executeUpdate("CREATE TABLE IF NOT EXISTS samples (url TEXT PRIMARY KEY, domain TEXT, sample BLOB, requests BLOB, accepted_popover BOOLEAN DEFAULT FALSE)");
+            stmt.executeUpdate("CREATE INDEX IF NOT EXISTS domain_index ON samples (domain)");
+            stmt.executeUpdate("CREATE TABLE IF NOT EXISTS schedule (domain TEXT PRIMARY KEY, last_fetch TIMESTAMP DEFAULT NULL)");
+        }
+    }
+
+    /** Reconciles the schedule table with the given set: drops domains no longer present, adds the new ones. */
+    public void syncDomains(Set<String> domains) {
+        Set<String> currentDomains = new HashSet<>();
+        try (var stmt = connection.prepareStatement("SELECT domain FROM schedule");
+             var rs = stmt.executeQuery()) {
+            while (rs.next()) {
+                currentDomains.add(rs.getString("domain"));
+            }
+        } catch (SQLException e) {
+            throw new RuntimeException("Failed to sync domains", e);
+        }
+
+        // Only rows that differ are touched, so last_fetch is kept for domains that stay scheduled
+        Set<String> toRemove = new HashSet<>(currentDomains);
+        Set<String> toAdd = new HashSet<>(domains);
+        toRemove.removeAll(domains);
+        toAdd.removeAll(currentDomains);
+
+        try (var removeStmt = connection.prepareStatement("DELETE FROM schedule WHERE domain = ?");
+             var addStmt = connection.prepareStatement("INSERT OR IGNORE INTO schedule (domain) VALUES (?)")) {
+            for (String domain : toRemove) {
+                removeStmt.setString(1, domain);
+                removeStmt.executeUpdate();
+            }
+
+            for (String domain : toAdd) {
+                addStmt.setString(1, domain);
+                addStmt.executeUpdate();
+            }
+        } catch (SQLException e) {
+            throw new RuntimeException("Failed to update schedule", e); // covers removals and additions alike
+        }
+    }
+
+    /** Lists scheduled domains, never-fetched first, then oldest fetch first; rowid breaks same-second ties. */
+    public List<String> getScheduledDomains() {
+        List<String> domains = new ArrayList<>();
+        try (var stmt = connection.prepareStatement("SELECT domain FROM schedule ORDER BY last_fetch IS NULL DESC, last_fetch ASC, rowid ASC")) {
+            for (var rs = stmt.executeQuery(); rs.next(); ) {
+                domains.add(rs.getString("domain"));
+            }
+        } catch (SQLException e) {
+            throw new RuntimeException("Failed to get scheduled domains", e);
+        }
+        return domains;
+    }
+
+    public void flagDomainAsFetched(String domain) {
+        try (var stmt = connection.prepareStatement("INSERT OR REPLACE INTO schedule (domain, last_fetch) VALUES (?, CURRENT_TIMESTAMP)")) {
+            stmt.setString(1, domain);
+            stmt.executeUpdate(); // REPLACE: the row is rewritten with the current time, or created if the domain was not scheduled
+        } catch (SQLException e) {
+            throw new RuntimeException("Failed to flag domain as fetched", e);
+        }
+    }
+
+
+    public record Sample(String url, String domain, String sample, String requests, boolean acceptedPopover) {}
+
+    /** Loads every stored sample for the given domain; the list is empty when there are none. */
+    public List<Sample> getSamples(String domain) throws SQLException {
+        final String query = """
+                SELECT url, sample, requests, accepted_popover
+                FROM samples
+                WHERE domain = ?
+                """;
+
+        List<Sample> results = new ArrayList<>();
+        try (var stmt = connection.prepareStatement(query)) {
+            stmt.setString(1, domain);
+
+            var rs = stmt.executeQuery();
+            while (rs.next()) {
+                String url = rs.getString("url");
+                String sample = rs.getString("sample");
+                String requests = rs.getString("requests");
+                boolean acceptedPopover = rs.getBoolean("accepted_popover");
+
+                results.add(new Sample(url, domain, sample, requests, acceptedPopover));
+            }
+        }
+
+        return results;
+    }
+
+    /**
+     * Splits annotated page content into the DOM sample and a TSV of its recorded network requests
+     * (method, timestamp, url per line), then stores both via saveSampleRaw.
+     */
+    public void saveSample(String domain, String url, String rawContent) throws SQLException {
+        var doc = Jsoup.parse(rawContent);
+
+        var networkRequests = doc.getElementById("marginalia-network-requests");
+
+        boolean acceptedPopover = false;
+
+        StringBuilder requestTsv = new StringBuilder();
+        if (networkRequests != null) {
+            acceptedPopover = !networkRequests.getElementsByClass("marginalia-agreed-cookies").isEmpty();
+
+            for (var request : networkRequests.getElementsByClass("network-request")) {
+                // Tabs and line breaks inside a field would corrupt the TSV, so flatten them to spaces
+                String method = request.attr("data-method").replaceAll("[\\t\\r\\n]", " ");
+                String urlAttr = request.attr("data-url").replaceAll("[\\t\\r\\n]", " ");
+                String timestamp = request.attr("data-timestamp").replaceAll("[\\t\\r\\n]", " ");
+
+                requestTsv.append(method).append('\t').append(timestamp).append('\t')
+                        .append(urlAttr).append("\n");
+            }
+
+            // The request list has been extracted above; drop the element from the stored sample
+            networkRequests.remove();
+        }
+
+        doc.body().removeAttr("id");
+
+        String sample = doc.html();
+
+        saveSampleRaw(domain, url, sample, requestTsv.toString().trim(), acceptedPopover);
+
+    }
+
+    record Request(String url, String method, String timestamp, boolean acceptedPopover) {}
+
+    public void saveSampleRaw(String domain, String url, String sample, String requests, boolean acceptedPopover) throws SQLException {
+        try (var stmt = connection.prepareStatement("""
+                INSERT OR REPLACE 
+                INTO samples (domain, url, sample, requests, accepted_popover) 
+                VALUES (?, ?, ?, ?, ?)
+                """)) {
+            stmt.setString(1, domain);
+            stmt.setString(2, url);
+            stmt.setString(3, sample);
+            stmt.setString(4, requests);
+            stmt.setBoolean(5, acceptedPopover);
+            stmt.executeUpdate(); // url is the primary key of samples, so an existing row for the same url is replaced
+        }
+    }
+
+    public void close() throws SQLException {
+        connection.close(); // releases the connection opened in the constructor
+    }
+}
--- a/code/functions/live-capture/java/nu/marginalia/livecapture/BrowserlessClient.java
+++ b/code/functions/live-capture/java/nu/marginalia/livecapture/BrowserlessClient.java
@@ -8,10 +8,13 @@ import org.slf4j.LoggerFactory;

 import java.io.IOException;
 import java.net.URI;
+import java.net.URLEncoder;
 import java.net.http.HttpClient;
 import java.net.http.HttpRequest;
 import java.net.http.HttpResponse;
+import java.nio.charset.StandardCharsets;
 import java.time.Duration;
+import java.util.List;
 import java.util.Map;
 import java.util.Optional;

@@ -60,6 +63,42 @@ public class BrowserlessClient implements AutoCloseable {
        return Optional.of(rsp.body());
    }

+    /** Fetches the content of the given url with a marginalia hack extension loaded that decorates the DOM with
+     * attributes for certain CSS properties, making popovers and other nuisance elements easier to identify.
+     * Returns an empty Optional when browserless answers with a status of 300 or above. */
+    public Optional<String> annotatedContent(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
+        Map<String, Object> requestData = Map.of(
+                "url", url,
+                "userAgent", userAgent,
+                "gotoOptions", gotoOptions,
+                "waitForSelector", Map.of("selector", "#marginaliahack", "timeout", 15000)
+        );
+
+        // Launch parameters for the browserless instance to load the extension
+        Map<String, Object> launchParameters = Map.of(
+                "args", List.of("--load-extension=/dom-export")
+        );
+        // The launch options are passed as URL-encoded JSON in the "launch" query parameter
+        String launchParametersStr = URLEncoder.encode(gson.toJson(launchParameters), StandardCharsets.UTF_8);
+
+        var request = HttpRequest.newBuilder()
+                .uri(browserlessURI.resolve("/content?token="+BROWSERLESS_TOKEN+"&launch="+launchParametersStr))
+                .method("POST", HttpRequest.BodyPublishers.ofString(
+                        gson.toJson(requestData)
+                ))
+                .header("Content-type", "application/json")
+                .build();
+
+        var rsp = httpClient.send(request, HttpResponse.BodyHandlers.ofString());
+        // Statuses of 300 and above are reported as "no content" rather than as an exception
+        if (rsp.statusCode() >= 300) {
+            logger.info("Failed to fetch annotated content for {}, status {}", url, rsp.statusCode());
+            return Optional.empty();
+        }
+
+        return Optional.of(rsp.body());
+    }
+
    public byte[] screenshot(String url, GotoOptions gotoOptions, ScreenshotOptions screenshotOptions)
            throws IOException, InterruptedException {

@@ -102,7 +141,7 @@ public class BrowserlessClient implements AutoCloseable {

    public record GotoOptions(String waitUntil, long timeout) {
        public static GotoOptions defaultValues() {
-            return new GotoOptions("networkidle2", Duration.ofSeconds(10).toMillis());
+            return new GotoOptions("load", Duration.ofSeconds(10).toMillis());
        }
    }

--- a/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java
+++ b/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java
@@ -229,13 +229,15 @@ public class FeedFetcherService {
                    .timeout(Duration.ofSeconds(15))
                    ;

-            if (ifModifiedSinceDate != null) {
+            // Set the If-Modified-Since or If-None-Match headers if we have them
+            // though since there are certain idiosyncrasies in server implementations,
+            // we avoid setting both at the same time as that may turn a 304 into a 200.
+            if (ifNoneMatchTag != null) {
+                requestBuilder.header("If-None-Match", ifNoneMatchTag);
+            } else if (ifModifiedSinceDate != null) {
                requestBuilder.header("If-Modified-Since", ifModifiedSinceDate);
            }

-            if (ifNoneMatchTag != null) {
-                requestBuilder.header("If-None-Match", ifNoneMatchTag);
-            }

            HttpRequest getRequest = requestBuilder.build();

--- a/code/functions/live-capture/java/nu/marginalia/rss/svc/SimpleFeedParser.java
+++ b/code/functions/live-capture/java/nu/marginalia/rss/svc/SimpleFeedParser.java
@@ -79,9 +79,17 @@ public class SimpleFeedParser {
                if (!link.isBlank())
                    break;
                var tag = element.getElementsByTag(attr).first();
+
                if (tag != null) {
-                    link = tag.text();
+                    String linkText = tag.text();
+
+                    if (linkText.isBlank()) {
+                        linkText = tag.attr("href");
+                    }
+
+                    link = linkText;
                }
+
            }

            ret.add(new ItemData(title, description, link, pubDate));
--- a/code/functions/live-capture/test/nu/marginalia/domsample/db/DomSampleDbTest.java
+++ b/code/functions/live-capture/test/nu/marginalia/domsample/db/DomSampleDbTest.java
@@ -0,0 +1,113 @@
+package nu.marginalia.domsample.db;
+
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.testcontainers.shaded.org.apache.commons.io.FileUtils;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.*;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class DomSampleDbTest {
+    Path tempDir;
+
+    @BeforeEach
+    void setUp() throws Exception {
+        tempDir = Files.createTempDirectory("test");
+    }
+
+    @AfterEach
+    void tearDown() throws IOException {
+        FileUtils.deleteDirectory(tempDir.toFile());
+    }
+
+    @Test
+    public void testSetUp() {
+        var dbPath = tempDir.resolve("test.db");
+        try (var db = new DomSampleDb(dbPath)) {
+        }
+        catch (Exception e) {
+            fail("Failed to set up database: " + e.getMessage());
+        }
+    }
+
+    @Test
+    public void testSyncDomains() {
+        var dbPath = tempDir.resolve("test.db");
+        try (var db = new DomSampleDb(dbPath)) {
+
+            db.syncDomains(Set.of("example.com", "test.com", "foobar.com"));
+            assertEquals(Set.of("example.com", "test.com", "foobar.com"), new HashSet<>(db.getScheduledDomains()));
+            db.syncDomains(Set.of("example.com", "test.com"));
+            assertEquals(Set.of("example.com", "test.com"), new HashSet<>(db.getScheduledDomains()));
+            db.syncDomains(Set.of("foobar.com", "test.com"));
+            assertEquals(Set.of("foobar.com", "test.com"), new HashSet<>(db.getScheduledDomains()));
+        }
+        catch (Exception e) {
+            fail("Failed to sync domains: " + e.getMessage());
+        }
+    }
+
+    @Test
+    public void testFetchDomains() {
+        var dbPath = tempDir.resolve("test.db");
+        try (var db = new DomSampleDb(dbPath)) {
+            // NOTE(review): the flags below will usually share one CURRENT_TIMESTAMP second, so the expected order depends on how getScheduledDomains orders equal timestamps; confirm this is stable
+            db.syncDomains(Set.of("example.com", "test.com", "foobar.com"));
+            db.flagDomainAsFetched("example.com");
+            db.flagDomainAsFetched("test.com");
+            db.flagDomainAsFetched("foobar.com");
+            assertEquals(List.of("example.com", "test.com", "foobar.com"), db.getScheduledDomains());
+            db.flagDomainAsFetched("test.com");
+            assertEquals(List.of("example.com", "foobar.com", "test.com"), db.getScheduledDomains());
+        }
+        catch (Exception e) {
+            fail("Failed to sync domains: " + e.getMessage());
+        }
+    }
+
+    @Test
+    public void saveLoadSingle() {
+        var dbPath = tempDir.resolve("test.db");
+        try (var db = new DomSampleDb(dbPath)) {
+            db.saveSampleRaw("example.com", "http://example.com/sample", "sample data", "requests data", true);
+            var samples = db.getSamples("example.com");
+            assertEquals(1, samples.size());
+            var sample = samples.getFirst();
+            assertEquals("example.com", sample.domain());
+            assertEquals("http://example.com/sample", sample.url());
+            assertEquals("sample data", sample.sample());
+            assertEquals("requests data", sample.requests());
+            assertTrue(sample.acceptedPopover());
+        }
+        catch (Exception e) {
+            fail("Failed to save/load sample: " + e.getMessage());
+        }
+    }
+
+    @Test
+    public void saveLoadTwo() {
+        var dbPath = tempDir.resolve("test.db");
+        try (var db = new DomSampleDb(dbPath)) {
+            db.saveSampleRaw("example.com", "http://example.com/sample", "sample data", "r1", true);
+            db.saveSampleRaw("example.com", "http://example.com/sample2", "sample data2", "r2", false);
+            var samples = db.getSamples("example.com");
+            assertEquals(2, samples.size());
+
+            Map<String, String> samplesByUrl = new HashMap<>();
+            for (var sample : samples) {
+                samplesByUrl.put(sample.url(), sample.sample());
+            }
+
+            assertEquals("sample data", samplesByUrl.get("http://example.com/sample"));
+            assertEquals("sample data2", samplesByUrl.get("http://example.com/sample2"));
+        }
+        catch (Exception e) {
+            fail("Failed to save/load sample: " + e.getMessage());
+        }
+    }
+}
--- a/code/functions/live-capture/test/nu/marginalia/livecapture/BrowserlessClientTest.java
+++ b/code/functions/live-capture/test/nu/marginalia/livecapture/BrowserlessClientTest.java
@@ -3,17 +3,21 @@ package nu.marginalia.livecapture;
 import com.github.tomakehurst.wiremock.WireMockServer;
 import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
 import nu.marginalia.WmsaHome;
+import nu.marginalia.domsample.db.DomSampleDb;
 import nu.marginalia.service.module.ServiceConfigurationModule;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Tag;
 import org.junit.jupiter.api.Test;
 import org.testcontainers.containers.GenericContainer;
+import org.testcontainers.images.PullPolicy;
 import org.testcontainers.junit.jupiter.Testcontainers;
 import org.testcontainers.utility.DockerImageName;

 import java.io.IOException;
 import java.net.URI;
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.util.Map;

 import static com.github.tomakehurst.wiremock.client.WireMock.*;
@@ -22,9 +26,14 @@ import static com.github.tomakehurst.wiremock.client.WireMock.*;
@Testcontainers
@Tag("slow")
 public class BrowserlessClientTest {
-    static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome"))
+    // Run gradle docker if this image is not available
+    static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("marginalia-browserless"))
            .withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
+            .withImagePullPolicy(PullPolicy.defaultPolicy())
            .withNetworkMode("bridge")
+            .withLogConsumer(frame -> {
+                System.out.print(frame.getUtf8String());
+            })
            .withExposedPorts(3000);

    static WireMockServer wireMockServer =
@@ -34,6 +43,7 @@ public class BrowserlessClientTest {
    static String localIp;

    static URI browserlessURI;
+    static URI browserlessWssURI;

    @BeforeAll
    public static void setup() throws IOException {
@@ -44,6 +54,12 @@ public class BrowserlessClientTest {
                container.getMappedPort(3000))
        );

+        browserlessWssURI = URI.create(String.format("ws://%s:%d/?token=BROWSERLESS_TOKEN",
+                container.getHost(),
+                container.getMappedPort(3000))
+        );
+
+
        wireMockServer.start();
        wireMockServer.stubFor(get("/").willReturn(aResponse().withStatus(200).withBody("Ok")));

@@ -85,6 +101,30 @@ public class BrowserlessClientTest {
        }
    }

+    /** Fetches annotated content for a page through browserless, stores it in a throwaway
+     * DomSampleDb, and prints what was persisted for the domain.
+     */
+    @Test
+    public void testAnnotatedContent() throws Exception {
+        // A unique file under the platform temp directory, instead of a fixed /tmp path,
+        // keeps the test portable and avoids collisions with stale or concurrent runs.
+        Path dbPath = Path.of(System.getProperty("java.io.tmpdir"),
+                "dom-sample-" + System.nanoTime() + ".db");
+
+        try (var client = new BrowserlessClient(browserlessURI);
+             DomSampleDb dbop = new DomSampleDb(dbPath)
+        ) {
+            String url = "https://marginalia.nu/";
+
+            var content = client.annotatedContent(url, BrowserlessClient.GotoOptions.defaultValues()).orElseThrow();
+
+            // Store the sample under the URL that was actually fetched
+            dbop.saveSample("marginalia.nu", url, content);
+            System.out.println(content);
+            Assertions.assertFalse(content.isBlank(), "Content should not be empty");
+
+            dbop.getSamples("marginalia.nu").forEach(sample -> {
+                System.out.println("Sample URL: " + sample.url());
+                System.out.println("Sample Content: " + sample.sample());
+                System.out.println("Sample Requests: " + sample.requests());
+                System.out.println("Accepted Popover: " + sample.acceptedPopover());
+            });
+        }
+        finally {
+            // Both resources are closed by the time this runs
+            Files.deleteIfExists(dbPath);
+        }
+
+    }
+
    @Test
    public void testScreenshot() throws Exception {
        try (var client = new BrowserlessClient(browserlessURI)) {
--- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java
+++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java
@@ -1,6 +1,7 @@
 package nu.marginalia.api.searchquery.model.results;

 import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import org.jetbrains.annotations.NotNull;

@@ -161,4 +162,14 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
    public String toString() {
        return "DecoratedSearchResultItem(rawIndexResult=" + this.getRawIndexResult() + ", url=" + this.getUrl() + ", title=" + this.getTitle() + ", description=" + this.getDescription() + ", urlQuality=" + this.getUrlQuality() + ", format=" + this.getFormat() + ", features=" + this.getFeatures() + ", pubYear=" + this.getPubYear() + ", dataHash=" + this.getDataHash() + ", wordsTotal=" + this.getWordsTotal() + ", bestPositions=" + this.getBestPositions() + ", rankingScore=" + this.getRankingScore() + ", resultsFromDomain=" + this.getResultsFromDomain() + ", rankingDetails=" + this.getRankingDetails() + ")";
    }
+
+    /** Returns the short display name of this result's document format.
+     *
+     * @return the {@code shortFormat} of the {@link DocumentFormat} named by {@code format},
+     *         or that of {@link DocumentFormat#UNKNOWN} when {@code format} is null or does
+     *         not name a known format
+     */
+    public String getShortFormat() {
+        // Enum.valueOf throws NullPointerException (not IllegalArgumentException) for a
+        // null name, so the missing-format case is handled explicitly.
+        if (format == null) {
+            return DocumentFormat.UNKNOWN.shortFormat;
+        }
+
+        try {
+            return DocumentFormat.valueOf(format).shortFormat;
+        }
+        catch (IllegalArgumentException e) {
+            return DocumentFormat.UNKNOWN.shortFormat;
+        }
+    }
 }
--- a/code/index/index-forward/java/nu/marginalia/index/forward/construction/ForwardIndexConverter.java
+++ b/code/index/index-forward/java/nu/marginalia/index/forward/construction/ForwardIndexConverter.java
@@ -84,7 +84,7 @@ public class ForwardIndexConverter {

            LongArray docFileData = LongArrayFactory.mmapForWritingConfined(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size());

-            ByteBuffer workArea = ByteBuffer.allocate(65536);
+            ByteBuffer workArea = ByteBuffer.allocate(1024*1024*100);
            for (var instance : journal.pages()) {
                try (var slopTable = new SlopTable(instance.baseDir(), instance.page()))
                {
--- a/code/processes/converting-process/build.gradle
+++ b/code/processes/converting-process/build.gradle
@@ -62,6 +62,7 @@ dependencies {
    implementation libs.jwarc

    implementation libs.jsoup
+    implementation libs.pdfbox

    implementation libs.guava
    implementation dependencies.create(libs.guice.get()) {
--- a/code/processes/converting-process/java/nu/marginalia/converting/model/ProcessedDocumentDetails.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/model/ProcessedDocumentDetails.java
@@ -1,8 +1,8 @@
 package nu.marginalia.converting.model;

+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.HtmlFeature;
-import nu.marginalia.model.html.HtmlStandard;
 import nu.marginalia.model.idx.DocumentMetadata;

 import javax.annotation.Nullable;
@@ -21,7 +21,7 @@ public class ProcessedDocumentDetails {
    public long hashCode;

    public Set<HtmlFeature> features;
-    public HtmlStandard standard;
+    public DocumentFormat format;

    public List<EdgeUrl> linksInternal;
    public List<EdgeUrl> linksExternal;
@@ -30,6 +30,6 @@ public class ProcessedDocumentDetails {
    public GeneratorType generator;

    public String toString() {
-        return "ProcessedDocumentDetails(title=" + this.title + ", description=" + this.description + ", pubYear=" + this.pubYear + ", length=" + this.length + ", quality=" + this.quality + ", hashCode=" + this.hashCode + ", features=" + this.features + ", standard=" + this.standard + ", linksInternal=" + this.linksInternal + ", linksExternal=" + this.linksExternal + ", metadata=" + this.metadata + ", generator=" + this.generator + ")";
+        return "ProcessedDocumentDetails(title=" + this.title + ", description=" + this.description + ", pubYear=" + this.pubYear + ", length=" + this.length + ", quality=" + this.quality + ", hashCode=" + this.hashCode + ", features=" + this.features + ", standard=" + this.format + ", linksInternal=" + this.linksInternal + ", linksExternal=" + this.linksExternal + ", metadata=" + this.metadata + ", generator=" + this.generator + ")";
    }
 }
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentProcessor.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentProcessor.java
@@ -7,6 +7,7 @@ import nu.marginalia.converting.model.DisqualifiedException;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.processor.plugin.AbstractDocumentProcessorPlugin;
 import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
+import nu.marginalia.converting.processor.plugin.PdfDocumentProcessorPlugin;
 import nu.marginalia.converting.processor.plugin.PlainTextDocumentProcessorPlugin;
 import nu.marginalia.keyword.LinkTexts;
 import nu.marginalia.model.EdgeDomain;
@@ -33,7 +34,8 @@ public class DocumentProcessor {
    private static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
            "application/xhtml",
            "text/html",
-            "text/plain");
+            "text/plain",
+            "application/pdf");


    private final List<AbstractDocumentProcessorPlugin> processorPlugins = new ArrayList<>();
@@ -42,12 +44,14 @@ public class DocumentProcessor {
    @Inject
    public DocumentProcessor(HtmlDocumentProcessorPlugin htmlDocumentProcessorPlugin,
                             PlainTextDocumentProcessorPlugin plainTextDocumentProcessorPlugin,
+                             PdfDocumentProcessorPlugin pdfDocumentProcessorPlugin,
                             AnchorTextKeywords anchorTextKeywords)
    {
        this.anchorTextKeywords = anchorTextKeywords;

        processorPlugins.add(htmlDocumentProcessorPlugin);
        processorPlugins.add(plainTextDocumentProcessorPlugin);
+        processorPlugins.add(pdfDocumentProcessorPlugin);
    }

    public ProcessedDocument process(CrawledDocument crawledDocument,
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentValuator.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentValuator.java
@@ -2,9 +2,9 @@ package nu.marginalia.converting.processor.logic;

 import crawlercommons.utils.Strings;
 import nu.marginalia.converting.model.DisqualifiedException;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jetbrains.annotations.NotNull;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
@@ -17,7 +17,7 @@ import java.util.Set;
 public class DocumentValuator {

    public double getQuality(CrawledDocument crawledDocument,
-                             HtmlStandard htmlStandard,
+                             DocumentFormat htmlStandard,
                             Document parsedDocument,
                             int textLength) throws DisqualifiedException {

--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java
@@ -1,7 +1,7 @@
 package nu.marginalia.converting.processor.logic;

 import com.google.common.base.Strings;
-import nu.marginalia.model.html.HtmlStandard;
+import nu.marginalia.model.DocumentFormat;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.DocumentType;
 import org.slf4j.Logger;
@@ -12,54 +12,54 @@ public class HtmlStandardExtractor {

    private static final Logger logger = LoggerFactory.getLogger(HtmlStandardExtractor.class);

-    public static HtmlStandard parseDocType(DocumentType docType) {
+    public static DocumentFormat parseDocType(DocumentType docType) {
        if (null == docType) {
-            return HtmlStandard.UNKNOWN;
+            return DocumentFormat.UNKNOWN;
        }

        String publicId = docType.publicId();
        if (Strings.isNullOrEmpty(publicId))
-            return HtmlStandard.HTML5;
+            return DocumentFormat.HTML5;

        publicId = publicId.toUpperCase();
        if (publicId.startsWith("-//SOFTQUAD SOFTWARE//DTD") && publicId.contains("HTML 4")) {
-            return HtmlStandard.HTML4;
+            return DocumentFormat.HTML4;
        }
        if (publicId.startsWith("-//SOFTQUAD SOFTWARE//DTD") && publicId.contains("HTML 3")) {
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
        }
        if (publicId.startsWith("-//INTERNET/RFC XXXX//EN"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
        if (publicId.startsWith("-//NETSCAPE COMM. CORP"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
        if (publicId.startsWith("-//SQ//DTD HTML 2"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
        if (publicId.startsWith("-//SOFTQUAD//DTD HTML 2"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
        if (publicId.startsWith("-//W3O//DTD W3 HTML 2"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
        if (publicId.startsWith("-//IETF//DTD HTML 2"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
        if (publicId.startsWith("-//IETF//DTD HTML//EN"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
        if (publicId.startsWith("-/W3C//DTD HTML 3"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
        if (publicId.startsWith("-/W3C/DTD HTML 3"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
        if (publicId.startsWith("-//IETF//DTD HTML 3"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
        if (publicId.startsWith("-//W3C//DTD XHTML"))
-            return HtmlStandard.XHTML;
+            return DocumentFormat.XHTML;
        if (publicId.startsWith("ISO/IEC 15445:2000//DTD"))
-            return HtmlStandard.XHTML;
+            return DocumentFormat.XHTML;
        if (publicId.startsWith("-//W3C//DTD HTML"))
-            return HtmlStandard.HTML4;
+            return DocumentFormat.HTML4;

        logger.debug("Unknown publicID standard {}", publicId);
-        return HtmlStandard.UNKNOWN;
+        return DocumentFormat.UNKNOWN;
    }

-    public static HtmlStandard sniffHtmlStandard(Document parsed) {
+    public static DocumentFormat sniffHtmlStandard(Document parsed) {
        int html4Attributes = 0;
        int html5Attributes = 0;

@@ -73,11 +73,11 @@ public class HtmlStandardExtractor {
            html4Attributes++;
        }
        if (html5Attributes > 0) {
-            return HtmlStandard.HTML5;
+            return DocumentFormat.HTML5;
        }
        if (html4Attributes > 0) {
-            return HtmlStandard.HTML4;
+            return DocumentFormat.HTML4;
        }
-        return HtmlStandard.HTML123;
+        return DocumentFormat.HTML123;
    }
 }
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java
@@ -7,11 +7,11 @@ import nu.marginalia.keyword.LinkTexts;
 import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
 import nu.marginalia.language.filter.LanguageFilter;
 import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.html.HtmlStandard;

 import javax.annotation.Nullable;
 import java.io.IOException;
@@ -73,7 +73,7 @@ public abstract class AbstractDocumentProcessorPlugin {
            return this;
        }

-        public MetaTagsBuilder addFormat(HtmlStandard standard) {
+        public MetaTagsBuilder addFormat(DocumentFormat standard) {

            add("format", standard);

--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
@@ -25,12 +25,12 @@ import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
 import nu.marginalia.link_parser.FeedExtractor;
 import nu.marginalia.link_parser.LinkParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.html.HtmlStandard;
 import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import org.jsoup.nodes.Document;
@@ -137,8 +137,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin


        final int length = getLength(doc);
-        final HtmlStandard standard = getHtmlStandard(doc);
-        final double quality = documentValuator.getQuality(crawledDocument, standard, doc, length);
+        final DocumentFormat format = getDocumentFormat(doc);
+        final double quality = documentValuator.getQuality(crawledDocument, format, doc, length);

        if (isDisqualified(documentClass, url, quality, doc.title())) {
            throw new DisqualifiedException(DisqualificationReason.QUALITY);
@@ -152,7 +152,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
        var ret = new ProcessedDocumentDetails();

        ret.length = length;
-        ret.standard = standard;
+        ret.format = format;
        ret.title = specialization.getTitle(doc, dld, crawledDocument.url);

        final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, documentHeaders, dld);
@@ -161,7 +161,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
        ret.quality = documentValuator.adjustQuality(quality, features);
        ret.hashCode = dld.localitySensitiveHashCode();

-        PubDate pubDate = pubDateSniffer.getPubDate(documentHeaders, url, doc, standard, true);
+        PubDate pubDate = pubDateSniffer.getPubDate(documentHeaders, url, doc, format, true);

        EnumSet<DocumentFlags> documentFlags = documentFlags(features, generatorParts.type());

@@ -180,7 +180,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
                .addPubDate(pubDate)
                .addUrl(url)
                .addFeatures(features)
-                .addFormat(standard)
+                .addFormat(format)
                .addGenerator(generatorParts.keywords())
                .build();

@@ -316,12 +316,12 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
        return linkTerms;
    }

-    private HtmlStandard getHtmlStandard(Document doc) {
-        HtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(doc.documentType());
-        if (HtmlStandard.UNKNOWN.equals(htmlStandard)) {
+    private DocumentFormat getDocumentFormat(Document doc) {
+        DocumentFormat format = HtmlStandardExtractor.parseDocType(doc.documentType());
+        if (DocumentFormat.UNKNOWN.equals(format)) {
            return HtmlStandardExtractor.sniffHtmlStandard(doc);
        }
-        return htmlStandard;
+        return format;
    }

    private int getLength(Document doc) {
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/PdfDocumentProcessorPlugin.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/PdfDocumentProcessorPlugin.java
@@ -0,0 +1,286 @@
+package nu.marginalia.converting.processor.plugin;
+
+import com.google.inject.Inject;
+import com.google.inject.name.Named;
+import nu.marginalia.converting.model.DisqualifiedException;
+import nu.marginalia.converting.model.ProcessedDocumentDetails;
+import nu.marginalia.converting.processor.DocumentClass;
+import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
+import nu.marginalia.converting.processor.plugin.specialization.DefaultSpecialization;
+import nu.marginalia.keyword.DocumentKeywordExtractor;
+import nu.marginalia.keyword.LinkTexts;
+import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
+import nu.marginalia.language.filter.LanguageFilter;
+import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
+import nu.marginalia.model.DocumentFormat;
+import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.crawl.HtmlFeature;
+import nu.marginalia.model.crawl.PubDate;
+import nu.marginalia.model.crawldata.CrawledDocument;
+import nu.marginalia.model.idx.DocumentFlags;
+import nu.marginalia.model.idx.DocumentMetadata;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.text.HeadingAwarePDFTextStripper;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.time.LocalDate;
+import java.util.*;
+
+
+public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {
+
+    private final int maxTitleLength;
+    private final DocumentKeywordExtractor keywordExtractor;
+    private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
+    private final DocumentLengthLogic documentLengthLogic;
+    private final DefaultSpecialization defaultSpecialization;
+
+    private static final Logger logger = LoggerFactory.getLogger(PdfDocumentProcessorPlugin.class);
+
+    /** Injection constructor; stores the collaborators used when processing PDF documents.
+     *
+     * @param maxTitleLength value bound to the name "max-title-length", used to truncate titles
+     * @param languageFilter passed on to the superclass
+     * @param sentenceExtractorProvider supplies the sentence extractor applied to the converted document
+     * @param keywordExtractor extracts keywords from the extracted sentences
+     * @param documentLengthLogic validates and encodes document length
+     * @param defaultSpecialization provides title and summary extraction
+     */
+    @Inject
+    public PdfDocumentProcessorPlugin(@Named("max-title-length") Integer maxTitleLength,
+                                      LanguageFilter languageFilter,
+                                      ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
+                                      DocumentKeywordExtractor keywordExtractor,
+                                      DocumentLengthLogic documentLengthLogic,
+                                      DefaultSpecialization defaultSpecialization) {
+        super(languageFilter);
+
+        this.maxTitleLength = maxTitleLength;
+        this.keywordExtractor = keywordExtractor;
+        this.sentenceExtractorProvider = sentenceExtractorProvider;
+        this.documentLengthLogic = documentLengthLogic;
+        this.defaultSpecialization = defaultSpecialization;
+    }
+
+    /** Decides whether this plugin handles the given document, based on its content type.
+     *
+     * @param doc the crawled document to inspect
+     * @return true if the content type is {@code application/pdf}, optionally followed by
+     *         parameters after a semicolon; false otherwise, including when the document
+     *         has no content type
+     */
+    @Override
+    public boolean isApplicable(CrawledDocument doc) {
+        if (doc.contentType == null) {
+            return false;
+        }
+
+        // Locale.ROOT keeps the case folding independent of the JVM's default locale
+        // (e.g. a Turkish locale would otherwise lower-case 'I' to a dotless 'ı').
+        String contentType = doc.contentType.toLowerCase(Locale.ROOT);
+
+        return contentType.equals("application/pdf")
+                || contentType.startsWith("application/pdf;"); // charset=blabla
+    }
+
+    /** Builds the processed details and keywords for a PDF document.
+     * <p>
+     * The PDF bytes are converted to an HTML document, sentences and keywords are
+     * extracted from that, and the result is populated with fixed values for
+     * format, quality, features, flags and publication date.
+     *
+     * @param crawledDocument the crawled PDF; its url, body and body bytes are read
+     * @param linkTexts passed on to keyword extraction
+     * @param documentClass not used by this implementation
+     * @return the populated details together with the extracted keywords
+     * @throws DisqualifiedException if the body is in a blocked unicode range, or the PDF
+     *         conversion fails with an IOException; presumably also raised by the language
+     *         and length checks (confirm against those helpers)
+     */
+    @Override
+    public DetailsWithWords createDetails(CrawledDocument crawledDocument,
+                                          LinkTexts linkTexts,
+                                          DocumentClass documentClass)
+            throws DisqualifiedException, URISyntaxException, IOException {
+
+        // NOTE(review): this is the crawled body as a string, not the text extracted from
+        // the PDF; the unicode range check and ret.length below both operate on it.
+        // Presumably that is the undecoded PDF data -- confirm this is intended.
+        String documentBody = crawledDocument.documentBody();
+
+        if (languageFilter.isBlockedUnicodeRange(documentBody)) {
+            throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE);
+        }
+
+        final EdgeUrl url = new EdgeUrl(crawledDocument.url);
+
+
+        Document doc;
+        try {
+            doc = convertPdfToHtml(crawledDocument.documentBodyBytes);
+        } catch (IOException e) {
+            // Only the exception message is logged; the failure is reported as a disqualification
+            logger.error("Failed to convert PDF file {} - {}", url, e.getMessage());
+            throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.ERROR);
+        }
+
+        DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(doc);
+
+        checkDocumentLanguage(dld);
+
+        documentLengthLogic.validateLength(dld, 1.0);
+
+        var ret = new ProcessedDocumentDetails();
+
+        ret.length = documentBody.length();
+
+        ret.format = DocumentFormat.PDF;
+        ret.title = StringUtils.truncate(defaultSpecialization.getTitle(doc, dld, url.toString()), maxTitleLength);
+
+        // Every PDF is assigned the same fixed quality
+        ret.quality = -5;
+
+        ret.features = Set.of(HtmlFeature.PDF);
+        ret.description = getDescription(doc);
+        ret.hashCode = dld.localitySensitiveHashCode();
+
+        // Fixed placeholder date; no publication date is extracted from the PDF here.
+        // NOTE(review): differs from the 2010 guess used for PDF in PubDateFromHtmlStandard -- confirm.
+        final PubDate pubDate = new PubDate(LocalDate.ofYearDay(1993, 1));
+
+        EnumSet<DocumentFlags> documentFlags = EnumSet.of(DocumentFlags.PdfFile);
+
+        // The quality is negated before being passed to the metadata
+        ret.metadata = new DocumentMetadata(
+                documentLengthLogic.getEncodedAverageLength(dld),
+                pubDate.yearByte(),
+                (int) -ret.quality,
+                documentFlags);
+
+        DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, linkTexts, url);
+
+        // Synthetic terms built from the publication date, url, features and format
+        var tagWords = new MetaTagsBuilder()
+                .addPubDate(pubDate)
+                .addUrl(url)
+                .addFeatures(ret.features)
+                .addFormat(ret.format)
+                .build();
+
+        words.addAllSyntheticTerms(tagWords);
+
+        if (pubDate.hasYear()) {
+            ret.pubYear = pubDate.year();
+        }
+
+        /* These are assumed to be populated */
+        ret.linksInternal = new ArrayList<>();
+        ret.linksExternal = new ArrayList<>();
+
+        return new DetailsWithWords(ret, words);
+    }
+
+    /** Picks a description for the converted PDF.
+     * <p>
+     * The leading paragraphs are scanned for an abstract: either a paragraph that
+     * begins with "Abstract " (the remainder is used), or the paragraph following one
+     * that consists solely of "Abstract".  The result is abbreviated to 255 characters.
+     * If no abstract is found among the scanned paragraphs, the summary from the
+     * default specialization is returned instead.
+     */
+    private String getDescription(Document doc) {
+        final String inlinePrefix = "Abstract ";
+
+        boolean sawAbstractHeading = false;
+        int scanned = 0;
+
+        for (var paragraph : doc.getElementsByTag("p")) {
+            final String content = paragraph.text();
+
+            // Abstracts tend to bleed into the body text, so the paragraph must
+            // open with the word itself for the inline form to count.
+            if (content.startsWith(inlinePrefix)) {
+                String withoutPrefix = content.substring(inlinePrefix.length());
+                return StringUtils.abbreviate(withoutPrefix, "...", 255);
+            }
+
+            if (content.equals("Abstract")) {
+                sawAbstractHeading = true;
+            }
+            else if (sawAbstractHeading) {
+                return StringUtils.abbreviate(content, "...", 255);
+            }
+
+            // Only the start of the document is scanned
+            scanned++;
+            if (scanned > 15) {
+                break;
+            }
+        }
+
+        // No abstract found; defer to the default specialization
+        return defaultSpecialization.getSummary(doc, Set.of());
+
+    }
+
+    /** Renders the given PDF bytes as an HTML document suitable for the HTML-oriented
+     * processing steps.
+     * <p>
+     * Pages become div elements, paragraphs become p elements and headings become h1
+     * elements.  The title is taken from the first heading longer than two characters,
+     * else from the first paragraph, else from the PDF's metadata title.
+     *
+     * @param pdfBytes the raw PDF data
+     * @return the parsed and repaired HTML document
+     * @throws IOException if the PDF cannot be loaded or its text cannot be extracted
+     */
+    Document convertPdfToHtml(byte[] pdfBytes) throws IOException {
+        try (var pdf = Loader.loadPDF(pdfBytes)) {
+            final String metadataTitle = Objects.requireNonNullElse(pdf.getDocumentInformation().getTitle(), "");
+
+            var textStripper = new HeadingAwarePDFTextStripper();
+            textStripper.setStartPage(1);
+            textStripper.setSortByPosition(true);
+            textStripper.setWordSeparator(" ");
+
+            // A larger drop threshold tolerates more line spacing within a paragraph
+            textStripper.setDropThreshold(5f);
+
+            // Markup emitted around pages, paragraphs and headings
+            textStripper.setPageStart("<div>");
+            textStripper.setParagraphStart("<p>");
+            textStripper.setParagraphEnd("</p>\n");
+            textStripper.setPageEnd("</div>\n");
+            textStripper.setHeadingStart("<h1>");
+            textStripper.setHeadingEnd("</h1>\n");
+            textStripper.setLineSeparator("\n");
+
+            final String extracted = textStripper.getText(pdf);
+
+            var html = Jsoup.parse("<html><body>" + extracted + "</body></html>");
+
+            repairDOM(html);
+
+            // The first heading of more than two characters becomes the title
+            html.getElementsByTag("h1").stream()
+                    .map(heading -> heading.text())
+                    .filter(headingText -> headingText.length() > 2)
+                    .findFirst()
+                    .ifPresent(headingText -> html.title(headingText));
+
+            if (!html.title().isEmpty()) {
+                return html;
+            }
+
+            // Without a usable heading, the first paragraph is preferred as it is
+            // almost always correct; the metadata title is almost always useless
+            // and is only the last resort.
+            var leadingParagraph = html.getElementsByTag("p").first();
+            html.title(leadingParagraph != null ? leadingParagraph.text() : metadataTitle);
+
+            return html;
+        }
+
+
+    }
+
+    /** Repair the DOM to remove some common issues with PDF conversion,
+     * including empty paragraphs, and multiline headers that are split into multiple
+     * consecutive h1 tags.
+     * <p>
+     * The document is modified in place, in three passes: unwrapping headings that are
+     * the only child of a paragraph, removing empty paragraphs, and merging runs of
+     * adjacent headings into the first heading of the run.
+     */
+    private void repairDOM(Document parsed) {
+
+        // <p><h1>...</h1></p> -> <h1>...</h1>
+        parsed.getElementsByTag("h1").forEach(h1 -> {
+            var parent = h1.parent();
+            if (parent == null || !"p".equals(parent.tagName())) {
+                return;
+            }
+
+            // Only unwrap when the heading is the paragraph's sole child element
+            if (parent.childrenSize() == 1) {
+                parent.replaceWith(h1);
+            }
+        });
+
+        // Remove empty <p> tags
+        parsed.getElementsByTag("p").forEach(p -> {
+            if (p.childrenSize() == 0 && !p.hasText()) {
+                p.remove();
+            }
+        });
+
+        // <h1>...</h1><h1>...</h1> -> <h1>...</h1>
+        // NOTE(review): headings removed by an earlier iteration are still visited here;
+        // presumably they have no next sibling once detached and hit the early return -- confirm.
+        parsed.getElementsByTag("h1").forEach(h1 -> {
+            var nextSibling = h1.nextElementSibling();
+            if (nextSibling == null || !"h1".equals(nextSibling.tagName())) {
+                return; // Short-circuit to avoid unnecessary work
+            }
+
+            // Collect this heading's text and that of each directly following h1
+            StringJoiner joiner = new StringJoiner(" ");
+            joiner.add(h1.text());
+
+            for (var sibling : h1.nextElementSiblings()) {
+                if (!"h1".equals(sibling.tagName()))
+                    break;
+                joiner.add(sibling.text());
+                sibling.remove();
+            }
+
+            // Replace the heading's content with the space-joined text of the whole run
+            h1.text(joiner.toString());
+        });
+
+    }
+
+}
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java
@@ -13,10 +13,10 @@ import nu.marginalia.keyword.LinkTexts;
 import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
 import nu.marginalia.language.filter.LanguageFilter;
 import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.html.HtmlStandard;
 import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import org.apache.commons.lang3.StringUtils;
@@ -91,7 +91,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP

        ret.length = documentBody.length();

-        ret.standard = HtmlStandard.PLAIN;
+        ret.format = DocumentFormat.PLAIN;
        ret.title = StringUtils.truncate(plainTextLogic.getTitle(url, firstFewLines), maxTitleLength);

        ret.quality = -1;
@@ -113,7 +113,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
                .addPubDate(pubDate)
                .addUrl(url)
                .addFeatures(ret.features)
-                .addFormat(ret.standard)
+                .addFormat(ret.format)
                .build();

        words.addAllSyntheticTerms(tagWords);
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateFromHtmlStandard.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateFromHtmlStandard.java
@@ -1,12 +1,13 @@
 package nu.marginalia.converting.processor.pubdate;

-import nu.marginalia.model.html.HtmlStandard;
+import nu.marginalia.model.DocumentFormat;

 public class PubDateFromHtmlStandard {
    /** Used to bias pub date heuristics */
-    public static int blindGuess(HtmlStandard standard) {
-        return switch (standard) {
+    public static int blindGuess(DocumentFormat format) {
+        return switch (format) {
            case PLAIN -> 1993;
+            case PDF -> 2010;
            case HTML123 -> 1997;
            case HTML4, XHTML -> 2006;
            case HTML5 -> 2018;
@@ -21,8 +22,8 @@ public class PubDateFromHtmlStandard {
     * Discovering publication year involves a lot of guesswork, this helps
     * keep the guesses relatively sane.
     */
-    public static boolean isGuessPlausible(HtmlStandard standard, int year) {
-        switch (standard) {
+    public static boolean isGuessPlausible(DocumentFormat format, int year) {
+        switch (format) {
            case HTML123:
                return year <= 2000;
            case XHTML:
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateHeuristic.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateHeuristic.java
@@ -1,14 +1,29 @@
 package nu.marginalia.converting.processor.pubdate;

 import nu.marginalia.converting.model.DocumentHeaders;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;

 import java.util.Optional;

+/**
+ * A single strategy for determining the publication date of a document.
+ */
 public interface PubDateHeuristic {

-    Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard);
+    /**
+     * Attempts to determine when the document was published.
+     *
+     * @param effortLevel how much work the heuristic may do; some implementations
+     *                    return empty at {@link PubDateEffortLevel#LOW}
+     * @param headers     headers of the document, e.g. {@code last-modified}
+     * @param url         URL of the document
+     * @param document    parsed document
+     * @param format      format of the document, which may be
+     *                    {@link DocumentFormat#UNKNOWN}
+     * @return the publication date, or empty if this heuristic found none
+     */
+    Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat format);
 }
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateParser.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateParser.java
@@ -1,7 +1,7 @@
 package nu.marginalia.converting.processor.pubdate;

+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;

 import java.time.DateTimeException;
 import java.time.LocalDate;
@@ -26,7 +26,7 @@ public class PubDateParser {
                .filter(PubDateParser::validateDate);
    }

-    public static Optional<PubDate> attemptParseDate(String date, HtmlStandard standard) {
+    public static Optional<PubDate> attemptParseDate(String date, DocumentFormat standard) {
        return Optional.ofNullable(date)
                .filter(str -> str.length() >= 4 && str.length() < 32)
                .flatMap(str ->
@@ -81,7 +81,7 @@ public class PubDateParser {
    }


-    public static Optional<PubDate> dateFromHighestYearLookingSubstringWithGuess(String maybe, HtmlStandard standard) {
+    public static Optional<PubDate> dateFromHighestYearLookingSubstringWithGuess(String maybe, DocumentFormat standard) {
        int guess = PubDateFromHtmlStandard.blindGuess(standard);

        var matcher = yearPattern.matcher(maybe);
@@ -135,7 +135,7 @@ public class PubDateParser {
        return (max + min) / 2;
    }

-    public static int guessYear(HtmlStandard standard) {
+    public static int guessYear(DocumentFormat standard) {
        // Create some jitter to avoid having documents piling up in the same four years
        // as this would make searching in those years disproportionately useless

--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateSniffer.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateSniffer.java
@@ -2,9 +2,9 @@ package nu.marginalia.converting.processor.pubdate;

 import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.heuristic.*;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;

 import java.util.ArrayList;
@@ -38,7 +38,7 @@ public class PubDateSniffer {
        heuristics.add(new PubDateHeuristicGuessFromHtmlStandard());
    }

-    public PubDate getPubDate(DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard, boolean runExpensive) {
+    public PubDate getPubDate(DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard, boolean runExpensive) {
        final PubDateEffortLevel effortLevel = runExpensive ? PubDateEffortLevel.HIGH : PubDateEffortLevel.LOW;

        for (var heuristic : heuristics) {
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jetbrains.annotations.NotNull;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
@@ -19,7 +19,7 @@ import java.util.Optional;
 public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {

    @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        if (effortLevel == PubDateEffortLevel.LOW)
            return Optional.empty();

@@ -33,9 +33,9 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {

    private static class DateExtractingNodeVisitorPass implements NodeFilter {
        public PubDate pubDate;
-        private final HtmlStandard htmlStandard;
+        private final DocumentFormat htmlStandard;

-        private DateExtractingNodeVisitorPass(HtmlStandard htmlStandard) {
+        private DateExtractingNodeVisitorPass(DocumentFormat htmlStandard) {
            this.htmlStandard = htmlStandard;
        }

@@ -135,7 +135,7 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
        }

        private void parse(String text) {
-            if (htmlStandard == HtmlStandard.UNKNOWN) {
+            if (htmlStandard == DocumentFormat.UNKNOWN) {
                PubDateParser
                        .dateFromHighestYearLookingSubstring(text)
                        .ifPresent(this::setPubDate);
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java
@@ -5,9 +5,9 @@ import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateFromHtmlStandard;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jetbrains.annotations.NotNull;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Node;
@@ -19,7 +19,7 @@ import java.util.Optional;
 public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {

    @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        if (effortLevel == PubDateEffortLevel.LOW)
            return Optional.empty();

@@ -33,9 +33,9 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {

    private static class DateExtractingNodeVisitor implements NodeFilter {
        public PubDate pubDate;
-        private final HtmlStandard htmlStandard;
+        private final DocumentFormat htmlStandard;

-        private DateExtractingNodeVisitor(HtmlStandard htmlStandard) {
+        private DateExtractingNodeVisitor(DocumentFormat htmlStandard) {
            this.htmlStandard = htmlStandard;
        }

@@ -73,7 +73,7 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
        }

        private void parse(String text) {
-            if (htmlStandard == HtmlStandard.UNKNOWN) {
+            if (htmlStandard == DocumentFormat.UNKNOWN) {
                PubDateParser
                        .dateFromHighestYearLookingSubstring(text)
                        .ifPresent(this::setPubDate);
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;

 import java.util.Optional;
@@ -14,8 +14,8 @@ import java.util.Optional;
 public class PubDateHeuristicGuessFromHtmlStandard implements PubDateHeuristic {

    @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
-        if (htmlStandard == HtmlStandard.UNKNOWN)
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
+        if (htmlStandard == DocumentFormat.UNKNOWN)
            return Optional.empty();

        return Optional.of(new PubDate(null, PubDateParser.guessYear(htmlStandard)));
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;

 import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
 public class PubDateHeuristicHtml5AnyTimeTag implements PubDateHeuristic {

    @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        // HTML5, alternative approach
        for (var tag : document.select("time")) {
            var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;

 import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
 public class PubDateHeuristicHtml5ArticleDateTag implements PubDateHeuristic {

    @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        // HTML5
        for (var tag : document.select("time[pubdate=\"pubdate\"]")) {
            var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;

 import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
 public class PubDateHeuristicHtml5ItempropDateTag implements PubDateHeuristic {

    @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        for (var tag : document.select("time[itemprop=\"datePublished\"]")) {
            var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
            if (maybeDate.isPresent()) {
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicJSONLD.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicJSONLD.java
@@ -8,9 +8,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;

 import java.util.Collections;
@@ -21,7 +21,7 @@ import java.util.Optional;
 public class PubDateHeuristicJSONLD implements PubDateHeuristic {

    @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        for (var tag : document.select("script[type=\"application/ld+json\"]")) {
            var maybeDate = parseLdJson(tag.data())
                    .flatMap(PubDateParser::attemptParseDate);
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicLastModified.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicLastModified.java
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;

 import java.util.List;
@@ -15,7 +15,7 @@ import java.util.Optional;
 public class PubDateHeuristicLastModified implements PubDateHeuristic {

    @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        List<String> lastModified = headers.get("last-modified");
        if (lastModified.isEmpty())
            return Optional.empty();
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicMicrodata.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicMicrodata.java
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;

 import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
 public class PubDateHeuristicMicrodata implements PubDateHeuristic {

    @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {

        for (var tag : document.select("meta[itemprop=\"datePublished\"]")) {
            var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicOpenGraph.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicOpenGraph.java
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;

 import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
 public class PubDateHeuristicOpenGraph implements PubDateHeuristic {

    @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        // OG
        for (var tag : document.select("meta[property=\"article:published_time\"]")) {
            var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicRDFaTag.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicRDFaTag.java
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;

 import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
 public class PubDateHeuristicRDFaTag implements PubDateHeuristic {

    @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        for (var tag : document.select("meta[property=\"datePublished\"]")) {
            var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
            if (maybeDate.isPresent()) {
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;

 import java.util.Optional;
@@ -21,7 +21,7 @@ public class PubDateHeuristicUrlPatternPass1 implements PubDateHeuristic {
    private static final int MIN_URL_PATTERN_YEAR = 2000;

    @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        final String urlString = url.path;

        var matcher = yearUrlPattern.matcher(urlString);
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;

 import java.util.Optional;
@@ -19,7 +19,7 @@ public class PubDateHeuristicUrlPatternPass2 implements PubDateHeuristic {

    @Override
    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url,
-                                   Document document, HtmlStandard htmlStandard) {
+                                   Document document, DocumentFormat htmlStandard) {
        final String urlString = url.path;

        var matcher = yearUrlPattern.matcher(urlString);
--- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java
@@ -8,12 +8,12 @@ import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.processor.DocumentClass;
 import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
 import nu.marginalia.keyword.LinkTexts;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.model.crawl.UrlIndexingState;
 import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.html.HtmlStandard;
 import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.model.idx.WordFlags;
@@ -53,6 +53,7 @@ public class SideloaderProcessing {
                "",
                body.getBytes(StandardCharsets.UTF_8),
                false,
+                -1,
                null,
                null
        );
@@ -83,7 +84,7 @@ public class SideloaderProcessing {
            // that we can't get from the sideloaded data since it's
            // so stripped down

-            ret.details.standard = HtmlStandard.HTML5;
+            ret.details.format = DocumentFormat.HTML5;
            ret.details.pubYear = pubYear;
            ret.details.features.add(HtmlFeature.JS);
            ret.details.features.add(HtmlFeature.TRACKING);
--- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java
@@ -9,13 +9,13 @@ import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb;
 import nu.marginalia.keyword.DocumentKeywordExtractor;
 import nu.marginalia.keyword.LinkTexts;
 import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.model.crawl.UrlIndexingState;
-import nu.marginalia.model.html.HtmlStandard;
 import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.model.idx.WordFlags;
@@ -165,7 +165,7 @@ public class StackexchangeSideloader implements SideloadSource {
            ret.details.description = StringUtils.truncate(doc.body().text(), 255);
            ret.details.length = 128;

-            ret.details.standard = HtmlStandard.HTML5;
+            ret.details.format = DocumentFormat.HTML5;
            ret.details.linksExternal = List.of();
            ret.details.linksInternal = List.of();
            ret.state = UrlIndexingState.OK;
--- a/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java
@@ -124,7 +124,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
                    document.details.title,
                    document.details.description,
                    HtmlFeature.encode(document.details.features),
-                    document.details.standard.name(),
+                    document.details.format.name(),
                    document.details.length,
                    document.details.hashCode,
                    (float) document.details.quality,
--- a/code/processes/converting-process/java/org/apache/pdfbox/text/HeadingAwarePDFTextStripper.java
+++ b/code/processes/converting-process/java/org/apache/pdfbox/text/HeadingAwarePDFTextStripper.java
--- a/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java
+++ b/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java
@@ -6,6 +6,7 @@ import com.google.inject.Injector;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.io.SerializableCrawlDataStream;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.model.crawl.PubDate;
@@ -13,7 +14,6 @@ import nu.marginalia.model.crawl.UrlIndexingState;
 import nu.marginalia.model.crawldata.CrawledDocument;
 import nu.marginalia.model.crawldata.CrawledDomain;
 import nu.marginalia.model.crawldata.SerializableCrawlData;
-import nu.marginalia.model.html.HtmlStandard;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Tag;
 import org.junit.jupiter.api.Test;
@@ -91,7 +91,7 @@ public class ConvertingIntegrationTest {

            assertTrue(details.title.length() > 4);
            assertTrue(details.description.length() > 4);
-            assertEquals(HtmlStandard.HTML5, details.standard);
+            assertEquals(DocumentFormat.HTML5, details.format);

        }
    }
@@ -125,7 +125,7 @@ public class ConvertingIntegrationTest {
            assertTrue(details.metadata.size() > 0);
            assertTrue(details.title.length() > 4);
            assertTrue(details.description.length() > 4);
-            assertEquals(HtmlStandard.HTML5, details.standard);
+            assertEquals(DocumentFormat.HTML5, details.format);
        }
    }

@@ -148,6 +148,7 @@ public class ConvertingIntegrationTest {
                    "",
                    readClassPathFile(p.toString()).getBytes(),
                    false,
+                    -1,
                    null,
                    null
                    );
--- a/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
+++ b/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
@@ -20,7 +20,6 @@ import nu.marginalia.model.crawldata.CrawledDocument;
 import nu.marginalia.model.crawldata.CrawledDomain;
 import nu.marginalia.model.crawldata.SerializableCrawlData;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
-import org.apache.hc.client5.http.cookie.BasicCookieStore;
 import org.junit.jupiter.api.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -247,7 +246,7 @@ public class CrawlingThenConvertingIntegrationTest {
    private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws Exception {
        List<SerializableCrawlData> data = new ArrayList<>();

-        try (var recorder = new WarcRecorder(fileName, new BasicCookieStore());
+        try (var recorder = new WarcRecorder(fileName);
             var db = new DomainStateDb(dbTempFile))
        {
            new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, db, recorder).crawlDomain();
--- a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/PdfDocumentProcessorPluginTest.java
+++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/PdfDocumentProcessorPluginTest.java
@@ -0,0 +1,112 @@
+package nu.marginalia.converting.processor.plugin;
+
+import nu.marginalia.WmsaHome;
+import nu.marginalia.converting.processor.DocumentClass;
+import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
+import nu.marginalia.converting.processor.logic.TitleExtractor;
+import nu.marginalia.converting.processor.plugin.specialization.DefaultSpecialization;
+import nu.marginalia.converting.processor.summary.SummaryExtractor;
+import nu.marginalia.converting.processor.summary.heuristic.*;
+import nu.marginalia.keyword.DocumentKeywordExtractor;
+import nu.marginalia.keyword.LinkTexts;
+import nu.marginalia.language.filter.LanguageFilter;
+import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
+import nu.marginalia.model.crawldata.CrawledDocument;
+import nu.marginalia.term_frequency_dict.TermFrequencyDict;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Tag;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.net.HttpURLConnection;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.time.Instant;
+
+/** Runs {@link PdfDocumentProcessorPlugin} against sample PDF documents.
+ * <p>
+ * The only enabled test downloads its input over the network (presumably why the
+ * class is tagged "flaky"); the disabled tests read hard-coded local paths and
+ * only print the result, so they serve as manual tools.
+ */
+@Tag("flaky")
+class PdfDocumentProcessorPluginTest {
+    static PdfDocumentProcessorPlugin plugin;
+
+    /** Builds the shared plugin instance from the language models provided by WmsaHome. */
+    @BeforeAll
+    static void setUpBeforeClass() throws Exception {
+        var lm = WmsaHome.getLanguageModels();
+        plugin = new PdfDocumentProcessorPlugin(255,
+                new LanguageFilter(lm),
+                new ThreadLocalSentenceExtractorProvider(lm),
+                new DocumentKeywordExtractor(new TermFrequencyDict(lm)),
+                new DocumentLengthLogic(100),
+                new DefaultSpecialization(new SummaryExtractor(
+                        255,
+                        new DomFilterHeuristic(255),
+                        new TagDensityHeuristic(255),
+                        new OpenGraphDescriptionHeuristic(),
+                        new MetaDescriptionHeuristic(),
+                        new FallbackHeuristic()
+                ),
+                        new TitleExtractor(255)
+                        ));
+    }
+
+    /** Wraps the bytes in a synthetic crawled document and runs it through the plugin. */
+    public AbstractDocumentProcessorPlugin.DetailsWithWords testPdfFile(byte[] pdfBytes) throws Exception {
+        var doc = new CrawledDocument("test", "https://www.example.com/sample.pdf", "application/pdf", Instant.now().toString(), 200, "OK", "OK", "", pdfBytes, false, -1, null, null);
+        return plugin.createDetails(doc, new LinkTexts(), DocumentClass.NORMAL);
+    }
+
+    /** Reads the file and processes its bytes as a PDF. */
+    public AbstractDocumentProcessorPlugin.DetailsWithWords testPdfFile(Path file) throws Exception {
+        return testPdfFile(Files.readAllBytes(file));
+    }
+
+    /** Fetches the raw bytes served at the given URL.
+     *
+     * @throws IOException if connecting or reading fails
+     * @throws URISyntaxException if the url is not a valid URI
+     */
+    private byte[] downloadPDF(String url) throws IOException, URISyntaxException {
+        HttpURLConnection conn = (HttpURLConnection) new URI(url).toURL().openConnection();
+        // Close the response stream explicitly, and let failures surface as the
+        // IOException this method already declares instead of a RuntimeException
+        try (var in = conn.getInputStream()) {
+            return in.readAllBytes();
+        } finally {
+            conn.disconnect();
+        }
+    }
+
+    /** Manual tool: prints the extracted titles of local sample files. */
+    @Disabled
+    @Test
+    void testingTool() throws Exception {
+        System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample.pdf")).details().title);
+        System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample2.pdf")).details().title);
+        System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample3.pdf")).details().title);
+        System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample4.pdf")).details().title);
+        System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample5.pdf")).details().title);
+        System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample6.pdf")).details().title);
+    }
+
+    /** Manual tool: prints the HTML conversion of a local sample file. */
+    @Disabled
+    @Test
+    void testingTool2() throws Exception {
+        System.out.println(plugin.convertPdfToHtml(Files.readAllBytes(Path.of("/home/st_work/Work/sample6.pdf"))));
+    }
+
+    /** Downloads a sample PDF and prints its HTML conversion; no assertions are made. */
+    @Test
+    void testMarginaliaSample() throws Exception {
+        var doc = plugin.convertPdfToHtml(downloadPDF("https://www.marginalia.nu/junk/test.pdf"));
+        System.out.println(doc.html());
+    }
+}
--- a/code/processes/converting-process/test/nu/marginalia/converting/processor/pubdate/PubDateSnifferTest.java
+++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/pubdate/PubDateSnifferTest.java
@@ -3,8 +3,8 @@ package nu.marginalia.converting.processor.pubdate;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.heuristic.PubDateHeuristicDOMParsingPass2;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.Test;

@@ -74,7 +74,7 @@ class PubDateSnifferTest {
                        <time pubdate="pubdate" datetime="2022-08-24">time</time>
                        Wow, sure lor 'em boss
                        </article>
-                        """), HtmlStandard.UNKNOWN, true);
+                        """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertEquals("2022-08-24", ret.dateIso8601());
@@ -90,7 +90,7 @@ class PubDateSnifferTest {
                        <time>2022-08-24</time>
                        Wow, sure lor 'em boss
                        </article>
-                        """), HtmlStandard.UNKNOWN, true);
+                        """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertEquals("2022-08-24", ret.dateIso8601());
@@ -106,7 +106,7 @@ class PubDateSnifferTest {
                        <time class="published" datetime="July 13, 2006">July 13, 2006</time>
                        Wow, sure lor 'em boss
                        </article>
-                        """), HtmlStandard.UNKNOWN, true);
+                        """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertEquals(2006, ret.year());
@@ -116,14 +116,14 @@ class PubDateSnifferTest {
    public void testProblemCases() throws IOException, URISyntaxException {
        var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
                new EdgeUrl("https://www.example.com/"),
-                Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/The Switch to Linux Begins .html"))), HtmlStandard.HTML5, true);
+                Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/The Switch to Linux Begins .html"))), DocumentFormat.HTML5, true);

        assertFalse(ret.isEmpty());
        assertEquals(2006, ret.year());

        ret = dateSniffer.getPubDate(new DocumentHeaders(""),
                new EdgeUrl("https://www.example.com/"),
-                Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), HtmlStandard.XHTML, true);
+                Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), DocumentFormat.XHTML, true);

        assertFalse(ret.isEmpty());
        assertEquals(2010, ret.year());
@@ -146,7 +146,7 @@ class PubDateSnifferTest {
                        <!doctype html>
                        <html>
                        <meta itemprop="datePublished" content="2022-08-24" />
-                        """), HtmlStandard.UNKNOWN, true);
+                        """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertEquals("2022-08-24", ret.dateIso8601());
@@ -160,7 +160,7 @@ class PubDateSnifferTest {
                        <!doctype html>
                        <html>
                        <meta property="datePublished" content="2022-08-24" />
-                        """), HtmlStandard.UNKNOWN, true);
+                        """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertEquals("2022-08-24", ret.dateIso8601());
@@ -174,7 +174,7 @@ class PubDateSnifferTest {
                        <!doctype html>
                        <html>
                        <script type="application/ld+json">{"@context":"https:\\/\\/schema.org","@type":"Article","name":"In the Year 2525","url":"https:\\/\\/en.wikipedia.org\\/wiki\\/In_the_Year_2525","sameAs":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","mainEntity":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2004-08-24T14:39:14Z","dateModified":"2022-10-20T11:54:37Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/4\\/4a\\/In_the_Year_2525_by_Zager_and_Evans_US_vinyl_Side-A_RCA_release.png","headline":"song written and compsoed by Rick Evans, originally recorded by Zager and Evans and released in 1969"}</script><script type="application/ld+json">{"@context":"https:\\/\\/schema.org","@type":"Article","name":"In the Year 2525","url":"https:\\/\\/en.wikipedia.org\\/wiki\\/In_the_Year_2525","sameAs":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","mainEntity":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2004-08-24T14:39:14Z","dateModified":"2022-10-20T11:54:37Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/4\\/4a\\/In_the_Year_2525_by_Zager_and_Evans_US_vinyl_Side-A_RCA_release.png","headline":"song written and compsoed by Rick Evans, originally recorded by Zager and Evans and released in 1969"}</script>
-                        """), HtmlStandard.UNKNOWN, true);
+                        """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertEquals("2004-08-24", ret.dateIso8601());
@@ -188,7 +188,7 @@ class PubDateSnifferTest {
                        <!doctype html>
                        <html>
                        <script type="application/ld+json" class="aioseop-schema">{"@context":"https://schema.org","@graph":[{"@type":"Organization","@id":"https://socialnomics.net/#organization","url":"https://socialnomics.net/","name":"Socialnomics","sameAs":[]},{"@type":"WebSite","@id":"https://socialnomics.net/#website","url":"https://socialnomics.net/","name":"Socialnomics","publisher":{"@id":"https://socialnomics.net/#organization"}},{"@type":"WebPage","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage","url":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","inLanguage":"en-US","name":"3 Reasons Why You Should Adopt Java-based Technology For Your Business","isPartOf":{"@id":"https://socialnomics.net/#website"},"breadcrumb":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#breadcrumblist"},"datePublished":"2016-12-27T21:01:36-06:00","dateModified":"2016-12-22T21:02:32-06:00"},{"@type":"Article","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#article","isPartOf":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage"},"author":{"@id":"https://socialnomics.net/author/rahis-saifi/#author"},"headline":"3 Reasons Why You Should Adopt Java-based Technology For Your Business","datePublished":"2016-12-27T21:01:36-06:00","dateModified":"2016-12-22T21:02:32-06:00","commentCount":0,"mainEntityOfPage":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage"},"publisher":{"@id":"https://socialnomics.net/#organization"},"articleSection":"Business, business, java, Java Developers, programming 
languages"},{"@type":"Person","@id":"https://socialnomics.net/author/rahis-saifi/#author","name":"Rahis Saifi","sameAs":["https://www.facebook.com/RahisSaifiOfficial","https://www.twitter.com/57rahis"],"image":{"@type":"ImageObject","@id":"https://socialnomics.net/#personlogo","url":"https://secure.gravatar.com/avatar/e67f630f0b8bc87e59e111d5e955961d?s=96&d=mm&r=g","width":96,"height":96,"caption":"Rahis Saifi"}},{"@type":"BreadcrumbList","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#breadcrumblist","itemListElement":[{"@type":"ListItem","position":1,"item":{"@type":"WebPage","@id":"https://socialnomics.net/","url":"https://socialnomics.net/","name":"Socialnomics Blog"}},{"@type":"ListItem","position":2,"item":{"@type":"WebPage","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","url":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","name":"3 Reasons Why You Should Adopt Java-based Technology For Your Business"}}]}]}</script>
-                        """), HtmlStandard.UNKNOWN, true);
+                        """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertEquals("2016-12-27", ret.dateIso8601());
@@ -202,7 +202,7 @@ class PubDateSnifferTest {
                        <!doctype html>
                        <html>
                        <title>No date in the HTML</title>
-                        """), HtmlStandard.UNKNOWN, true);
+                        """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertNull(ret.dateIso8601());
@@ -217,7 +217,7 @@ class PubDateSnifferTest {
                        <!doctype html>
                        <html>
                        <title>No date in the HTML</title>
-                        """), HtmlStandard.UNKNOWN, true);
+                        """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertEquals("2022-02-03", ret.dateIso8601());
@@ -232,7 +232,7 @@ class PubDateSnifferTest {
                        <!doctype html>
                        <html>
                        <p>Published 2003, updated 2022</p>
-                        """), HtmlStandard.HTML5, true);
+                        """), DocumentFormat.HTML5, true);

        assertFalse(ret.isEmpty());
        assertNull(ret.dateIso8601());
@@ -258,7 +258,7 @@ class PubDateSnifferTest {
                        <!doctype html>
                        <html>
                        <div style="float: left;">&nbsp;<b>Post subject:</b> Keyboards.</div><div style="float: right;"><span class="postdetails"><b><img src="./styles/subsilver2/imageset/icon_post_target.gif" width="12" height="9" alt="Post" title="Post" /> <a  href="./viewtopic.php?p=34580&amp;sid=cf0c13dedebb4fea1f03fa73e510cd9f#p34580">#1</a></b></span>&nbsp;<b>Posted:</b> Sun Oct 03, 2010 5:37 pm&nbsp;</div>
-                        """), HtmlStandard.UNKNOWN, true);
+                        """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertNull(ret.dateIso8601());
--- a/code/processes/crawling-process/build.gradle
+++ b/code/processes/crawling-process/build.gradle
@@ -67,8 +67,6 @@ dependencies {
    testImplementation libs.mockito
    testImplementation libs.wiremock

-
-
    testImplementation project(':code:processes:test-data')
 }

--- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
@@ -43,6 +43,7 @@ import java.nio.file.StandardCopyOption;
 import java.security.Security;
 import java.util.*;
 import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;

@@ -66,6 +67,8 @@ public class CrawlerMain extends ProcessMainClass {

    private final Map<String, CrawlTask> pendingCrawlTasks = new ConcurrentHashMap<>();

+    private final LinkedBlockingQueue<CrawlTask> retryQueue = new LinkedBlockingQueue<>();
+
    private final AtomicInteger tasksDone = new AtomicInteger(0);
    private final HttpFetcherImpl fetcher;

@@ -261,28 +264,44 @@ public class CrawlerMain extends ProcessMainClass {
                if (workLog.isJobFinished(crawlSpec.domain))
                    continue;

-                var task = new CrawlTask(
-                        crawlSpec,
-                        anchorTagsSource,
-                        outputDir,
-                        warcArchiver,
-                        domainStateDb,
-                        workLog);
+                var task = new CrawlTask(crawlSpec, anchorTagsSource, outputDir, warcArchiver, domainStateDb, workLog);

                // Try to run immediately, to avoid unnecessarily keeping the entire work set in RAM
                if (!trySubmitDeferredTask(task)) {
-                    // Otherwise add to the taskList for deferred execution
+
+                    // Drain the retry queue to the taskList, and try to submit any tasks that are in the retry queue
+                    retryQueue.drainTo(taskList);
+                    taskList.removeIf(this::trySubmitDeferredTask);
+
+                    // Then add this new task to the taskList for deferred execution
                    taskList.add(task);
                }
            }

             // Schedule viable tasks for execution until list is empty
-            while (!taskList.isEmpty()) {
-                taskList.removeIf(this::trySubmitDeferredTask);
+            for (int emptyRuns = 0;emptyRuns < 300;) {
+                boolean hasTasks = !taskList.isEmpty();

-                // Add a small pause here to avoid busy looping toward the end of the execution cycle when
-                // we might have no new viable tasks to run for hours on end
-                TimeUnit.MILLISECONDS.sleep(50);
+                // The order of these checks is very important to avoid a race condition
+                // where we miss a task that is put into the retry queue
+                boolean hasRunningTasks = pool.getActiveCount() > 0;
+                boolean hasRetryTasks = !retryQueue.isEmpty();
+
+                if (hasTasks || hasRetryTasks || hasRunningTasks) {
+                    retryQueue.drainTo(taskList);
+
+                    // Try to submit any tasks that are in the retry queue (this will block if the pool is full)
+                    taskList.removeIf(this::trySubmitDeferredTask);
+
+                    // Add a small pause here to avoid busy looping toward the end of the execution cycle when
+                    // we might have no new viable tasks to run for hours on end
+                    TimeUnit.MILLISECONDS.sleep(5);
+                } else {
+                    // We have no tasks to run, and no tasks in the retry queue
+                    // but we wait a bit to see if any new tasks come in via the retry queue
+                    emptyRuns++;
+                    TimeUnit.SECONDS.sleep(1);
+                }
            }

            logger.info("Shutting down the pool, waiting for tasks to complete...");
@@ -414,7 +433,7 @@ public class CrawlerMain extends ProcessMainClass {
        /** Best effort indicator whether we could start this now without getting stuck in
         * DomainLocks purgatory */
        public boolean canRun() {
-            return domainLocks.canLock(new EdgeDomain(domain));
+            return domainLocks.isLockableHint(new EdgeDomain(domain));
        }

        @Override
@@ -425,66 +444,76 @@ public class CrawlerMain extends ProcessMainClass {
                return;
            }

-            Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
-            Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
-            Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
-
-            // Move the WARC file to a temp file if it exists, so we can resume the crawl using the old data
-            // while writing to the same file name as before
-            if (Files.exists(newWarcFile)) {
-                Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING);
-            }
-            else {
-                Files.deleteIfExists(tempFile);
-            }
-
-            try (var warcRecorder = new WarcRecorder(newWarcFile, fetcher); // write to a temp file for now
-                 var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
-                 CrawlDataReference reference = getReference()
-            )
-            {
-                // Resume the crawl if it was aborted
-                if (Files.exists(tempFile)) {
-                    retriever.syncAbortedRun(tempFile);
-                    Files.delete(tempFile);
-                }
-
-                DomainLinks domainLinks = anchorTagsSource.getAnchorTags(domain);
-
-                int size;
-                try (var lock = domainLocks.lockDomain(new EdgeDomain(domain))) {
-                    size = retriever.crawlDomain(domainLinks, reference);
-                }
-
-                // Delete the reference crawl data if it's not the same as the new one
-                // (mostly a case when migrating from legacy->warc)
-                reference.delete();
-
-                // Convert the WARC file to Parquet
-                SlopCrawlDataRecord
-                        .convertWarc(domain, userAgent, newWarcFile, slopFile);
-
-                // Optionally archive the WARC file if full retention is enabled,
-                // otherwise delete it:
-                warcArchiver.consumeWarc(newWarcFile, domain);
-
-                // Mark the domain as finished in the work log
-                workLog.setJobToFinished(domain, slopFile.toString(), size);
-
-                // Update the progress bar
-                heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks);
-
-                logger.info("Fetched {}", domain);
-            } catch (Exception e) {
-                logger.error("Error fetching domain " + domain, e);
-            }
-            finally {
-                // We don't need to double-count these; it's also kept in the workLog
+            Optional<DomainLocks.DomainLock> lock = domainLocks.tryLockDomain(new EdgeDomain(domain));
+            // We don't have a lock, so we can't run this task
+            // we return to avoid blocking the pool for too long
+            if (lock.isEmpty()) {
                pendingCrawlTasks.remove(domain);
-                Thread.currentThread().setName("[idle]");
+                retryQueue.put(this);
+                return;
+            }
+            DomainLocks.DomainLock domainLock = lock.get();

-                Files.deleteIfExists(newWarcFile);
-                Files.deleteIfExists(tempFile);
+            try (domainLock) {
+                Thread.currentThread().setName("crawling:" + domain);
+
+                Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
+                Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
+                Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
+
+                // Move the WARC file to a temp file if it exists, so we can resume the crawl using the old data
+                // while writing to the same file name as before
+                if (Files.exists(newWarcFile)) {
+                    Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING);
+                }
+                else {
+                    Files.deleteIfExists(tempFile);
+                }
+
+                try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
+                     var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
+                     CrawlDataReference reference = getReference())
+                {
+                    // Resume the crawl if it was aborted
+                    if (Files.exists(tempFile)) {
+                        retriever.syncAbortedRun(tempFile);
+                        Files.delete(tempFile);
+                    }
+
+                    DomainLinks domainLinks = anchorTagsSource.getAnchorTags(domain);
+
+                    int size = retriever.crawlDomain(domainLinks, reference);
+
+                    // Delete the reference crawl data if it's not the same as the new one
+                    // (mostly a case when migrating from legacy->warc)
+                    reference.delete();
+
+                    // Convert the WARC file to Slop
+                    SlopCrawlDataRecord
+                            .convertWarc(domain, userAgent, newWarcFile, slopFile);
+
+                    // Optionally archive the WARC file if full retention is enabled,
+                    // otherwise delete it:
+                    warcArchiver.consumeWarc(newWarcFile, domain);
+
+                    // Mark the domain as finished in the work log
+                    workLog.setJobToFinished(domain, slopFile.toString(), size);
+
+                    // Update the progress bar
+                    heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks);
+
+                    logger.info("Fetched {}", domain);
+                } catch (Exception e) {
+                    logger.error("Error fetching domain " + domain, e);
+                }
+                finally {
+                    // We don't need to double-count these; it's also kept in the workLog
+                    pendingCrawlTasks.remove(domain);
+                    Thread.currentThread().setName("[idle]");
+
+                    Files.deleteIfExists(newWarcFile);
+                    Files.deleteIfExists(tempFile);
+                }
            }
        }

--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/ContentTags.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/ContentTags.java
@@ -1,6 +1,6 @@
 package nu.marginalia.crawl.fetcher;

-import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
+import org.apache.hc.client5.http.classic.methods.HttpGet;

 /** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */
 public record ContentTags(String etag, String lastMod) {
@@ -17,14 +17,16 @@ public record ContentTags(String etag, String lastMod) {
    }

    /** Paints the tags onto the request builder. */
-    public void paint(ClassicRequestBuilder getBuilder) {
+    public void paint(HttpGet request) {
+
+        // Paint the ETag header if present,
+        // otherwise paint the Last-Modified header
+        // (but not both at the same time due to some servers not liking it)

        if (etag != null) {
-            getBuilder.addHeader("If-None-Match", etag);
-        }
-
-        if (lastMod != null) {
-            getBuilder.addHeader("If-Modified-Since", lastMod);
+            request.addHeader("If-None-Match", etag);
+        } else if (lastMod != null) {
+            request.addHeader("If-Modified-Since", lastMod);
        }
    }
 }
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/Cookies.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/Cookies.java
@@ -1,34 +0,0 @@
-package nu.marginalia.crawl.fetcher;
-
-import java.io.IOException;
-import java.net.CookieHandler;
-import java.net.URI;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.ConcurrentHashMap;
-
-public class Cookies extends CookieHandler {
-    final ThreadLocal<ConcurrentHashMap<String, List<String>>> cookieJar = ThreadLocal.withInitial(ConcurrentHashMap::new);
-
-    public void clear() {
-        cookieJar.get().clear();
-    }
-
-    public boolean hasCookies() {
-        return !cookieJar.get().isEmpty();
-    }
-
-    public List<String> getCookies() {
-        return cookieJar.get().values().stream().flatMap(List::stream).toList();
-    }
-
-    @Override
-    public Map<String, List<String>> get(URI uri, Map<String, List<String>> requestHeaders) throws IOException {
-        return cookieJar.get();
-    }
-
-    @Override
-    public void put(URI uri, Map<String, List<String>> responseHeaders) throws IOException {
-        cookieJar.get().putAll(responseHeaders);
-    }
-}
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/DomainCookies.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/DomainCookies.java
@@ -0,0 +1,88 @@
+package nu.marginalia.crawl.fetcher;
+
+import org.apache.hc.client5.http.classic.methods.HttpUriRequestBase;
+import org.apache.hc.core5.http.ClassicHttpRequest;
+import org.apache.hc.core5.http.HttpResponse;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.StringJoiner;
+
+/** Minimal cookie jar: remembers the name/value pair of each Set-Cookie
+ * response header and replays them as one Cookie request header.
+ * <p>
+ * Cookie attributes (Path, Domain, Expires, ...) are discarded, and the
+ * backing HashMap is not synchronized.
+ */
+public class DomainCookies {
+    private final Map<String, String> cookies = new HashMap<>();
+
+    /** Returns true if at least one cookie has been stored. */
+    public boolean hasCookies() {
+        return !cookies.isEmpty();
+    }
+
+    /** Stores the cookie of every Set-Cookie header in the response,
+     * replacing any previously stored cookie with the same name. */
+    public void updateCookieStore(HttpResponse response) {
+        for (var header : response.getHeaders()) {
+            if (header.getName().equalsIgnoreCase("Set-Cookie")) {
+                parseCookieHeader(header.getValue());
+            }
+        }
+    }
+
+    /** Extracts the leading name=value pair from a Set-Cookie header value,
+     * ignoring headers that lack a '=' or a cookie name. */
+    private void parseCookieHeader(String value) {
+        if (value == null) {
+            return;
+        }
+
+        // Only the text before the first ';' is the cookie itself; the rest are attributes.
+        // (String.split(";") yields an empty array for input like ";", so avoid it.)
+        int attrStart = value.indexOf(';');
+        String cookie = (attrStart < 0 ? value : value.substring(0, attrStart)).trim();
+
+        // Split on the first '=' only: the value may itself contain '=' (e.g. base64
+        // padding) or be empty ("name="), and String.split("=") mishandles both.
+        int eq = cookie.indexOf('=');
+        if (eq < 0) {
+            return;
+        }
+
+        String name = cookie.substring(0, eq).trim();
+        String val = cookie.substring(eq + 1).trim();
+
+        // RFC 6265 says a Set-Cookie with an empty name is ignored entirely
+        if (!name.isEmpty()) {
+            cookies.put(name, val);
+        }
+    }
+
+    /** Adds a Cookie header with all stored cookies; does nothing when the jar is
+     * empty, so no blank Cookie header is ever sent. */
+    public void paintRequest(HttpUriRequestBase request) {
+        if (hasCookies()) {
+            request.addHeader("Cookie", createCookieHeader());
+        }
+    }
+
+    /** Adds a Cookie header with all stored cookies; does nothing when the jar is
+     * empty, so no blank Cookie header is ever sent. */
+    public void paintRequest(ClassicHttpRequest request) {
+        if (hasCookies()) {
+            request.addHeader("Cookie", createCookieHeader());
+        }
+    }
+
+    /** Joins the stored cookies as "name=value" pairs separated by "; ". */
+    private String createCookieHeader() {
+        StringJoiner sj = new StringJoiner("; ");
+        for (var cookie : cookies.entrySet()) {
+            sj.add(cookie.getKey() + "=" + cookie.getValue());
+        }
+        return sj.toString();
+    }
+
+}
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcher.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcher.java
@@ -23,6 +23,7 @@ public interface HttpFetcher extends AutoCloseable {

    HttpFetchResult fetchContent(EdgeUrl url,
                                 WarcRecorder recorder,
+                                 DomainCookies cookies,
                                 CrawlDelayTimer timer,
                                 ContentTags tags,
                                 ProbeType probeType);
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java
@@ -17,6 +17,7 @@ import nu.marginalia.model.crawldata.CrawlerDomainStatus;
 import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
 import org.apache.hc.client5.http.HttpRequestRetryStrategy;
 import org.apache.hc.client5.http.classic.HttpClient;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.config.ConnectionConfig;
 import org.apache.hc.client5.http.config.RequestConfig;
 import org.apache.hc.client5.http.cookie.BasicCookieStore;
@@ -34,6 +35,7 @@ import org.apache.hc.core5.http.io.entity.EntityUtils;
 import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
 import org.apache.hc.core5.http.message.MessageSupport;
 import org.apache.hc.core5.http.protocol.HttpContext;
+import org.apache.hc.core5.pool.PoolStats;
 import org.apache.hc.core5.util.TimeValue;
 import org.apache.hc.core5.util.Timeout;
 import org.jsoup.Jsoup;
@@ -45,11 +47,14 @@ import org.slf4j.Marker;
 import org.slf4j.MarkerFactory;

 import javax.net.ssl.SSLContext;
+import javax.net.ssl.SSLException;
 import java.io.IOException;
 import java.net.SocketTimeoutException;
 import java.net.URISyntaxException;
+import java.net.UnknownHostException;
 import java.security.NoSuchAlgorithmException;
 import java.time.Duration;
+import java.time.Instant;
 import java.util.*;
 import java.util.concurrent.Semaphore;
 import java.util.concurrent.TimeUnit;
@@ -76,14 +81,20 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
    }

    private final CloseableHttpClient client;
+    private PoolingHttpClientConnectionManager connectionManager; // assigned in createClient()
+
+    public PoolStats getPoolStats() { // exposes the pool-wide totals reported by the connection manager
+        return connectionManager.getTotalStats();
+    }

    private CloseableHttpClient createClient() throws NoSuchAlgorithmException {
        final ConnectionConfig connectionConfig = ConnectionConfig.custom()
                .setSocketTimeout(10, TimeUnit.SECONDS)
                .setConnectTimeout(30, TimeUnit.SECONDS)
+                .setValidateAfterInactivity(TimeValue.ofSeconds(5))
                .build();

-        final PoolingHttpClientConnectionManager connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
+        connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
                .setMaxConnPerRoute(2)
                .setMaxConnTotal(5000)
                .setDefaultConnectionConfig(connectionConfig)
@@ -91,11 +102,23 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
                .build();

        connectionManager.setDefaultSocketConfig(SocketConfig.custom()
-                .setSoLinger(TimeValue.ofSeconds(15))
+                .setSoLinger(TimeValue.ofSeconds(-1))
                .setSoTimeout(Timeout.ofSeconds(10))
                .build()
        );

+        Thread.ofPlatform().daemon(true).start(() -> {
+            try {
+                for (;;) {
+                    TimeUnit.SECONDS.sleep(15);
+                    logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
+                }
+            }
+            catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+            }
+        });
+
        final RequestConfig defaultRequestConfig = RequestConfig.custom()
                .setCookieSpec(StandardCookieSpec.RELAXED)
                .setResponseTimeout(10, TimeUnit.SECONDS)
@@ -287,6 +310,7 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
     * recorded in the WARC file on failure.
     */
    public ContentTypeProbeResult probeContentType(EdgeUrl url,
+                                                   DomainCookies cookies,
                                                   CrawlDelayTimer timer,
                                                   ContentTags tags) {
        if (!tags.isEmpty() || !contentTypeLogic.isUrlLikeBinary(url)) {
@@ -299,9 +323,11 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
                    .addHeader("Accept-Encoding", "gzip")
                    .build();

-            var result = SendLock.wrapSend(client, head, (rsp) -> {
-                EntityUtils.consume(rsp.getEntity());
+            cookies.paintRequest(head);

+            return SendLock.wrapSend(client, head, (rsp) -> {
+                cookies.updateCookieStore(rsp);
+                EntityUtils.consume(rsp.getEntity());
                int statusCode = rsp.getCode();

                // Handle redirects
@@ -339,8 +365,6 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
                    return new ContentTypeProbeResult.BadContentType(contentType, statusCode);
                }
            });
-
-            return result;
        }
        catch (SocketTimeoutException ex) {

@@ -362,6 +386,7 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
    @Override
    public HttpFetchResult fetchContent(EdgeUrl url,
                                           WarcRecorder warcRecorder,
+                                           DomainCookies cookies,
                                           CrawlDelayTimer timer,
                                           ContentTags contentTags,
                                           ProbeType probeType)
@@ -369,26 +394,32 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
        try {
            if (probeType == HttpFetcher.ProbeType.FULL) {
                try {
-                    var probeResult = probeContentType(url, timer, contentTags);
-                    logger.info(crawlerAuditMarker, "Probe result {} for {}", probeResult.getClass().getSimpleName(), url);
+                    var probeResult = probeContentType(url, cookies, timer, contentTags);
+
                    switch (probeResult) {
                        case HttpFetcher.ContentTypeProbeResult.NoOp():
                            break; //
                        case HttpFetcher.ContentTypeProbeResult.Ok(EdgeUrl resolvedUrl):
+                            logger.info(crawlerAuditMarker, "Probe result OK for {}", url);
                            url = resolvedUrl; // If we were redirected while probing, use the final URL for fetching
                            break;
                        case ContentTypeProbeResult.BadContentType badContentType:
                            warcRecorder.flagAsFailedContentTypeProbe(url, badContentType.contentType(), badContentType.statusCode());
+                            logger.info(crawlerAuditMarker, "Probe result Bad ContenType ({}) for {}", badContentType.contentType(), url);
                            return new HttpFetchResult.ResultNone();
                        case ContentTypeProbeResult.BadContentType.Timeout(Exception ex):
+                            logger.info(crawlerAuditMarker, "Probe result Timeout for {}", url);
                            warcRecorder.flagAsTimeout(url);
                            return new HttpFetchResult.ResultException(ex);
                        case ContentTypeProbeResult.Exception(Exception ex):
+                            logger.info(crawlerAuditMarker, "Probe result Exception({}) for {}", ex.getClass().getSimpleName(), url);
                            warcRecorder.flagAsError(url, ex);
                            return new HttpFetchResult.ResultException(ex);
                        case ContentTypeProbeResult.HttpError httpError:
+                            logger.info(crawlerAuditMarker, "Probe result HTTP Error ({}) for {}", httpError.statusCode(), url);
                            return new HttpFetchResult.ResultException(new HttpException("HTTP status code " + httpError.statusCode() + ": " + httpError.message()));
                        case ContentTypeProbeResult.Redirect redirect:
+                            logger.info(crawlerAuditMarker, "Probe result redirect for {} -> {}", url, redirect.location());
                            return new HttpFetchResult.ResultRedirect(redirect.location());
                    }
                } catch (Exception ex) {
@@ -398,36 +429,41 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {

            }

-            ClassicRequestBuilder getBuilder = ClassicRequestBuilder.get(url.asURI())
-                    .addHeader("User-Agent", userAgentString)
-                    .addHeader("Accept-Encoding", "gzip")
-                    .addHeader("Accept-Language", "en,*;q=0.5")
-                    .addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8");
+            HttpGet request = new HttpGet(url.asURI());
+            request.addHeader("User-Agent", userAgentString);
+            request.addHeader("Accept-Encoding", "gzip");
+            request.addHeader("Accept-Language", "en,*;q=0.5");
+            request.addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8");

-            contentTags.paint(getBuilder);
+            contentTags.paint(request);

            try (var sl = new SendLock()) {
-                HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build());
+                Instant start = Instant.now();
+                HttpFetchResult result = warcRecorder.fetch(client, cookies, request);
+
+                Duration fetchDuration = Duration.between(start, Instant.now());

                if (result instanceof HttpFetchResult.ResultOk ok) {
                    if (ok.statusCode() == 304) {
-                        return new HttpFetchResult.Result304Raw();
+                        result = new HttpFetchResult.Result304Raw();
                    }
                }

                switch (result) {
-                    case HttpFetchResult.ResultOk ok -> logger.info(crawlerAuditMarker, "Fetch result OK {} for {}", ok.statusCode(), url);
+                    case HttpFetchResult.ResultOk ok -> logger.info(crawlerAuditMarker, "Fetch result OK {} for {} ({} ms)", ok.statusCode(), url, fetchDuration.toMillis());
                    case HttpFetchResult.ResultRedirect redirect -> logger.info(crawlerAuditMarker, "Fetch result redirect: {}  for {}", redirect.url(), url);
-                    case HttpFetchResult.ResultNone none -> logger.info(crawlerAuditMarker, "Fetch result none  for {}", url);
-                    case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception for " + url + ": {}", ex.ex());
+                    case HttpFetchResult.ResultNone none -> logger.info(crawlerAuditMarker, "Fetch result none for {}", url);
+                    case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception for {}", url, ex.ex());
                    case HttpFetchResult.Result304Raw raw -> logger.info(crawlerAuditMarker, "Fetch result: 304 Raw for {}", url);
                    case HttpFetchResult.Result304ReplacedWithReference ref -> logger.info(crawlerAuditMarker, "Fetch result: 304 With reference for {}", url);
                }
+
                return result;
            }
        }
        catch (Exception ex) {
-            ex.printStackTrace();
+            logger.error(crawlerAuditMarker, "Fetch result exception for {}", url, ex);
+
            return new HttpFetchResult.ResultException(ex);
        }

@@ -494,56 +530,61 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
    }


-    private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
-        ClassicHttpRequest getRequest = ClassicRequestBuilder.get(sitemapUrl.asURI())
-                .addHeader("User-Agent", userAgentString)
-                .addHeader("Accept-Encoding", "gzip")
-                .addHeader("Accept", "text/*, */*;q=0.9")
-                .addHeader("User-Agent", userAgentString)
-                .build();
+    private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException {
+        HttpGet getRequest = new HttpGet(sitemapUrl.asURI());
+
+        // addHeader() appends rather than replaces, so each header is added exactly once
+        getRequest.addHeader("User-Agent", userAgentString);
+        getRequest.addHeader("Accept-Encoding", "gzip");
+        getRequest.addHeader("Accept", "text/*, */*;q=0.9");
+
        try (var sl = new SendLock()) {
            return client.execute(getRequest, response -> {
-                if (response.getCode() != 200) {
-                    return new SitemapResult.SitemapError();
+                try {
+                    if (response.getCode() != 200) {
+                        return new SitemapResult.SitemapError();
+                    }
+
+                    Document parsedSitemap = Jsoup.parse(
+                            EntityUtils.toString(response.getEntity()),
+                            sitemapUrl.toString(),
+                            Parser.xmlParser()
+                    );
+
+                    if (parsedSitemap.childrenSize() == 0) {
+                        return new SitemapResult.SitemapError();
+                    }
+
+                    String rootTagName = parsedSitemap.child(0).tagName();
+
+                    return switch (rootTagName.toLowerCase()) {
+                        case "sitemapindex" -> {
+                            List<String> references = new ArrayList<>();
+                            for (var locTag : parsedSitemap.getElementsByTag("loc")) {
+                                references.add(locTag.text().trim());
+                            }
+                            yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
+                        }
+                        case "urlset" -> {
+                            List<String> urls = new ArrayList<>();
+                            for (var locTag : parsedSitemap.select("url > loc")) {
+                                urls.add(locTag.text().trim());
+                            }
+                            yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
+                        }
+                        case "rss", "atom" -> {
+                            List<String> urls = new ArrayList<>();
+                            for (var locTag : parsedSitemap.select("link, url")) {
+                                urls.add(locTag.text().trim());
+                            }
+                            yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
+                        }
+                        default -> new SitemapResult.SitemapError();
+                    };
                }
-
-                Document parsedSitemap = Jsoup.parse(
-                        EntityUtils.toString(response.getEntity()),
-                        sitemapUrl.toString(),
-                        Parser.xmlParser()
-                );
-
-                if (parsedSitemap.childrenSize() == 0) {
-                    return new SitemapResult.SitemapError();
+                finally {
+                    EntityUtils.consume(response.getEntity());
                }
-
-                String rootTagName = parsedSitemap.child(0).tagName();
-
-                return switch (rootTagName.toLowerCase()) {
-                    case "sitemapindex" -> {
-                        List<String> references = new ArrayList<>();
-                        for (var locTag : parsedSitemap.getElementsByTag("loc")) {
-                            references.add(locTag.text().trim());
-                        }
-                        yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
-                    }
-                    case "urlset" -> {
-                        List<String> urls = new ArrayList<>();
-                        for (var locTag : parsedSitemap.select("url > loc")) {
-                            urls.add(locTag.text().trim());
-                        }
-                        yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
-                    }
-                    case "rss", "atom" -> {
-                        List<String> urls = new ArrayList<>();
-                        for (var locTag : parsedSitemap.select("link, url")) {
-                            urls.add(locTag.text().trim());
-                        }
-                        yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
-                    }
-                    default -> new SitemapResult.SitemapError();
-                };
            });
        }
        catch (Exception ex) {
@@ -574,13 +615,12 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
    private Optional<SimpleRobotRules> fetchAndParseRobotsTxt(EdgeUrl url, WarcRecorder recorder) {
        try (var sl = new SendLock()) {

-            ClassicHttpRequest request = ClassicRequestBuilder.get(url.asURI())
-                    .addHeader("User-Agent", userAgentString)
-                    .addHeader("Accept-Encoding", "gzip")
-                    .addHeader("Accept", "text/*, */*;q=0.9")
-                    .build();
+            HttpGet request = new HttpGet(url.asURI());
+            request.addHeader("User-Agent", userAgentString);
+            request.addHeader("Accept-Encoding", "gzip");
+            request.addHeader("Accept", "text/*, */*;q=0.9");

-            HttpFetchResult result = recorder.fetch(client, request);
+            HttpFetchResult result = recorder.fetch(client, new DomainCookies(), request);

            return DocumentBodyExtractor.asBytes(result).mapOpt((contentType, body) ->
                robotsParser.parseContent(url.toString(),
@@ -596,18 +636,19 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {

    @Override
    public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
-        if (exception instanceof SocketTimeoutException ex) {
-            return false;
-        }
-
-        return executionCount < 3;
+        return switch (exception) {
+            case SocketTimeoutException ste -> false; // never retry after a socket timeout
+            case SSLException ssle -> false; // never retry after an SSL/TLS failure
+            case UnknownHostException uhe -> false; // never retry when the host name does not resolve
+            default -> executionCount <= 3; // any other I/O error: retry while executionCount is at most 3
+        };
    }

    @Override
    public boolean retryRequest(HttpResponse response, int executionCount, HttpContext context) {
        return switch (response.getCode()) {
-            case 500, 503 -> executionCount < 2;
-            case 429 -> executionCount < 3;
+            case 500, 503 -> executionCount <= 2; // server errors: retry while executionCount is at most 2
+            case 429 -> executionCount <= 3; // 429 responses: retry while executionCount is at most 3
            default -> false;
        };
    }
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcInputBuffer.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcInputBuffer.java
@@ -2,6 +2,7 @@ package nu.marginalia.crawl.fetcher.warc;

 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.BOMInputStream;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.core5.http.ClassicHttpResponse;
 import org.apache.hc.core5.http.Header;
 import org.netpreserve.jwarc.WarcTruncationReason;
@@ -43,7 +44,9 @@ public abstract class WarcInputBuffer implements AutoCloseable {
     *  and suppressed from the headers.
     *  If an error occurs, a buffer will be created with no content and an error status.
     */
-    static WarcInputBuffer forResponse(ClassicHttpResponse response, Duration timeLimit) throws IOException {
+    static WarcInputBuffer forResponse(ClassicHttpResponse response,
+                                       HttpGet request,
+                                       Duration timeLimit) throws IOException {
        if (response == null)
            return new ErrorBuffer();

@@ -54,16 +57,47 @@ public abstract class WarcInputBuffer implements AutoCloseable {
            return new ErrorBuffer();
        }

-        InputStream is = entity.getContent();
-        long length = entity.getContentLength();
+        Instant start = Instant.now();
+        InputStream is = null;
+        try {
+            is = entity.getContent();
+            long length = entity.getContentLength();

-        try (response) {
            if (length > 0 && length < 8192) {
                // If the content is small and not compressed, we can just read it into memory
-                return new MemoryBuffer(response.getHeaders(), timeLimit, is, (int) length);
+                return new MemoryBuffer(response.getHeaders(), request, timeLimit, is, (int) length);
            } else {
                // Otherwise, we unpack it into a file and read it from there
-                return new FileBuffer(response.getHeaders(), timeLimit, is);
+                return new FileBuffer(response.getHeaders(), request, timeLimit, is);
+            }
+        }
+        finally {
+            // We're required to consume the stream to avoid leaking connections,
+            // but we also don't want to get stuck on slow or malicious connections
+            // forever, so we set a time limit on this phase and call abort() if it's exceeded.
+            try {
+                while (is != null) {
+                    // Consume some data
+                    if (is.skip(65536) == 0) {
+                        // Note that skip may return 0 if the stream is empty
+                        // or for other unspecified reasons, so we need to check
+                        // with read() as well to determine if the stream is done
+                        if (is.read() == -1)
+                            is = null;
+                    }
+                    // Check if the time limit has been exceeded
+                    else if (Duration.between(start, Instant.now()).compareTo(timeLimit) > 0) {
+                        request.abort();
+                        is = null;
+                    }
+                }
+            }
+            catch (IOException e) {
+                // Ignore the exception
+            }
+            finally {
+                // Close the input stream
+                IOUtils.closeQuietly(is);
            }
        }

@@ -71,7 +105,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
    }

    /** Copy an input stream to an output stream, with a maximum size and time limit */
-    protected void copy(InputStream is, OutputStream os, Duration timeLimit) {
+    protected void copy(InputStream is, HttpGet request, OutputStream os, Duration timeLimit) {
        Instant start = Instant.now();
        Instant timeout = start.plus(timeLimit);
        long size = 0;
@@ -86,6 +120,11 @@ public abstract class WarcInputBuffer implements AutoCloseable {
                Duration remaining = Duration.between(Instant.now(), timeout);
                if (remaining.isNegative()) {
                    truncationReason = WarcTruncationReason.TIME;
+                    // Abort the request if the time limit is exceeded
+                    // so we don't keep the connection open forever or are forced to consume
+                    // the stream to the end
+
+                    request.abort();
                    break;
                }

@@ -104,6 +143,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
                }
                else if (truncationReason != WarcTruncationReason.LENGTH) {
                    truncationReason = WarcTruncationReason.LENGTH;
+                    break;
                }

            } catch (IOException e) {
@@ -111,13 +151,6 @@ public abstract class WarcInputBuffer implements AutoCloseable {
            }
        }

-        // Try to close the connection as long as we haven't timed out.
-        // As per Apache HttpClient's semantics, this will reset the connection
-        // and close the stream if we have timed out.
-
-        if (truncationReason != WarcTruncationReason.TIME) {
-            IOUtils.closeQuietly(is);
-        }
    }

    /** Takes a Content-Range header and checks if it is complete.
@@ -218,7 +251,7 @@ class ErrorBuffer extends WarcInputBuffer {
 /** Buffer for when we have the response in memory */
 class MemoryBuffer extends WarcInputBuffer {
    byte[] data;
-    public MemoryBuffer(Header[] headers, Duration timeLimit, InputStream responseStream, int size) {
+    public MemoryBuffer(Header[] headers, HttpGet request, Duration timeLimit, InputStream responseStream, int size) {
        super(suppressContentEncoding(headers));

        if (!isRangeComplete(headers)) {
@@ -229,7 +262,7 @@ class MemoryBuffer extends WarcInputBuffer {

        var outputStream = new ByteArrayOutputStream(size);

-        copy(responseStream, outputStream, timeLimit);
+        copy(responseStream, request, outputStream, timeLimit);

        data = outputStream.toByteArray();
    }
@@ -253,7 +286,7 @@ class MemoryBuffer extends WarcInputBuffer {
 class FileBuffer extends WarcInputBuffer {
    private final Path tempFile;

-    public FileBuffer(Header[] headers, Duration timeLimit, InputStream responseStream) throws IOException {
+    public FileBuffer(Header[] headers, HttpGet request, Duration timeLimit, InputStream responseStream) throws IOException {
        super(suppressContentEncoding(headers));

        if (!isRangeComplete(headers)) {
@@ -265,7 +298,7 @@ class FileBuffer extends WarcInputBuffer {
        this.tempFile = Files.createTempFile("rsp", ".html");

        try (var out = Files.newOutputStream(tempFile)) {
-            copy(responseStream, out, timeLimit);
+            copy(responseStream, request, out, timeLimit);
        }
        catch (Exception ex) {
            truncationReason = WarcTruncationReason.UNSPECIFIED;
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcProtocolReconstructor.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcProtocolReconstructor.java
@@ -10,6 +10,7 @@ import java.net.http.HttpClient;
 import java.net.http.HttpHeaders;
 import java.net.http.HttpResponse;
 import java.nio.charset.StandardCharsets;
+import java.time.Duration;
 import java.util.*;
 import java.util.stream.Collectors;

@@ -90,8 +91,8 @@ public class WarcProtocolReconstructor {
        return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
    }

-    static String getResponseHeader(ClassicHttpResponse response, long size) {
-        String headerString = getHeadersAsString(response.getHeaders(), size);
+    static String getResponseHeader(ClassicHttpResponse response, Duration responseDuration, long size) {
+        String headerString = getHeadersAsString(response.getHeaders(), responseDuration, size);

        return response.getVersion().format() + " " + response.getCode() + " " + response.getReasonPhrase() + "\r\n" + headerString + "\r\n\r\n";
    }
@@ -160,7 +161,7 @@ public class WarcProtocolReconstructor {



-    static private String getHeadersAsString(Header[] headers, long responseSize) {
+    static private String getHeadersAsString(Header[] headers, Duration responseDuration, long responseSize) {
        StringJoiner joiner = new StringJoiner("\r\n");

        for (var header : headers) {
@@ -176,6 +177,7 @@ public class WarcProtocolReconstructor {
            if (headerCapitalized.equals("Content-Encoding"))
                continue;

+
            // Since we're transparently decoding gzip, we need to update the Content-Length header
            // to reflect the actual size of the response body. We'll do this at the end.
            if (headerCapitalized.equals("Content-Length"))
@@ -184,6 +186,7 @@ public class WarcProtocolReconstructor {
            joiner.add(headerCapitalized + ": " + header.getValue());
        }

+        joiner.add("X-Marginalia-Response-Time: " + responseDuration.toMillis());
        joiner.add("Content-Length: " + responseSize);

        return joiner.toString();
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java
@@ -1,6 +1,7 @@
 package nu.marginalia.crawl.fetcher.warc;

 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.link_parser.LinkParser;
@@ -8,9 +9,7 @@ import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;
 import org.apache.hc.client5.http.classic.HttpClient;
-import org.apache.hc.client5.http.cookie.BasicCookieStore;
-import org.apache.hc.client5.http.cookie.CookieStore;
-import org.apache.hc.core5.http.ClassicHttpRequest;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.core5.http.NameValuePair;
 import org.jetbrains.annotations.Nullable;
 import org.netpreserve.jwarc.*;
@@ -42,7 +41,7 @@ public class WarcRecorder implements AutoCloseable {
    static final int MAX_TIME = 30_000;

    /** Maximum (decompressed) size we'll save */
-    static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 10 * 1024 * 1024);
+    static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 32 * 1024 * 1024);

    private final WarcWriter writer;
    private final Path warcFile;
@@ -53,23 +52,15 @@ public class WarcRecorder implements AutoCloseable {
    // Affix a version string in case we need to change the format in the future
    // in some way
    private final String warcRecorderVersion = "1.0";
-    private final CookieStore cookies;
    private final LinkParser linkParser = new LinkParser();
    /**
     * Create a new WarcRecorder that will write to the given file
     *
     * @param warcFile The file to write to
     */
-    public WarcRecorder(Path warcFile, HttpFetcherImpl fetcher) throws IOException {
+    public WarcRecorder(Path warcFile) throws IOException {
        this.warcFile = warcFile;
        this.writer = new WarcWriter(warcFile);
-        this.cookies = fetcher.getCookies();
-    }
-
-    public WarcRecorder(Path warcFile, CookieStore cookies) throws IOException {
-        this.warcFile = warcFile;
-        this.writer = new WarcWriter(warcFile);
-        this.cookies = cookies;
    }

    /**
@@ -79,24 +70,21 @@ public class WarcRecorder implements AutoCloseable {
    public WarcRecorder() throws IOException {
        this.warcFile = Files.createTempFile("warc", ".warc.gz");
        this.writer = new WarcWriter(this.warcFile);
-        this.cookies = new BasicCookieStore();

        temporaryFile = true;
    }

-    private boolean hasCookies() {
-        return !cookies.getCookies().isEmpty();
-    }
-
    public HttpFetchResult fetch(HttpClient client,
-                                 ClassicHttpRequest request)
+                                 DomainCookies cookies,
+                                 HttpGet request)
            throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
    {
-        return fetch(client, request, Duration.ofMillis(MAX_TIME));
+        return fetch(client, cookies, request, Duration.ofMillis(MAX_TIME));
    }

    public HttpFetchResult fetch(HttpClient client,
-                                 ClassicHttpRequest request,
+                                 DomainCookies cookies,
+                                 HttpGet request,
                                 Duration timeout)
            throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
    {
@@ -105,7 +93,7 @@ public class WarcRecorder implements AutoCloseable {
        WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
        WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();

-        Instant date = Instant.now();
+        Instant requestDate = Instant.now();

        // Not entirely sure why we need to do this, but keeping it due to Chesterton's Fence
        Map<String, List<String>> extraHeaders = new HashMap<>(request.getHeaders().length);
@@ -113,13 +101,17 @@ public class WarcRecorder implements AutoCloseable {
        // Inject a range header to attempt to limit the size of the response
        // to the maximum size we want to store, if the server supports it.
        request.addHeader("Range", "bytes=0-"+MAX_SIZE);
-
+        cookies.paintRequest(request);
        try {
-            return client.execute(request, response -> {
+            return client.execute(request,response -> {

-                try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, timeout);
+                try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, request, timeout);
                     InputStream inputStream = inputBuffer.read()) {

+                    Instant responseDate = Instant.now();
+
+                    cookies.updateCookieStore(response);
+
                    // Build and write the request

                    WarcDigestBuilder requestDigestBuilder = new WarcDigestBuilder();
@@ -136,18 +128,21 @@ public class WarcRecorder implements AutoCloseable {

                    WarcRequest warcRequest = new WarcRequest.Builder(requestUri)
                            .blockDigest(requestDigestBuilder.build())
-                            .date(date)
+                            .date(requestDate)
                            .body(MediaType.HTTP_REQUEST, httpRequestString)
                            .build();

                    warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
                    writer.write(warcRequest);

-                    if (hasCookies()) {
-                        extraHeaders.put("X-Has-Cookies", List.of("1"));
+
+                    if (cookies.hasCookies()) {
+                        response.addHeader("X-Has-Cookies", 1);
                    }

-                    byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
+                    byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response,
+                            Duration.between(requestDate, responseDate),
+                            inputBuffer.size()).getBytes(StandardCharsets.UTF_8);

                    ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);

@@ -178,7 +173,7 @@ public class WarcRecorder implements AutoCloseable {

                    WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
                            .blockDigest(responseDigestBuilder.build())
-                            .date(date)
+                            .date(responseDate)
                            .concurrentTo(warcRequest.id())
                            .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());

@@ -193,7 +188,7 @@ public class WarcRecorder implements AutoCloseable {
                    warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
                    writer.write(warcResponse);

-                    if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
+                    if (Duration.between(requestDate, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
                            && inputBuffer.size() < 2048
                            && !requestUri.getPath().endsWith("robots.txt")) // don't bail on robots.txt
                    {
@@ -205,7 +200,7 @@ public class WarcRecorder implements AutoCloseable {

                        logger.warn("URL {} took too long to fetch ({}s) and was too small for the effort ({}b)",
                                requestUri,
-                                Duration.between(date, Instant.now()).getSeconds(),
+                                Duration.between(requestDate, Instant.now()).getSeconds(),
                                inputBuffer.size()
                        );

@@ -259,7 +254,7 @@ public class WarcRecorder implements AutoCloseable {
        writer.write(item);
    }

-    private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags contentTags) {
+    private void saveOldResponse(EdgeUrl url, DomainCookies domainCookies, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags contentTags) {
        try {
            WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
            WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
@@ -320,7 +315,7 @@ public class WarcRecorder implements AutoCloseable {
                    .date(Instant.now())
                    .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());

-            if (hasCookies()) {
+            if (domainCookies.hasCookies() || (headers != null && headers.contains("Set-Cookie:"))) {
                builder.addHeader("X-Has-Cookies", "1");
            }

@@ -340,8 +335,8 @@ public class WarcRecorder implements AutoCloseable {
     * an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified.  In this
     * scenario we want to record the data as it was in the previous crawl, but not re-fetch it.
     */
-    public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags ctags) {
-        saveOldResponse(url, contentType, statusCode, documentBody, headers, ctags);
+    public void writeReferenceCopy(EdgeUrl url, DomainCookies cookies, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags ctags) {
+        saveOldResponse(url, cookies, contentType, statusCode, documentBody, headers, ctags);
    }

    public void writeWarcinfoHeader(String ip, EdgeDomain domain, HttpFetcherImpl.DomainProbeResult result) throws IOException {
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/logic/DomainLocks.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/logic/DomainLocks.java
@@ -3,6 +3,7 @@ package nu.marginalia.crawl.logic;
 import nu.marginalia.model.EdgeDomain;

 import java.util.Map;
+import java.util.Optional;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.Semaphore;

@@ -19,8 +20,22 @@ public class DomainLocks {
     * and may be held by another thread.  The caller is responsible for locking and  releasing the lock.
     */
    public DomainLock lockDomain(EdgeDomain domain) throws InterruptedException {
-        return new DomainLock(domain.toString(),
-                locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits));
+        var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
+
+        sem.acquire();
+
+        return new DomainLock(sem);
+    }
+
+    /** Non-blocking counterpart to lockDomain: yields a lock only if a permit is free right now. */
+    public Optional<DomainLock> tryLockDomain(EdgeDomain domain) {
+        Semaphore permits = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
+
+        if (!permits.tryAcquire(1)) {
+            // No permit available, so report that no lock was taken
+            return Optional.empty();
+        }
+        return Optional.of(new DomainLock(permits));
+    }

    private Semaphore defaultPermits(String topDomain) {
@@ -28,23 +43,27 @@ public class DomainLocks {
            return new Semaphore(16);
        if (topDomain.equals("blogspot.com"))
            return new Semaphore(8);
-
+        if (topDomain.equals("tumblr.com"))
+            return new Semaphore(8);
        if (topDomain.equals("neocities.org"))
-            return new Semaphore(4);
+            return new Semaphore(8);
        if (topDomain.equals("github.io"))
-            return new Semaphore(4);
+            return new Semaphore(8);

+        // Substack really dislikes broad-scale crawlers, so we need to be careful
+        // to not get blocked.
        if (topDomain.equals("substack.com")) {
            return new Semaphore(1);
        }
-        if (topDomain.endsWith(".edu")) {
-            return new Semaphore(1);
-        }

        return new Semaphore(2);
    }

-    public boolean canLock(EdgeDomain domain) {
+    /** Returns true if the domain is lockable, i.e. if it is not already locked by another thread.
+     * (this is just a hint, and does not guarantee that the domain is actually lockable any time
+     * after this method returns true)
+     */
+    public boolean isLockableHint(EdgeDomain domain) {
        Semaphore sem = locks.get(domain.topDomain.toLowerCase());
        if (null == sem)
            return true;
@@ -53,22 +72,16 @@ public class DomainLocks {
    }

    public static class DomainLock implements AutoCloseable {
-        private final String domainName;
        private final Semaphore semaphore;

-        DomainLock(String domainName, Semaphore semaphore) throws InterruptedException {
-            this.domainName = domainName;
+        DomainLock(Semaphore semaphore) {
            this.semaphore = semaphore;
-
-            Thread.currentThread().setName("crawling:" + domainName + " [await domain lock]");
-            semaphore.acquire();
-            Thread.currentThread().setName("crawling:" + domainName);
        }

        @Override
        public void close() throws Exception {
            semaphore.release();
-            Thread.currentThread().setName("crawling:" + domainName + " [wrapping up]");
+            Thread.currentThread().setName("[idle]");
        }
    }
 }
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
@@ -6,6 +6,7 @@ import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.crawl.DomainStateDb;
 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.logic.LinkFilterSelector;
@@ -51,6 +52,7 @@ public class CrawlerRetreiver implements AutoCloseable {
    private final DomainStateDb domainStateDb;
    private final WarcRecorder warcRecorder;
    private final CrawlerRevisitor crawlerRevisitor;
+    private final DomainCookies cookies = new DomainCookies();

    private static final CrawlerConnectionThrottle connectionThrottle = new CrawlerConnectionThrottle(
            Duration.ofSeconds(1) // pace the connections to avoid network congestion at startup
@@ -124,7 +126,7 @@ public class CrawlerRetreiver implements AutoCloseable {
                    }

                    Instant recrawlStart = Instant.now();
-                    CrawlerRevisitor.RecrawlMetadata recrawlMetadata = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
+                    CrawlerRevisitor.RecrawlMetadata recrawlMetadata = crawlerRevisitor.recrawl(oldCrawlData, cookies, robotsRules, delayTimer);
                    Duration recrawlTime = Duration.between(recrawlStart, Instant.now());

                    // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
@@ -274,7 +276,7 @@ public class CrawlerRetreiver implements AutoCloseable {
        try {
            var url = rootUrl.withPathAndParam("/", null);

-            HttpFetchResult result = fetcher.fetchContent(url, warcRecorder, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+            HttpFetchResult result = fetcher.fetchContent(url, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
            timer.waitFetchDelay(0);

            if (result instanceof HttpFetchResult.ResultRedirect(EdgeUrl location)) {
@@ -337,7 +339,7 @@ public class CrawlerRetreiver implements AutoCloseable {

            // Grab the favicon if it exists

-            if (fetcher.fetchContent(faviconUrl, warcRecorder, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED) instanceof HttpFetchResult.ResultOk iconResult) {
+            if (fetcher.fetchContent(faviconUrl, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED) instanceof HttpFetchResult.ResultOk iconResult) {
                String contentType = iconResult.header("Content-Type");
                byte[] iconData = iconResult.getBodyBytes();

@@ -407,7 +409,7 @@ public class CrawlerRetreiver implements AutoCloseable {
        if (parsedOpt.isEmpty())
            return false;

-        HttpFetchResult result = fetcher.fetchContent(parsedOpt.get(), warcRecorder, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+        HttpFetchResult result = fetcher.fetchContent(parsedOpt.get(), warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
        timer.waitFetchDelay(0);

        if (!(result instanceof HttpFetchResult.ResultOk ok)) {
@@ -435,7 +437,7 @@ public class CrawlerRetreiver implements AutoCloseable {
    {
        var contentTags = reference.getContentTags();

-        HttpFetchResult fetchedDoc = fetcher.fetchContent(top, warcRecorder, timer, contentTags, HttpFetcher.ProbeType.FULL);
+        HttpFetchResult fetchedDoc = fetcher.fetchContent(top, warcRecorder, cookies, timer, contentTags, HttpFetcher.ProbeType.FULL);
        timer.waitFetchDelay();

        if (Thread.interrupted()) {
@@ -461,7 +463,7 @@ public class CrawlerRetreiver implements AutoCloseable {
                {
                    var doc = reference.doc();

-                    warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBodyBytes, doc.headers, contentTags);
+                    warcRecorder.writeReferenceCopy(top, cookies, doc.contentType, doc.httpStatus, doc.documentBodyBytes, doc.headers, contentTags);

                    fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
                            new ContentType(doc.contentType, "UTF-8"),
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
@@ -2,6 +2,7 @@ package nu.marginalia.crawl.retreival.revisit;

 import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.CrawlDataReference;
 import nu.marginalia.crawl.retreival.CrawlDelayTimer;
@@ -37,6 +38,7 @@ public class CrawlerRevisitor {

    /** Performs a re-crawl of old documents, comparing etags and last-modified */
    public RecrawlMetadata recrawl(CrawlDataReference oldCrawlData,
+                       DomainCookies cookies,
                       SimpleRobotRules robotsRules,
                       CrawlDelayTimer delayTimer)
    throws InterruptedException {
@@ -72,7 +74,7 @@ public class CrawlerRevisitor {

            // If the reference document is empty or the HTTP status is not 200, we'll skip it since it's
            // unlikely to produce anything meaningful for us.
-            if (doc.httpStatus != 200)
+            if (doc.httpStatus != 200 && doc.httpStatus != 206)
                continue;
            if (!doc.hasBody())
                continue;
@@ -132,6 +134,7 @@ public class CrawlerRevisitor {
                }
                // Add a WARC record so we don't repeat this
                warcRecorder.writeReferenceCopy(url,
+                        cookies,
                        doc.contentType,
                        doc.httpStatus,
                        doc.documentBodyBytes,
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java
@@ -58,7 +58,7 @@ public record DocumentWithReference(
        if (null == doc)
            return ContentTags.empty();

-        if (doc.documentBodyBytes.length == 0 || doc.httpStatus != 200)
+        if (doc.documentBodyBytes.length == 0 || (doc.httpStatus != 200 && doc.httpStatus != 206))
            return ContentTags.empty();

        String lastmod = doc.getLastModified();
--- a/code/processes/crawling-process/model/java/nu/marginalia/ContentTypes.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/ContentTypes.java
@@ -1,22 +1,32 @@
 package nu.marginalia;

+import org.apache.commons.lang3.StringUtils;
+
 import java.util.Set;

 public class ContentTypes {
    public static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
            "application/xhtml",
            "text/html",
+            "text/markdown",
+            "text/x-markdown",
+            "application/pdf",
            "image/x-icon",
            "text/plain");

    public static boolean isAccepted(String contentTypeHeader) {
-        String lcHeader = contentTypeHeader.toLowerCase();
+        String lcHeader = StringUtils.substringBefore(contentTypeHeader.toLowerCase(), ';');
        for (var type : acceptedContentTypes) {
-            if (lcHeader.startsWith(type)) {
+            if (lcHeader.equals(type)) {
                return true;
            }
        }
        return false;
    }

+    public static boolean isBinary(String contentTypeHeader) {
+        String lcHeader = StringUtils.substringBefore(contentTypeHeader.toLowerCase(), ';');
+        return lcHeader.startsWith("application/pdf");
+    }
+
 }
--- a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java
@@ -148,6 +148,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
                nextRecord.body,
                // this field isn't actually used, maybe we can skip calculating it?
                nextRecord.cookies,
+                -1,
                lastModified,
                etag));
    }
--- a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/SlopSerializableCrawlDataStream.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/SlopSerializableCrawlDataStream.java
@@ -37,8 +37,12 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
            public boolean filter(String url, int status, String contentType) {
                String ctLc = contentType.toLowerCase();

+                // Permit all plain text content types
                if (ctLc.startsWith("text/"))
                    return true;
+                // PDF
+                else if (ctLc.startsWith("application/pdf"))
+                    return true;
                else if (ctLc.startsWith("x-marginalia/"))
                    return true;

@@ -162,6 +166,7 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
                nextRecord.body(),
                // this field isn't actually used, maybe we can skip calculating it?
                nextRecord.cookies(),
+                nextRecord.requestTimeMs(),
                null,
                null));
    }
--- a/code/processes/crawling-process/model/java/nu/marginalia/model/body/ContentTypeLogic.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/model/body/ContentTypeLogic.java
@@ -10,7 +10,7 @@ import java.util.regex.Pattern;

 public class ContentTypeLogic {

-    private static final Predicate<String> probableHtmlPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md)$").asMatchPredicate();
+    private static final Predicate<String> probableGoodPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md|pdf)$").asMatchPredicate();
    private static final Predicate<String> probableBinaryPattern = Pattern.compile("^.*\\.[a-z]+$").asMatchPredicate();
    private static final Set<String> blockedContentTypes = Set.of("text/css", "text/javascript");
    private static final List<String> acceptedContentTypePrefixes = List.of(
@@ -22,6 +22,7 @@ public class ContentTypeLogic {
            "application/rss+xml",
            "application/x-rss+xml",
            "application/rdf+xml",
+            "application/pdf",
            "x-rss+xml"
    );
    private boolean allowAllContentTypes = false;
@@ -34,7 +35,7 @@ public class ContentTypeLogic {
    public boolean isUrlLikeBinary(EdgeUrl url) {
        String pathLowerCase = url.path.toLowerCase();

-        if (probableHtmlPattern.test(pathLowerCase))
+        if (probableGoodPattern.test(pathLowerCase))
            return false;

        return probableBinaryPattern.test(pathLowerCase);
--- a/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java
@@ -23,6 +23,7 @@ public final class CrawledDocument implements SerializableCrawlData {

    public String crawlerStatus;
    public String crawlerStatusDesc;
+    public int requestTimeMs;

    @Nullable
    public String headers;
@@ -82,7 +83,7 @@ public final class CrawledDocument implements SerializableCrawlData {
    public String lastModifiedMaybe;
    public String etagMaybe;

-    public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, byte[] documentBodyBytes, Boolean hasCookies, String lastModifiedMaybe, String etagMaybe) {
+    public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, byte[] documentBodyBytes, Boolean hasCookies, int requestTimeMs, String lastModifiedMaybe, String etagMaybe) {
        this.crawlId = crawlId;
        this.url = url;
        this.contentType = contentType;
@@ -94,6 +95,7 @@ public final class CrawledDocument implements SerializableCrawlData {
        this.documentBodyBytes = Objects.requireNonNullElse(documentBodyBytes, new byte[] {});
        this.hasCookies = hasCookies;
        this.lastModifiedMaybe = lastModifiedMaybe;
+        this.requestTimeMs = requestTimeMs;
        this.etagMaybe = etagMaybe;
    }

@@ -173,6 +175,7 @@ public final class CrawledDocument implements SerializableCrawlData {
        private byte[] documentBodyBytes = new byte[0];
        private String recrawlState;
        private Boolean hasCookies;
+        private int requestTimeMs;
        private String lastModifiedMaybe;
        private String etagMaybe;

@@ -248,8 +251,13 @@ public final class CrawledDocument implements SerializableCrawlData {
            return this;
        }

+        public CrawledDocumentBuilder requestTimeMs(int requestTimeMs) {
+            this.requestTimeMs = requestTimeMs;
+            return this;
+        }
+
        public CrawledDocument build() {
-            return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBodyBytes, this.hasCookies, this.lastModifiedMaybe, this.etagMaybe);
+            return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBodyBytes, this.hasCookies, this.requestTimeMs, this.lastModifiedMaybe, this.etagMaybe);
        }

        public String toString() {
--- a/code/processes/crawling-process/model/java/nu/marginalia/slop/SlopCrawlDataRecord.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/slop/SlopCrawlDataRecord.java
@@ -9,6 +9,7 @@ import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecord;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
 import nu.marginalia.slop.column.array.ByteArrayColumn;
 import nu.marginalia.slop.column.primitive.ByteColumn;
+import nu.marginalia.slop.column.primitive.IntColumn;
 import nu.marginalia.slop.column.primitive.LongColumn;
 import nu.marginalia.slop.column.primitive.ShortColumn;
 import nu.marginalia.slop.column.string.EnumColumn;
@@ -39,6 +40,7 @@ public record SlopCrawlDataRecord(String domain,
                                  long timestamp,
                                  String contentType,
                                  byte[] body,
+                                  int requestTimeMs,
                                  String headers)
 {
    private static final EnumColumn domainColumn = new EnumColumn("domain", StandardCharsets.UTF_8, StorageType.ZSTD);
@@ -49,6 +51,7 @@ public record SlopCrawlDataRecord(String domain,
    private static final LongColumn timestampColumn = new LongColumn("timestamp");
    private static final EnumColumn contentTypeColumn = new EnumColumn("contentType", StandardCharsets.UTF_8);
    private static final ByteArrayColumn bodyColumn = new ByteArrayColumn("body", StorageType.ZSTD);
+    private static final ShortColumn requestTimeColumn = new ShortColumn("requestTimeMs");
    private static final StringColumn headerColumn = new StringColumn("header", StandardCharsets.UTF_8, StorageType.ZSTD);

    public SlopCrawlDataRecord(CrawledDocumentParquetRecord parquetRecord) {
@@ -60,6 +63,7 @@ public record SlopCrawlDataRecord(String domain,
                parquetRecord.timestamp.toEpochMilli(),
                parquetRecord.contentType,
                parquetRecord.body,
+                -1,
                parquetRecord.headers
                );
    }
@@ -74,6 +78,7 @@ public record SlopCrawlDataRecord(String domain,
                date.toEpochMilli(),
                "x-marginalia/advisory;state=redirect",
                new byte[0],
+                -1,
                ""
        );
    }
@@ -87,6 +92,7 @@ public record SlopCrawlDataRecord(String domain,
                date.toEpochMilli(),
                "x-marginalia/advisory;state=error",
                errorStatus.getBytes(),
+                -1,
                ""
        );
    }
@@ -100,6 +106,7 @@ public record SlopCrawlDataRecord(String domain,
                date.toEpochMilli(),
                errorStatus,
                new byte[0],
+                -1,
                ""
        );
    }
@@ -158,11 +165,12 @@ public record SlopCrawlDataRecord(String domain,
                        // and is used to store old responses from previous crawls; in this part of the logic
                        // we treat them the same as a normal response

-                        if (!filterResponse(uaString, response)) {
+                        var filterStatus = filterResponse(uaString, response);
+                        if (filterStatus.isRejected()) {
                            continue;
                        }

-                        slopWriter.write(domain, response);
+                        slopWriter.write(domain, filterStatus, response);
                    } else if (record instanceof WarcXEntityRefused refused) {
                        slopWriter.write(domain, refused);
                    } else if (record instanceof Warcinfo warcinfo) {
@@ -187,25 +195,35 @@ public record SlopCrawlDataRecord(String domain,
        }
    }

-
+    /** Outcome of filterResponse(): whether a WARC response is kept, and under which content type. */
+    sealed interface ResponseFilterResult {
+        /** True only for Reject; every accepting variant answers false. */
+        default boolean isRejected() { return this instanceof Reject; }
+        record Accept() implements ResponseFilterResult {}
+        // The two variants below carry a replacement content type for the stored record
+        record AcceptWithContentType(String contentType) implements ResponseFilterResult {}
+        record AcceptIfPlainText(String contentType) implements ResponseFilterResult {}
+        record Reject() implements ResponseFilterResult {}
+    }

    /** Return true if the WarcResponse should be excluded from conversion */
-    private static boolean filterResponse(String uaString, WarcResponse response) throws IOException {
+    private static ResponseFilterResult filterResponse(String uaString, WarcResponse response) throws IOException {

        // We don't want to store robots.txt files, as they are not
        // interesting for the analysis we want to do.  This is important
        // since txt-files in general are interesting, and we don't want to
        // exclude them as a class.

-        if (response.targetURI().getPath().equals("/robots.txt")) {
-            return false;
+        String uriPath = response.targetURI().getPath();
+        if (uriPath.equals("/robots.txt")) {
+            return new ResponseFilterResult.Reject();
        }

        var headers = response.http().headers();
        var robotsTags = headers.all("X-Robots-Tag");

        if (!isXRobotsTagsPermitted(robotsTags, uaString)) {
-            return false;
+            return new ResponseFilterResult.Reject();
        }

        // Strip out responses with content types we aren't interested in
@@ -213,10 +231,29 @@ public record SlopCrawlDataRecord(String domain,
        String contentType = headers.first("Content-Type").orElse("text/plain").toLowerCase();

        if (!ContentTypes.isAccepted(contentType)) {
-            return false;
+            String contentTypeWithoutParams = StringUtils.substringBefore(contentType, ";");
+
+            // Some servers don't understand what a markdown file is
+            if (contentTypeWithoutParams.equals("application/octet-stream")) {
+                if (uriPath.endsWith(".md")) {
+                    // This is a markdown file, which we want to keep
+                    return new ResponseFilterResult.AcceptIfPlainText("text/markdown");
+                }
+                else if (uriPath.endsWith(".pdf")) {
+                    // This is a PDF file, which we want to keep
+                    return new ResponseFilterResult.AcceptWithContentType("application/pdf");
+                }
+            }
+
+            return new ResponseFilterResult.Reject();
        }

-        return true;
+        // If the format is binary, we don't want to translate it if the response is truncated
+        if (response.truncated() != WarcTruncationReason.NOT_TRUNCATED && ContentTypes.isBinary(contentType)) {
+            return new ResponseFilterResult.Reject();
+        }
+
+        return new ResponseFilterResult.Accept();
    }

    /**  Check X-Robots-Tag header tag to see if we are allowed to index this page.
@@ -272,7 +309,8 @@ public record SlopCrawlDataRecord(String domain,
        try (var table = new SlopTable(path)) {
            ShortColumn.Reader statusReader = statusColumn.open(table);
            while (statusReader.hasRemaining()) {
-                if (statusReader.get() == 200) {
+                int status = statusReader.get();
+                if (status == 200 || status == 206) {
                    cnt++;
                }
            }
@@ -290,6 +328,7 @@ public record SlopCrawlDataRecord(String domain,
        private final LongColumn.Writer timestampColumnWriter;
        private final EnumColumn.Writer contentTypeColumnWriter;
        private final ByteArrayColumn.Writer bodyColumnWriter;
+        private final ShortColumn.Writer requestTimeColumnWriter;
        private final StringColumn.Writer headerColumnWriter;

        public Writer(Path path) throws IOException {
@@ -303,6 +342,7 @@ public record SlopCrawlDataRecord(String domain,
            timestampColumnWriter = timestampColumn.create(this);
            contentTypeColumnWriter = contentTypeColumn.create(this);
            bodyColumnWriter = bodyColumn.create(this);
+            requestTimeColumnWriter = requestTimeColumn.create(this);
            headerColumnWriter = headerColumn.create(this);
        }

@@ -315,10 +355,11 @@ public record SlopCrawlDataRecord(String domain,
            timestampColumnWriter.put(record.timestamp);
            contentTypeColumnWriter.put(record.contentType);
            bodyColumnWriter.put(record.body);
+            requestTimeColumnWriter.put((short) Math.min(record.requestTimeMs, Short.MAX_VALUE)); // clamp: a bare (short) cast wraps times above 32767 ms to negative values
            headerColumnWriter.put(record.headers);
        }

-        public void write(String domain, WarcResponse response) throws IOException {
+        public void write(String domain, ResponseFilterResult filterStatus, WarcResponse response) throws IOException {

            HttpFetchResult result = HttpFetchResult.importWarc(response);
            if (!(result instanceof HttpFetchResult.ResultOk fetchOk)) {
@@ -341,14 +382,39 @@ public record SlopCrawlDataRecord(String domain,
                contentType = "";
            }

+            switch (filterStatus) { // apply the content-type decision made by the response filter
+                case ResponseFilterResult.AcceptWithContentType(String ct) -> contentType = ct;
+                case ResponseFilterResult.AcceptIfPlainText(String ct) -> {
+                    try {
+                        // Validate strictly: new String(bytes, UTF_8) never throws, it silently substitutes U+FFFD for malformed input
+                        StandardCharsets.UTF_8.newDecoder().decode(java.nio.ByteBuffer.wrap(bodyBytes));
+                        contentType = ct;
+                    }
+                    catch (RuntimeException | java.nio.charset.CharacterCodingException ex) { // UTF-8 decoding failed
+                        return;
+                    }
+                }
+                default -> {}
+            }
+
            boolean hasCookies = false;

            String headersStr;
            StringJoiner headersStrBuilder = new StringJoiner("\n");
+            int requestTimeMs = -1;
            for (var header : headers) {
                if (header.getName().equalsIgnoreCase("X-Cookies") && "1".equals(header.getValue())) {
                    hasCookies = true;
                }
+                if (header.getName().equals("X-Marginalia-Response-Time")) {
+                    try {
+                        requestTimeMs = Integer.parseInt(header.getValue());
+                    }
+                    catch (NumberFormatException ex) {
+                        logger.warn("Failed to parse X-Marginalia-Response-Time header: {}", header.getValue());
+                    }
+                    continue;
+                }
                headersStrBuilder.add(header.getName() + ": " + header.getValue());
            }
            headersStr = headersStrBuilder.toString();
@@ -363,6 +429,7 @@ public record SlopCrawlDataRecord(String domain,
                    response.date().toEpochMilli(),
                    contentType,
                    bodyBytes,
+                    requestTimeMs,
                    headersStr
                )
            );
@@ -415,6 +482,7 @@ public record SlopCrawlDataRecord(String domain,
        private final LongColumn.Reader timestampColumnReader;
        private final EnumColumn.Reader contentTypeColumnReader;
        private final ByteArrayColumn.Reader bodyColumnReader;
+        private final ShortColumn.Reader requestTimeColumnReader;
        private final StringColumn.Reader headerColumnReader;

        public Reader(Path path) throws IOException {
@@ -429,6 +497,17 @@ public record SlopCrawlDataRecord(String domain,
            contentTypeColumnReader = contentTypeColumn.open(this);
            bodyColumnReader = bodyColumn.open(this);
            headerColumnReader = headerColumn.open(this);
+
+            // FIXME: After 2025-06-XX, we can remove this migration workaround
+            ShortColumn.Reader timeColumnReader;
+            try {
+                timeColumnReader = requestTimeColumn.open(this);
+            }
+            catch (Exception ex) {
+                // Migration workaround
+                timeColumnReader = null;
+            }
+            requestTimeColumnReader = timeColumnReader;
        }

        public SlopCrawlDataRecord get() throws IOException {
@@ -441,6 +520,7 @@ public record SlopCrawlDataRecord(String domain,
                    timestampColumnReader.get(),
                    contentTypeColumnReader.get(),
                    bodyColumnReader.get(),
+                    requestTimeColumnReader != null ? requestTimeColumnReader.get() : -1,
                    headerColumnReader.get()
            );
        }
@@ -460,6 +540,7 @@ public record SlopCrawlDataRecord(String domain,
        private final LongColumn.Reader timestampColumnReader;
        private final EnumColumn.Reader contentTypeColumnReader;
        private final ByteArrayColumn.Reader bodyColumnReader;
+        private final ShortColumn.Reader requestTimeColumnReader;
        private final StringColumn.Reader headerColumnReader;

        private SlopCrawlDataRecord next = null;
@@ -476,6 +557,17 @@ public record SlopCrawlDataRecord(String domain,
            contentTypeColumnReader = contentTypeColumn.open(this);
            bodyColumnReader = bodyColumn.open(this);
            headerColumnReader = headerColumn.open(this);
+
+            // FIXME: After 2025-06-XX, we can remove this migration workaround
+            ShortColumn.Reader timeColumnReader;
+            try {
+                timeColumnReader = requestTimeColumn.open(this);
+            }
+            catch (Exception ex) {
+                // Migration workaround
+                timeColumnReader = null;
+            }
+            requestTimeColumnReader = timeColumnReader;
        }

        public abstract boolean filter(String url, int status, String contentType);
@@ -502,6 +594,7 @@ public record SlopCrawlDataRecord(String domain,
                boolean cookies = cookiesColumnReader.get() == 1;
                int status = statusColumnReader.get();
                long timestamp = timestampColumnReader.get();
+                int requestTimeMs = requestTimeColumnReader != null ? requestTimeColumnReader.get() : -1;
                String contentType = contentTypeColumnReader.get();

                LargeItem<byte[]> body = bodyColumnReader.getLarge();
@@ -509,7 +602,7 @@ public record SlopCrawlDataRecord(String domain,

                if (filter(url, status, contentType)) {
                    next = new SlopCrawlDataRecord(
-                            domain, url, ip, cookies, status, timestamp, contentType, body.get(), headers.get()
+                            domain, url, ip, cookies, status, timestamp, contentType, body.get(), requestTimeMs, headers.get()
                    );
                    return true;
                }
--- a/code/processes/crawling-process/test/nu/marginalia/crawl/fetcher/HttpFetcherImplContentTypeProbeTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/fetcher/HttpFetcherImplContentTypeProbeTest.java
@@ -11,6 +11,8 @@ import org.junit.jupiter.api.*;
 import java.io.IOException;
 import java.net.URISyntaxException;

+import static org.junit.jupiter.api.Assertions.assertEquals;
+
@Tag("slow")
 class HttpFetcherImplContentTypeProbeTest {

@@ -85,55 +87,59 @@ class HttpFetcherImplContentTypeProbeTest {

    @AfterEach
    public void tearDown() throws IOException {
+        var stats = fetcher.getPoolStats();
+        assertEquals(0, stats.getLeased());
+        assertEquals(0, stats.getPending());
+
        fetcher.close();
    }

    @Test
    public void testProbeContentTypeHtmlShortcircuitPath() throws URISyntaxException {
-        var result = fetcher.probeContentType(new EdgeUrl("https://localhost/test.html"), new CrawlDelayTimer(50), ContentTags.empty());
-        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Ok.class, result);
+        var result = fetcher.probeContentType(new EdgeUrl("https://localhost/test.html"), new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
+        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.NoOp.class, result);
    }


    @Test
    public void testProbeContentTypeHtmlShortcircuitTags() {
-        var result = fetcher.probeContentType(contentTypeBinaryUrl, new CrawlDelayTimer(50), new ContentTags("a", "b"));
-        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Ok.class, result);
+        var result = fetcher.probeContentType(contentTypeBinaryUrl, new DomainCookies(), new CrawlDelayTimer(50), new ContentTags("a", "b"));
+        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.NoOp.class, result);
    }

    @Test
    public void testProbeContentTypeHtml() {
-        var result = fetcher.probeContentType(contentTypeHtmlUrl, new CrawlDelayTimer(50), ContentTags.empty());
+        var result = fetcher.probeContentType(contentTypeHtmlUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
        Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Ok(contentTypeHtmlUrl), result);
    }

    @Test
    public void testProbeContentTypeBinary() {
-        var result = fetcher.probeContentType(contentTypeBinaryUrl, new CrawlDelayTimer(50), ContentTags.empty());
+        var result = fetcher.probeContentType(contentTypeBinaryUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
        Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.BadContentType("application/octet-stream", 200), result);
    }

    @Test
    public void testProbeContentTypeRedirect() {
-        var result = fetcher.probeContentType(redirectUrl, new CrawlDelayTimer(50), ContentTags.empty());
+        var result = fetcher.probeContentType(redirectUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
        Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Redirect(contentTypeHtmlUrl), result);
    }

    @Test
    public void testProbeContentTypeBadHttpStatus() {
-        var result = fetcher.probeContentType(badHttpStatusUrl, new CrawlDelayTimer(50), ContentTags.empty());
+        var result = fetcher.probeContentType(badHttpStatusUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
        Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.HttpError(500, "Bad status code"), result);
    }

    @Test
    public void testOnlyGetAllowed() {
-        var result = fetcher.probeContentType(onlyGetAllowedUrl, new CrawlDelayTimer(50), ContentTags.empty());
+        var result = fetcher.probeContentType(onlyGetAllowedUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
        Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Ok(onlyGetAllowedUrl), result);
    }

    @Test
    public void testTimeout() {
-        var result = fetcher.probeContentType(timeoutUrl, new CrawlDelayTimer(50), ContentTags.empty());
+        var result = fetcher.probeContentType(timeoutUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Timeout.class, result);
    }

--- a/code/processes/crawling-process/test/nu/marginalia/crawl/fetcher/HttpFetcherImplDomainProbeTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/fetcher/HttpFetcherImplDomainProbeTest.java
@@ -12,6 +12,8 @@ import org.junit.jupiter.api.*;
 import java.io.IOException;
 import java.net.URISyntaxException;

+import static org.junit.jupiter.api.Assertions.assertEquals;
+
@Tag("slow")
 class HttpFetcherImplDomainProbeTest {

@@ -47,6 +49,10 @@ class HttpFetcherImplDomainProbeTest {

    @AfterEach
    public void tearDown() throws IOException {
+        var stats = fetcher.getPoolStats();
+        assertEquals(0, stats.getLeased());
+        assertEquals(0, stats.getPending());
+
        fetcher.close();
    }

--- a/code/processes/crawling-process/test/nu/marginalia/crawl/fetcher/HttpFetcherImplFetchTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/fetcher/HttpFetcherImplFetchTest.java
@@ -31,6 +31,7 @@ class HttpFetcherImplFetchTest {
    private static String lastModified = "Wed, 21 Oct 2024 07:28:00 GMT";

    private static EdgeUrl okUrl;
+    private static EdgeUrl okUrlSetsCookie;
    private static EdgeUrl okRangeResponseUrl;
    private static EdgeUrl okUrlWith304;

@@ -39,6 +40,8 @@ class HttpFetcherImplFetchTest {
    private static EdgeUrl badHttpStatusUrl;
    private static EdgeUrl keepAliveUrl;

+    private static EdgeUrl pdfUrl;
+
    @BeforeAll
    public static void setupAll() throws URISyntaxException {
        wireMockServer =
@@ -88,6 +91,19 @@ class HttpFetcherImplFetchTest {
                        .withStatus(200)
                        .withBody("Hello World")));

+        okUrlSetsCookie = new EdgeUrl("http://localhost:18089/okSetCookie.bin");
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(okUrlSetsCookie.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withHeader("Set-Cookie", "test=1")
+                        .withStatus(200)));
+        wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(okUrlSetsCookie.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withHeader("Set-Cookie", "test=1")
+                        .withStatus(200)
+                        .withBody("Hello World")));
+
        okUrlWith304 = new EdgeUrl("http://localhost:18089/ok304.bin");
        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(okUrlWith304.path))
                .willReturn(WireMock.aResponse()
@@ -117,6 +133,15 @@ class HttpFetcherImplFetchTest {
                        .withHeader("Keep-Alive", "max=4, timeout=30")
                        .withBody("Hello")
                        ));
+
+
+        pdfUrl = new EdgeUrl("http://localhost:18089/test.pdf");
+        wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(pdfUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "application/pdf")
+                        .withStatus(200)
+                        .withBody("Hello World")));
+
        wireMockServer.start();

    }
@@ -134,20 +159,31 @@ class HttpFetcherImplFetchTest {
    public void setUp() throws IOException {
        fetcher = new HttpFetcherImpl(new UserAgent("test.marginalia.nu", "test.marginalia.nu"));
        warcFile = Files.createTempFile(getClass().getSimpleName(), ".warc");
-        warcRecorder = new WarcRecorder(warcFile, fetcher);
+        warcRecorder = new WarcRecorder(warcFile);
    }

    @AfterEach
    public void tearDown() throws IOException {
+        var stats = fetcher.getPoolStats();
+        assertEquals(0, stats.getLeased());
+        assertEquals(0, stats.getPending());
+
+        System.out.println(stats);
+
        fetcher.close();
        warcRecorder.close();
        Files.deleteIfExists(warcFile);
    }


+    @Test
+    public void testFoo() {
+        fetcher.fetchSitemapUrls("https://www.marginalia.nu/sitemap.xml", new CrawlDelayTimer(100));
+    }
+
    @Test
    public void testOk_NoProbe() throws IOException {
-        var result = fetcher.fetchContent(okUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+        var result = fetcher.fetchContent(okUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);

        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
        Assertions.assertTrue(result.isOk());
@@ -158,12 +194,29 @@ class HttpFetcherImplFetchTest {
        Assertions.assertInstanceOf(WarcResponse.class, warcRecords.get(1));

        WarcResponse response = (WarcResponse) warcRecords.get(1);
-        assertEquals("0", response.headers().first("X-Has-Cookies").orElse("0"));
+        assertEquals("0", response.http().headers().first("X-Has-Cookies").orElse("0"));
+    }
+
+    @Test
+    public void testOkSetsCookie() throws IOException {
+        var cookies = new DomainCookies();
+        var result = fetcher.fetchContent(okUrlSetsCookie, warcRecorder, cookies, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+
+        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
+        Assertions.assertTrue(result.isOk());
+
+        List<WarcRecord> warcRecords = getWarcRecords();
+        assertEquals(2, warcRecords.size());
+        Assertions.assertInstanceOf(WarcRequest.class, warcRecords.get(0));
+        Assertions.assertInstanceOf(WarcResponse.class, warcRecords.get(1));
+
+        WarcResponse response = (WarcResponse) warcRecords.get(1);
+        assertEquals("1", response.http().headers().first("X-Has-Cookies").orElse("0"));
    }

    @Test
    public void testOk_FullProbe() {
-        var result = fetcher.fetchContent(okUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+        var result = fetcher.fetchContent(okUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);

        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
        Assertions.assertTrue(result.isOk());
@@ -171,7 +224,7 @@ class HttpFetcherImplFetchTest {

    @Test
    public void testOk304_NoProbe() {
-        var result = fetcher.fetchContent(okUrlWith304, warcRecorder, new CrawlDelayTimer(1000), new ContentTags(etag, lastModified), HttpFetcher.ProbeType.DISABLED);
+        var result = fetcher.fetchContent(okUrlWith304, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), new ContentTags(etag, lastModified), HttpFetcher.ProbeType.DISABLED);

        Assertions.assertInstanceOf(HttpFetchResult.Result304Raw.class, result);
        System.out.println(result);
@@ -180,7 +233,7 @@ class HttpFetcherImplFetchTest {

    @Test
    public void testOk304_FullProbe() {
-        var result = fetcher.fetchContent(okUrlWith304, warcRecorder, new CrawlDelayTimer(1000), new ContentTags(etag, lastModified), HttpFetcher.ProbeType.FULL);
+        var result = fetcher.fetchContent(okUrlWith304, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), new ContentTags(etag, lastModified), HttpFetcher.ProbeType.FULL);

        Assertions.assertInstanceOf(HttpFetchResult.Result304Raw.class, result);
        System.out.println(result);
@@ -188,7 +241,7 @@ class HttpFetcherImplFetchTest {

    @Test
    public void testBadStatus_NoProbe() throws IOException {
-        var result = fetcher.fetchContent(badHttpStatusUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+        var result = fetcher.fetchContent(badHttpStatusUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);

        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
        Assertions.assertFalse(result.isOk());
@@ -202,7 +255,7 @@ class HttpFetcherImplFetchTest {

    @Test
    public void testBadStatus_FullProbe() {
-        var result = fetcher.fetchContent(badHttpStatusUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+        var result = fetcher.fetchContent(badHttpStatusUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);

        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
        Assertions.assertFalse(result.isOk());
@@ -212,7 +265,7 @@ class HttpFetcherImplFetchTest {

    @Test
    public void testRedirect_NoProbe() throws URISyntaxException, IOException {
-        var result = fetcher.fetchContent(redirectUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+        var result = fetcher.fetchContent(redirectUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);

        Assertions.assertInstanceOf(HttpFetchResult.ResultRedirect.class, result);
        assertEquals(new EdgeUrl("http://localhost:18089/test.html.bin"), ((HttpFetchResult.ResultRedirect) result).url());
@@ -225,7 +278,7 @@ class HttpFetcherImplFetchTest {

    @Test
    public void testRedirect_FullProbe() throws URISyntaxException {
-        var result = fetcher.fetchContent(redirectUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+        var result = fetcher.fetchContent(redirectUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);

        Assertions.assertInstanceOf(HttpFetchResult.ResultRedirect.class, result);
        assertEquals(new EdgeUrl("http://localhost:18089/test.html.bin"), ((HttpFetchResult.ResultRedirect) result).url());
@@ -238,7 +291,7 @@ class HttpFetcherImplFetchTest {
    public void testFetchTimeout_NoProbe() throws IOException, URISyntaxException {
        Instant requestStart = Instant.now();

-        var result = fetcher.fetchContent(timeoutUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+        var result = fetcher.fetchContent(timeoutUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);

        Assertions.assertInstanceOf(HttpFetchResult.ResultException.class, result);

@@ -262,7 +315,7 @@ class HttpFetcherImplFetchTest {

    @Test
    public void testRangeResponse() throws IOException {
-        var result = fetcher.fetchContent(okRangeResponseUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+        var result = fetcher.fetchContent(okRangeResponseUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);

        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
        Assertions.assertTrue(result.isOk());
@@ -279,7 +332,7 @@ class HttpFetcherImplFetchTest {
    @Test
    public void testFetchTimeout_Probe() throws IOException, URISyntaxException {
        Instant requestStart = Instant.now();
-        var result = fetcher.fetchContent(timeoutUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+        var result = fetcher.fetchContent(timeoutUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
        Instant requestEnd = Instant.now();

        Assertions.assertInstanceOf(HttpFetchResult.ResultException.class, result);
@@ -302,7 +355,15 @@ class HttpFetcherImplFetchTest {
    @Test
    public void testKeepaliveUrl() {
        // mostly for smoke testing and debugger utility
-        var result = fetcher.fetchContent(keepAliveUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+        var result = fetcher.fetchContent(keepAliveUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+
+        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
+        Assertions.assertTrue(result.isOk());
+    }
+
+    @Test
+    public void testPdf() {
+        var result = fetcher.fetchContent(pdfUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);

        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
        Assertions.assertTrue(result.isOk());
@@ -319,6 +380,13 @@ class HttpFetcherImplFetchTest {
            WarcXEntityRefused.register(reader);

            for (var record : reader) {
+                // Load the body, we need to do this before we close the reader to have access to the content.
+                if (record instanceof WarcRequest req) {
+                    req.http();
+                } else if (record instanceof WarcResponse rsp) {
+                    rsp.http();
+                }
+
                records.add(record);
            }
        }
--- a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizerTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizerTest.java
@@ -1,12 +1,12 @@
 package nu.marginalia.crawl.retreival;

+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import org.apache.hc.client5.http.classic.HttpClient;
-import org.apache.hc.client5.http.cookie.BasicCookieStore;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.impl.classic.HttpClients;
-import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -45,7 +45,7 @@ class CrawlerWarcResynchronizerTest {

    @Test
    void run() throws IOException, URISyntaxException {
-        try (var oldRecorder = new WarcRecorder(fileName, new BasicCookieStore())) {
+        try (var oldRecorder = new WarcRecorder(fileName)) {
            fetchUrl(oldRecorder, "https://www.marginalia.nu/");
            fetchUrl(oldRecorder, "https://www.marginalia.nu/log/");
            fetchUrl(oldRecorder, "https://www.marginalia.nu/feed/");
@@ -55,7 +55,7 @@ class CrawlerWarcResynchronizerTest {

        var crawlFrontier = new DomainCrawlFrontier(new EdgeDomain("www.marginalia.nu"), List.of(), 100);

-        try (var newRecorder = new WarcRecorder(outputFile, new BasicCookieStore())) {
+        try (var newRecorder = new WarcRecorder(outputFile)) {
            new CrawlerWarcResynchronizer(crawlFrontier, newRecorder).run(fileName);
        }

@@ -78,10 +78,10 @@ class CrawlerWarcResynchronizerTest {
    }

    void fetchUrl(WarcRecorder recorder, String url) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
-        var req = ClassicRequestBuilder.get(new java.net.URI(url))
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .build();
-        recorder.fetch(httpClient, req);
+        HttpGet request = new HttpGet(url);
+        request.addHeader("User-agent", "test.marginalia.nu");
+        request.addHeader("Accept-Encoding", "gzip");
+
+        recorder.fetch(httpClient, new DomainCookies(), request);
    }
 }
--- a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/ContentTypeProberTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/ContentTypeProberTest.java
@@ -2,6 +2,7 @@ package nu.marginalia.crawl.retreival.fetcher;

 import com.sun.net.httpserver.HttpServer;
 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.retreival.CrawlDelayTimer;
@@ -88,7 +89,7 @@ class ContentTypeProberTest {

    @Test
    void probeContentTypeOk() throws Exception {
-        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlEndpoint, new CrawlDelayTimer(50), ContentTags.empty());
+        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());

        System.out.println(result);

@@ -97,7 +98,7 @@ class ContentTypeProberTest {

    @Test
    void probeContentTypeRedir() throws Exception {
-        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlRedirEndpoint, new CrawlDelayTimer(50), ContentTags.empty());
+        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlRedirEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());

        System.out.println(result);

@@ -106,7 +107,7 @@ class ContentTypeProberTest {

    @Test
    void probeContentTypeBad() throws Exception {
-        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(binaryEndpoint, new CrawlDelayTimer(50), ContentTags.empty());
+        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(binaryEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());

        System.out.println(result);

@@ -115,7 +116,7 @@ class ContentTypeProberTest {

    @Test
    void probeContentTypeTimeout() throws Exception {
-        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(timeoutEndpoint, new CrawlDelayTimer(50), ContentTags.empty());
+        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(timeoutEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());

        System.out.println(result);

--- a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderFakeServerTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderFakeServerTest.java
@@ -1,11 +1,11 @@
 package nu.marginalia.crawl.retreival.fetcher;

 import com.sun.net.httpserver.HttpServer;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import org.apache.hc.client5.http.classic.HttpClient;
-import org.apache.hc.client5.http.cookie.BasicCookieStore;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.impl.classic.HttpClients;
-import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
 import org.junit.jupiter.api.*;
 import org.netpreserve.jwarc.WarcReader;
 import org.netpreserve.jwarc.WarcRequest;
@@ -51,14 +51,14 @@ class WarcRecorderFakeServerTest {
                os.write("<html><body>hello</body></html>".getBytes());
                os.flush();
                try {
-                    TimeUnit.SECONDS.sleep(1);
+                    TimeUnit.SECONDS.sleep(2);
                } catch (InterruptedException e) {
                    throw new RuntimeException(e);
                }
                os.write(":".getBytes());
                os.flush();
                try {
-                    TimeUnit.SECONDS.sleep(1);
+                    TimeUnit.SECONDS.sleep(2);
                } catch (InterruptedException e) {
                    throw new RuntimeException(e);
                }
@@ -89,24 +89,22 @@ class WarcRecorderFakeServerTest {
        fileNameWarc = Files.createTempFile("test", ".warc");
        fileNameParquet = Files.createTempFile("test", ".parquet");

-        client = new WarcRecorder(fileNameWarc, new BasicCookieStore());
+        client = new WarcRecorder(fileNameWarc);
    }

    @AfterEach
    public void tearDown() throws Exception {
+
        client.close();
        Files.delete(fileNameWarc);
    }

    @Test
    public void fetchFast() throws Exception {
-        client.fetch(httpClient,
-                ClassicRequestBuilder
-                        .get(new java.net.URI("http://localhost:14510/fast"))
-                        .addHeader("User-agent", "test.marginalia.nu")
-                        .addHeader("Accept-Encoding", "gzip")
-                        .build()
-        );
+        HttpGet request = new HttpGet("http://localhost:14510/fast");
+        request.addHeader("User-agent", "test.marginalia.nu");
+        request.addHeader("Accept-Encoding", "gzip");
+        client.fetch(httpClient, new DomainCookies(), request);

        Map<String, String> sampleData = new HashMap<>();
        try (var warcReader = new WarcReader(fileNameWarc)) {
@@ -127,11 +125,13 @@ class WarcRecorderFakeServerTest {
    public void fetchSlow() throws Exception {
        Instant start = Instant.now();

+        HttpGet request = new HttpGet("http://localhost:14510/slow");
+        request.addHeader("User-agent", "test.marginalia.nu");
+        request.addHeader("Accept-Encoding", "gzip");
+
        client.fetch(httpClient,
-                ClassicRequestBuilder.get(new java.net.URI("http://localhost:14510/slow"))
-                        .addHeader("User-agent", "test.marginalia.nu")
-                        .addHeader("Accept-Encoding", "gzip")
-                        .build(),
+                new DomainCookies(),
+                request,
                Duration.ofSeconds(1)
        );
        Instant end = Instant.now();
@@ -149,6 +149,8 @@ class WarcRecorderFakeServerTest {
            });
        }

+        System.out.println(
+                Files.readString(fileNameWarc));
        System.out.println(sampleData);

        // Timeout is set to 1 second, but the server will take 5 seconds to respond,
--- a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java
@@ -2,14 +2,14 @@ package nu.marginalia.crawl.retreival.fetcher;

 import nu.marginalia.UserAgent;
 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
+import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
-import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
+import nu.marginalia.slop.SlopCrawlDataRecord;
 import org.apache.hc.client5.http.classic.HttpClient;
-import org.apache.hc.client5.http.cookie.BasicCookieStore;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.impl.classic.HttpClients;
-import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -24,13 +24,14 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.security.NoSuchAlgorithmException;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;

 import static org.junit.jupiter.api.Assertions.assertEquals;

 class WarcRecorderTest {
    Path fileNameWarc;
-    Path fileNameParquet;
+    Path fileNameSlop;
    WarcRecorder client;

    HttpClient httpClient;
@@ -39,9 +40,9 @@ class WarcRecorderTest {
        httpClient = HttpClients.createDefault();

        fileNameWarc = Files.createTempFile("test", ".warc");
-        fileNameParquet = Files.createTempFile("test", ".parquet");
+        fileNameSlop = Files.createTempFile("test", ".slop.zip");

-        client = new WarcRecorder(fileNameWarc, new BasicCookieStore());
+        client = new WarcRecorder(fileNameWarc);
    }

    @AfterEach
@@ -52,12 +53,12 @@ class WarcRecorderTest {

    @Test
    void fetch() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
-        client.fetch(httpClient,
-                ClassicRequestBuilder.get(new java.net.URI("https://www.marginalia.nu/"))
-                        .addHeader("User-agent", "test.marginalia.nu")
-                        .addHeader("Accept-Encoding", "gzip")
-                        .build()
-        );
+
+        HttpGet request = new HttpGet("https://www.marginalia.nu/");
+        request.addHeader("User-agent", "test.marginalia.nu");
+        request.addHeader("Accept-Encoding", "gzip");
+
+        client.fetch(httpClient, new DomainCookies(), request);

        Map<String, String> sampleData = new HashMap<>();
        try (var warcReader = new WarcReader(fileNameWarc)) {
@@ -78,8 +79,9 @@ class WarcRecorderTest {
    @Test
    public void flagAsSkipped() throws IOException, URISyntaxException {

-        try (var recorder = new WarcRecorder(fileNameWarc, new BasicCookieStore())) {
+        try (var recorder = new WarcRecorder(fileNameWarc)) {
            recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
+                    new DomainCookies(),
                    "text/html",
                    200,
                    "<?doctype html><html><body>test</body></html>".getBytes(),
@@ -102,8 +104,9 @@ class WarcRecorderTest {
    @Test
    public void flagAsSkippedNullBody() throws IOException, URISyntaxException {

-        try (var recorder = new WarcRecorder(fileNameWarc, new BasicCookieStore())) {
+        try (var recorder = new WarcRecorder(fileNameWarc)) {
            recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
+                    new DomainCookies(),
                    "text/html",
                    200,
                    null,
@@ -114,8 +117,9 @@ class WarcRecorderTest {

    @Test
    public void testSaveImport() throws URISyntaxException, IOException {
-        try (var recorder = new WarcRecorder(fileNameWarc, new BasicCookieStore())) {
+        try (var recorder = new WarcRecorder(fileNameWarc)) {
            recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
+                    new DomainCookies(),
                    "text/html",
                    200,
                    "<?doctype html><html><body>test</body></html>".getBytes(),
@@ -138,35 +142,46 @@ class WarcRecorderTest {

    @Test
    public void testConvertToParquet() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
-        client.fetch(httpClient, ClassicRequestBuilder
-                .get(new java.net.URI("https://www.marginalia.nu/"))
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .build());
+        HttpGet request1 = new HttpGet("https://www.marginalia.nu/");
+        request1.addHeader("User-agent", "test.marginalia.nu");
+        request1.addHeader("Accept-Encoding", "gzip");

-        client.fetch(httpClient, ClassicRequestBuilder
-                .get(new java.net.URI("https://www.marginalia.nu/log/"))
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .build());
+        client.fetch(httpClient, new DomainCookies(), request1);

-        client.fetch(httpClient, ClassicRequestBuilder
-                .get(new java.net.URI("https://www.marginalia.nu/sanic.png"))
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .build());
+        HttpGet request2 = new HttpGet("https://www.marginalia.nu/log/");
+        request2.addHeader("User-agent", "test.marginalia.nu");
+        request2.addHeader("Accept-Encoding", "gzip");

-        CrawledDocumentParquetRecordFileWriter.convertWarc(
+        client.fetch(httpClient, new DomainCookies(), request2);
+
+        HttpGet request3 = new HttpGet("https://www.marginalia.nu/sanic.png");
+        request3.addHeader("User-agent", "test.marginalia.nu");
+        request3.addHeader("Accept-Encoding", "gzip");
+
+        client.fetch(httpClient, new DomainCookies(), request3);
+
+        HttpGet request4 = new HttpGet("https://downloads.marginalia.nu/test.pdf");
+        request4.addHeader("User-agent", "test.marginalia.nu");
+        request4.addHeader("Accept-Encoding", "gzip");
+
+        client.fetch(httpClient, new DomainCookies(), request4);
+
+        SlopCrawlDataRecord.convertWarc(
                "www.marginalia.nu",
                new UserAgent("test", "test"),
                fileNameWarc,
-                fileNameParquet);
+                fileNameSlop);

-        var urls = CrawledDocumentParquetRecordFileReader.stream(fileNameParquet).map(doc -> doc.url).toList();
-        assertEquals(2, urls.size());
+        List<String> urls;
+        try (var stream = SerializableCrawlDataStream.openDataStream(fileNameSlop)) {
+            urls = stream.docsAsList().stream().map(doc -> doc.url.toString()).toList();
+        }
+
+        assertEquals(3, urls.size());
        assertEquals("https://www.marginalia.nu/", urls.get(0));
        assertEquals("https://www.marginalia.nu/log/", urls.get(1));
        // sanic.png gets filtered out for its bad mime type
+        assertEquals("https://downloads.marginalia.nu/test.pdf", urls.get(2));

    }

--- a/code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java
@@ -1,6 +1,7 @@
 package nu.marginalia.crawling;

 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
@@ -31,7 +32,7 @@ class HttpFetcherTest {
    void fetchUTF8() throws Exception {
        var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
        try (var recorder = new WarcRecorder()) {
-            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, new DomainCookies(), new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
            if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
                System.out.println(bodyOk.contentType());
            }
@@ -49,7 +50,7 @@ class HttpFetcherTest {
        var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");

        try (var recorder = new WarcRecorder()) {
-            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, new DomainCookies(), new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
            if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
                System.out.println(bodyOk.contentType());
            }
--- a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
@@ -3,10 +3,7 @@ package nu.marginalia.crawling.retreival;
 import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.crawl.DomainStateDb;
-import nu.marginalia.crawl.fetcher.ContentTags;
-import nu.marginalia.crawl.fetcher.HttpFetcher;
-import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
-import nu.marginalia.crawl.fetcher.SitemapRetriever;
+import nu.marginalia.crawl.fetcher.*;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.CrawlDelayTimer;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
@@ -137,7 +134,7 @@ public class CrawlerMockFetcherTest {
        }

        @Override
-        public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, CrawlDelayTimer timer, ContentTags tags, ProbeType probeType) {
+        public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, DomainCookies cookies, CrawlDelayTimer timer, ContentTags tags, ProbeType probeType) {
            logger.info("Fetching {}", url);
            if (mockData.containsKey(url)) {
                byte[] bodyBytes = mockData.get(url).documentBodyBytes;
--- a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
@@ -16,7 +16,6 @@ import nu.marginalia.model.crawldata.CrawledDocument;
 import nu.marginalia.model.crawldata.CrawledDomain;
 import nu.marginalia.model.crawldata.SerializableCrawlData;
 import nu.marginalia.slop.SlopCrawlDataRecord;
-import org.apache.hc.client5.http.cookie.BasicCookieStore;
 import org.jetbrains.annotations.NotNull;
 import org.junit.jupiter.api.*;
 import org.netpreserve.jwarc.*;
@@ -118,6 +117,100 @@ class CrawlerRetreiverTest {
        }
    }

+
+    /**
+     * Crawls a PDF and a Markdown URL on www.marginalia.nu and verifies that both survive
+     * the WARC recording and the subsequent WARC-to-Slop conversion.
+     * <p>
+     * The test checks that every URL was requested, that the set of request targets equals
+     * the set of response targets in the WARC file, and that the Slop file contains a
+     * document with HTTP status 200 or 206 and a body longer than 32 bytes for each URL.
+     * <p>
+     * NOTE(review): presumably this performs live network requests via doCrawl -- confirm.
+     *
+     * @throws IOException if the temporary files cannot be created, read or deleted
+     */
+    @Test
+    public void verifyFileFormatSupport() throws IOException {
+        List<String> urls = List.of(
+                "https://www.marginalia.nu/junk/test.pdf",
+                "https://www.marginalia.nu/junk/test.md"
+        );
+
+        var specs = CrawlerMain.CrawlSpecRecord
+                .builder()
+                .crawlDepth(5)
+                .domain("www.marginalia.nu")
+                .urls(urls)
+                .build();
+        // Declared outside the try block so the finally clause can clean them up
+        Path tempFile = null;
+        Path slopFile = null;
+        try {
+            tempFile = Files.createTempFile("crawling-process", "warc");
+            slopFile = Files.createTempFile("crawling-process", ".slop.zip");
+
+            doCrawl(tempFile, specs);
+
+            // Targets of the request and response records found in the WARC file
+            Set<String> requests = new HashSet<>();
+            Set<String> responses = new HashSet<>();
+
+            // Inspect the WARC file
+            try (var reader = new WarcReader(tempFile)) {
+                reader.forEach(record -> {
+                    if (record instanceof WarcRequest req) {
+                        requests.add(req.target());
+                        System.out.println(req.type() + ":" + req.target());
+                    }
+                    else if (record instanceof WarcResponse rsp) {
+                        responses.add(rsp.target());
+                        try {
+                            System.out.println(rsp.type() + ":" + rsp.target() + ":" + rsp.http().contentType());
+                        } catch (IOException e) {
+                            // forEach takes a lambda that cannot throw checked exceptions
+                            throw new RuntimeException(e);
+                        }
+                    }
+                    else {
+                        System.out.println(record.type());
+                    }
+                });
+            }
+
+            for (var url : urls) {
+                assertTrue(requests.contains(url), "Should have requested " + url);
+            }
+            // Every request must have a matching response, and vice versa
+            assertEquals(requests, responses);
+
+            // Convert the WARC file to a Slop file
+            SlopCrawlDataRecord
+                    .convertWarc("www.marginalia.nu", new UserAgent("test.marginalia.nu", "test.marginalia.nu"), tempFile, slopFile);
+
+            CrawledDomain domain = null;
+            Map<String, CrawledDocument> documents = new HashMap<>();
+
+            // Extract the contents of the Slop file
+            try (var stream = SerializableCrawlDataStream.openDataStream(slopFile)) {
+                while (stream.hasNext()) {
+                    var doc = stream.next();
+                    if (doc instanceof CrawledDomain dr) {
+                        // Fails if the stream yields more than one domain record
+                        assertNull(domain);
+                        domain = dr;
+                    }
+                    else if (doc instanceof CrawledDocument dc) {
+                        System.out.println(dc.url + "\t" + dc.crawlerStatus + "\t" + dc.httpStatus);
+                        documents.put(dc.url, dc);
+                    }
+                }
+            } catch (Exception e) {
+                throw new RuntimeException(e);
+            }
+
+            for (var url : urls) {
+                // Verify we have the downloaded files in the Slop file
+                assertNotNull(domain);
+                var fetchedDoc = documents.get(url);
+                assertNotNull(fetchedDoc, "Should have a document for " + url);
+                assertEquals(url, fetchedDoc.url);
+                assertTrue(fetchedDoc.httpStatus == 200 || fetchedDoc.httpStatus == 206, "Should be 200 or 206 for " + url);
+                assertTrue(fetchedDoc.documentBodyBytes.length > 32, "Should have a body for " + url);
+            }
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        } finally {
+            // Remove the temporary files regardless of the test outcome
+            if (tempFile != null)
+                Files.deleteIfExists(tempFile);
+            if (slopFile != null)
+                Files.deleteIfExists(slopFile);
+        }
+    }
+
    @Test
    public void testWarcOutputNoKnownUrls() throws IOException {
        var specs = CrawlerMain.CrawlSpecRecord
@@ -180,7 +273,7 @@ class CrawlerRetreiverTest {
                new EdgeDomain("www.marginalia.nu"),
                List.of(), 100);
        var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
-                new WarcRecorder(tempFileWarc2, new BasicCookieStore())
+                new WarcRecorder(tempFileWarc2)
        );

        // truncate the size of the file to simulate a crash
@@ -456,7 +549,7 @@ class CrawlerRetreiverTest {
                List.of(), 100);

        var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
-                new WarcRecorder(tempFileWarc3, new BasicCookieStore())
+                new WarcRecorder(tempFileWarc3)
        );

        // truncate the size of the file to simulate a crash
@@ -507,7 +600,7 @@ class CrawlerRetreiverTest {
    }

    private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, CrawlDataReference reference) {
-        try (var recorder = new WarcRecorder(tempFileWarc2, new BasicCookieStore());
+        try (var recorder = new WarcRecorder(tempFileWarc2);
             var db = new DomainStateDb(tempFileDb)
        ) {
            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(), reference);
@@ -519,7 +612,7 @@ class CrawlerRetreiverTest {

    @NotNull
    private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlerMain.CrawlSpecRecord specs) {
-        try (var recorder = new WarcRecorder(tempFileWarc1, new BasicCookieStore());
+        try (var recorder = new WarcRecorder(tempFileWarc1);
             var db = new DomainStateDb(tempFileDb)
        ) {
            var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder);
--- a/code/processes/export-task-process/build.gradle
+++ b/code/processes/export-task-process/build.gradle
@@ -53,6 +53,8 @@ dependencies {
    implementation libs.commons.compress
    implementation libs.commons.codec
    implementation libs.jsoup
+    implementation libs.slop
+    implementation libs.jwarc



--- a/code/processes/export-task-process/java/nu/marginalia/extractor/SampleDataExporter.java
+++ b/code/processes/export-task-process/java/nu/marginalia/extractor/SampleDataExporter.java
@@ -1,13 +1,18 @@
 package nu.marginalia.extractor;

 import com.google.inject.Inject;
+import nu.marginalia.process.control.ProcessHeartbeat;
 import nu.marginalia.process.log.WorkLog;
 import nu.marginalia.process.log.WorkLogEntry;
+import nu.marginalia.slop.SlopCrawlDataRecord;
+import nu.marginalia.slop.SlopTablePacker;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorage;
 import nu.marginalia.storage.model.FileStorageId;
 import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
 import org.apache.commons.compress.utils.IOUtils;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang3.StringUtils;

 import java.io.IOException;
 import java.nio.file.Files;
@@ -16,18 +21,19 @@ import java.nio.file.StandardCopyOption;
 import java.nio.file.StandardOpenOption;
 import java.nio.file.attribute.PosixFilePermissions;
 import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
+import java.util.*;

 public class SampleDataExporter {
    private final FileStorageService storageService;
+    private final ProcessHeartbeat processHeartbeat;

    @Inject
-    public SampleDataExporter(FileStorageService storageService) {
+    public SampleDataExporter(FileStorageService storageService, ProcessHeartbeat processHeartbeat) {
        this.storageService = storageService;
+        this.processHeartbeat = processHeartbeat;
    }
-    public void export(FileStorageId crawlId, FileStorageId destId, int size, String name) throws SQLException, IOException {
+
+    public void export(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name) throws SQLException, IOException {
        FileStorage destStorage = storageService.getStorage(destId);
        Path inputDir = storageService.getStorage(crawlId).asPath();

@@ -54,11 +60,6 @@ public class SampleDataExporter {

        Path newCrawlerLogFile = Files.createTempFile(destStorage.asPath(), "crawler", ".log",
                PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
-        try (var bw = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
-            for (var item : entriesAll) {
-                bw.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
-            }
-        }

        Path newManifestJsonFile = Files.createTempFile(destStorage.asPath(), "manifest", ".json",
                PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
@@ -67,14 +68,38 @@ public class SampleDataExporter {
        var tmpTarFile = Files.createTempFile(destStorage.asPath(), "data", ".tar",
                PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));

-        try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))) {
-            for (var item : entriesAll) {
+        try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING));
+             var logWriter = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
+             var hb = processHeartbeat.createAdHocTaskHeartbeat("Generating Sample")
+        ) {
+            for (var item : hb.wrap("Scanning", entriesAll)) {
                Path crawlDataPath = inputDir.resolve(item.relPath());
                if (!Files.exists(crawlDataPath)) continue;

-                addFileToTar(stream, crawlDataPath, item.relPath());
+                if (StringUtils.isBlank(ctFilter)) {
+                    addFileToTar(stream, crawlDataPath, item.relPath());
+                    logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
+                }
+                else /* filter != null */ {
+                    Path filteredData = null;
+                    try {
+                        filteredData = filterEntries(crawlDataPath, ctFilter);
+                        addFileToTar(stream, filteredData, item.relPath());
+                        logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
+                    }
+                    catch (NoSuchElementException ex) {
+                        // Ignore
+                    }
+                    finally {
+                        if (filteredData != null) {
+                            Files.deleteIfExists(filteredData);
+                        }
+                    }
+                }
            }

+            logWriter.flush();
+
            addFileToTar(stream, newCrawlerLogFile, "crawler.log");
            addFileToTar(stream, newManifestJsonFile, "marginalia-manifest.json");
        }
@@ -86,6 +111,56 @@ public class SampleDataExporter {
        Files.move(tmpTarFile, destStorage.asPath().resolve("crawl-data.tar"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
    }

+    /**
+     * Writes a filtered copy of a crawl data file, keeping only the records whose content type
+     * (ignoring anything after the first {@code ;}) equals {@code contentTypeFilter}, plus the
+     * {@code x-marginalia/} metadata records.
+     * <p>
+     * Records are first written to a sibling {@code .filtered} directory, which is then packed
+     * into a sibling {@code .filtered.slop.zip} file. The directory is always removed before
+     * this method returns; the packed file is removed only if packing fails.
+     *
+     * @param crawlDataPath path of the crawl data file to read
+     * @param contentTypeFilter content type to keep, compared without parameters
+     * @return path of the packed, filtered file; it is not deleted by this method
+     * @throws IOException if reading, writing or packing the data fails
+     * @throws NoSuchElementException if no record matched {@code contentTypeFilter}
+     */
+    private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException, NoSuchElementException {
+        Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered");
+        Path tempFile = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered.slop.zip");
+
+        // We may have debris from a previous run, so let's clean it up
+        if (Files.isDirectory(tempDir)) {
+            FileUtils.deleteDirectory(tempDir.toFile());
+        }
+        Files.createDirectory(tempDir);
+
+        // Set once at least one record of the requested content type has been copied;
+        // metadata records alone do not count
+        boolean wroteEntry = false;
+
+        try (var writer = new SlopCrawlDataRecord.Writer(tempDir);
+             var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) {
+                 @Override
+                 public boolean filter(String url, int status, String contentType) {
+                     // substringBefore and Objects.equals tolerate a null content type,
+                     // startsWith does not, so guard it explicitly
+                     return Objects.equals(StringUtils.substringBefore(contentType, ';'), contentTypeFilter)
+                                || (contentType != null && contentType.startsWith("x-marginalia/")); // metadata records
+                 }
+             }
+        ) {
+
+            while (reader.hasRemaining()) {
+                var entry = reader.get();
+                writer.write(entry);
+
+                wroteEntry = wroteEntry || Objects.equals(StringUtils.substringBefore(entry.contentType(), ';'), contentTypeFilter);
+            }
+        }
+        catch (Exception ex) {
+            FileUtils.deleteDirectory(tempDir.toFile());
+            throw ex;
+        }
+
+        try {
+            if (!wroteEntry) {
+                throw new NoSuchElementException("No relevant entries");
+            }
+
+            SlopTablePacker.packToSlopZip(tempDir, tempFile);
+        }
+        catch (IOException | RuntimeException ex) {
+            // The caller never receives the path on failure, so a partially written
+            // (or stale) zip file has to be removed here
+            try {
+                Files.deleteIfExists(tempFile);
+            }
+            catch (IOException cleanupEx) {
+                ex.addSuppressed(cleanupEx);
+            }
+            throw ex;
+        }
+        finally {
+            FileUtils.deleteDirectory(tempDir.toFile());
+        }
+
+
+        return tempFile;
+    }
+
    private void addFileToTar(TarArchiveOutputStream outputStream, Path file, String fileName) throws IOException {
        var entry = outputStream.createArchiveEntry(file.toFile(), fileName);
        entry.setSize(Files.size(file));
--- a/code/processes/export-task-process/java/nu/marginalia/task/ExportTasksMain.java
+++ b/code/processes/export-task-process/java/nu/marginalia/task/ExportTasksMain.java
@@ -92,7 +92,7 @@ public class ExportTasksMain extends ProcessMainClass {
                    termFrequencyExporter.export(request.crawlId, request.destId);
                    break;
                case SAMPLE_DATA:
-                    sampleDataExporter.export(request.crawlId, request.destId, request.size, request.name);
+                    sampleDataExporter.export(request.crawlId, request.destId, request.size, request.ctFilter, request.name);
                    break;
                case ADJACENCIES:
                    websiteAdjacenciesCalculator.export();
--- a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlDataSet.java
+++ b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlDataSet.java
@@ -195,6 +195,7 @@ public class LiveCrawlDataSet implements AutoCloseable {
                            headers,
                            body,
                            false,
+                            -1,
                            "",
                            ""
                    ));
--- a/code/processes/process-mq-api/java/nu/marginalia/mqapi/tasks/ExportTaskRequest.java
+++ b/code/processes/process-mq-api/java/nu/marginalia/mqapi/tasks/ExportTaskRequest.java
@@ -16,6 +16,7 @@ public class ExportTaskRequest {
    public FileStorageId destId;
    public int size;
    public String name;
+    public String ctFilter;

    public ExportTaskRequest(Task task) {
        this.task = task;
@@ -42,12 +43,13 @@ public class ExportTaskRequest {
        return request;
    }

-    public static ExportTaskRequest sampleData(FileStorageId crawlId, FileStorageId destId, int size, String name) {
+    public static ExportTaskRequest sampleData(FileStorageId crawlId, FileStorageId destId, String ctFilter, int size, String name) {
        ExportTaskRequest request = new ExportTaskRequest(Task.SAMPLE_DATA);
        request.crawlId = crawlId;
        request.destId = destId;
        request.size = size;
        request.name = name;
+        request.ctFilter = ctFilter;
        return request;
    }

--- a/code/services-application/api-service/build.gradle
+++ b/code/services-application/api-service/build.gradle
@@ -3,7 +3,7 @@ plugins {

    id 'application'
    id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.4'
+    id 'com.google.cloud.tools.jib' version '3.4.5'
 }

 java {
--- a/Show More
+++ b/Show More