(pdf) Fix crash for some bold lines

(deploy) Redeploy all services.
(legacy-search) Soften domain limit constraints in URL deduplication
2025-10-05 21:22:39 +02:00 · 2025-05-18 13:05:05 +02:00 · 2025-05-17 13:11:51 +02:00 · 2025-05-17 00:04:27 +02:00 · 2025-05-17 00:00:42 +02:00 · 2025-05-17 00:00:28 +02:00
171 changed files with 7391 additions and 1376 deletions
--- a/build.gradle
+++ b/build.gradle
@@ -5,7 +5,7 @@ plugins {

    // This is a workaround for a bug in the Jib plugin that causes it to stall randomly
    // https://github.com/GoogleContainerTools/jib/issues/3347
-    id 'com.google.cloud.tools.jib' version '3.4.4' apply(false)
+    id 'com.google.cloud.tools.jib' version '3.4.5' apply(false)
 }

 group 'marginalia'
@@ -43,12 +43,11 @@ subprojects.forEach {it ->
 }

 ext {
-    jvmVersion=23
-    dockerImageBase='container-registry.oracle.com/graalvm/jdk:23'
+    jvmVersion = 24
+    dockerImageBase='container-registry.oracle.com/graalvm/jdk:24'
    dockerImageTag='latest'
    dockerImageRegistry='marginalia'
-    jibVersion = '3.4.4'
-
+    jibVersion = '3.4.5'
 }

 idea {
--- a/code/common/db/java/nu/marginalia/db/DbDomainQueries.java
+++ b/code/common/db/java/nu/marginalia/db/DbDomainQueries.java
@@ -22,6 +22,7 @@ public class DbDomainQueries {
    private static final Logger logger = LoggerFactory.getLogger(DbDomainQueries.class);

    private final Cache<EdgeDomain, Integer> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
+    private final Cache<EdgeDomain, DomainIdWithNode> domainWithNodeCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
    private final Cache<Integer, EdgeDomain> domainNameCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
    private final Cache<String, List<DomainWithNode>> siblingsCache = CacheBuilder.newBuilder().maximumSize(10_000).build();

@@ -59,6 +60,34 @@ public class DbDomainQueries {
        }
    }

+
+    /** Cached lookup of a domain's id and node affinity; throws NoSuchElementException if the lookup fails. */
+    public DomainIdWithNode getDomainIdWithNode(EdgeDomain domain) throws NoSuchElementException {
+        try {
+            return domainWithNodeCache.get(domain, () -> {
+                try (var connection = dataSource.getConnection();
+                     var stmt = connection.prepareStatement("SELECT ID, NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
+                    stmt.setString(1, domain.toString());
+                    var rsp = stmt.executeQuery();
+                    if (rsp.next()) {
+                        return new DomainIdWithNode(rsp.getInt(1), rsp.getInt(2));
+                    }
+                }
+                catch (SQLException ex) {
+                    throw new RuntimeException(ex);
+                }
+                throw new NoSuchElementException("No EC_DOMAIN row for " + domain);
+            });
+        }
+        catch (UncheckedExecutionException ex) {
+            // Keep the exception type callers expect, but retain the real cause (missing row or SQL failure)
+            throw new NoSuchElementException("Failed to look up " + domain, ex.getCause());
+        }
+        catch (ExecutionException ex) {
+            throw new RuntimeException(ex.getCause());
+        }
+    }
+
    public OptionalInt tryGetDomainId(EdgeDomain domain) {

        Integer maybeId = domainIdCache.getIfPresent(domain);
@@ -145,4 +174,6 @@ public class DbDomainQueries {
            return nodeAffinity > 0;
        }
    }
+
+    public record DomainIdWithNode (int domainId, int nodeAffinity) { }
 }
--- a/code/common/model/java/nu/marginalia/model/DocumentFormat.java
+++ b/code/common/model/java/nu/marginalia/model/DocumentFormat.java
@@ -0,0 +1,24 @@
+package nu.marginalia.model;
+
+/** Document formats, each carrying quality score tuning values and a short format name. */
+public enum DocumentFormat {
+    PLAIN(0, 1, "text"),
+    PDF(0, 1, "pdf"),
+    UNKNOWN(0, 1, "???"),
+    HTML123(0, 1, "html"),
+    HTML4(-0.1, 1.05, "html"),
+    XHTML(-0.1, 1.05, "html"),
+    HTML5(0.5, 1.1, "html");
+
+    /** Used to tune quality score */
+    public final double offset;
+    /** Used to tune quality score */
+    public final double scale;
+    public final String shortFormat;
+
+    DocumentFormat(double offset, double scale, String shortFormat) {
+        this.offset = offset;
+        this.scale = scale;
+        this.shortFormat = shortFormat;
+    }
+}
--- a/code/common/model/java/nu/marginalia/model/EdgeDomain.java
+++ b/code/common/model/java/nu/marginalia/model/EdgeDomain.java
@@ -14,7 +14,7 @@ public class EdgeDomain implements Serializable {
    @Nonnull
    public final String topDomain;

-    public EdgeDomain(String host) {
+    public EdgeDomain(@Nonnull String host) {
        Objects.requireNonNull(host, "domain name must not be null");

        host = host.toLowerCase();
@@ -61,6 +61,10 @@ public class EdgeDomain implements Serializable {
        this.topDomain = topDomain;
    }

+    public static String getTopDomain(String host) {
+        return new EdgeDomain(host).topDomain;
+    }
+
    private boolean looksLikeGovTld(String host) {
        if (host.length() < 8)
            return false;
@@ -108,32 +112,6 @@ public class EdgeDomain implements Serializable {
        return topDomain;
    }

-    public String getDomainKey() {
-        int cutPoint = topDomain.indexOf('.');
-        if (cutPoint < 0) {
-            return topDomain;
-        }
-        return topDomain.substring(0, cutPoint).toLowerCase();
-    }
-
-    public String getLongDomainKey() {
-        StringBuilder ret = new StringBuilder();
-
-        int cutPoint = topDomain.indexOf('.');
-        if (cutPoint < 0) {
-            ret.append(topDomain);
-        } else {
-            ret.append(topDomain, 0, cutPoint);
-        }
-
-        if (!subDomain.isEmpty() && !"www".equals(subDomain)) {
-            ret.append(":");
-            ret.append(subDomain);
-        }
-
-        return ret.toString().toLowerCase();
-    }
-
    /** If possible, try to provide an alias domain,
     * i.e. a domain name that is very likely to link to this one
     * */
--- a/code/common/model/java/nu/marginalia/model/EdgeUrl.java
+++ b/code/common/model/java/nu/marginalia/model/EdgeUrl.java
@@ -1,16 +1,14 @@
 package nu.marginalia.model;

 import nu.marginalia.util.QueryParams;
+import org.apache.commons.lang3.StringUtils;

 import javax.annotation.Nullable;
 import java.io.Serializable;
-import java.net.MalformedURLException;
-import java.net.URI;
-import java.net.URISyntaxException;
-import java.net.URL;
+import java.net.*;
+import java.nio.charset.StandardCharsets;
 import java.util.Objects;
 import java.util.Optional;
-import java.util.regex.Pattern;

 public class EdgeUrl implements Serializable {
    public final String proto;
@@ -33,7 +31,7 @@ public class EdgeUrl implements Serializable {

    private static URI parseURI(String url) throws URISyntaxException {
        try {
-            return new URI(urlencodeFixer(url));
+            return EdgeUriFactory.parseURILenient(url);
        } catch (URISyntaxException ex) {
            throw new URISyntaxException("Failed to parse URI '" + url + "'", ex.getMessage());
        }
@@ -51,58 +49,6 @@ public class EdgeUrl implements Serializable {
        }
    }

-    private static Pattern badCharPattern = Pattern.compile("[ \t\n\"<>\\[\\]()',|]");
-
-    /* Java's URI parser is a bit too strict in throwing exceptions when there's an error.
-
-       Here on the Internet, standards are like the picture on the box of the frozen pizza,
-       and what you get is more like what's on the inside, we try to patch things instead,
-       just give it a best-effort attempt att cleaning out broken or unnecessary constructions
-       like bad or missing URLEncoding
-     */
-    public static String urlencodeFixer(String url) throws URISyntaxException {
-        var s = new StringBuilder();
-        String goodChars = "&.?:/-;+$#";
-        String hexChars = "0123456789abcdefABCDEF";
-
-        int pathIdx = findPathIdx(url);
-        if (pathIdx < 0) { // url looks like http://marginalia.nu
-            return url + "/";
-        }
-        s.append(url, 0, pathIdx);
-
-        // We don't want the fragment, and multiple fragments breaks the Java URIParser for some reason
-        int end = url.indexOf("#");
-        if (end < 0) end = url.length();
-
-        for (int i = pathIdx; i < end; i++) {
-            int c = url.charAt(i);
-
-            if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
-                s.appendCodePoint(c);
-            } else if (c == '%' && i + 2 < end) {
-                int cn = url.charAt(i + 1);
-                int cnn = url.charAt(i + 2);
-                if (hexChars.indexOf(cn) >= 0 && hexChars.indexOf(cnn) >= 0) {
-                    s.appendCodePoint(c);
-                } else {
-                    s.append("%25");
-                }
-            } else {
-                s.append(String.format("%%%02X", c));
-            }
-        }
-
-        return s.toString();
-    }
-
-    private static int findPathIdx(String url) throws URISyntaxException {
-        int colonIdx = url.indexOf(':');
-        if (colonIdx < 0 || colonIdx + 2 >= url.length()) {
-            throw new URISyntaxException(url, "Lacking protocol");
-        }
-        return url.indexOf('/', colonIdx + 2);
-    }

    public EdgeUrl(URI URI) {
        try {
@@ -166,11 +112,32 @@ public class EdgeUrl implements Serializable {
            sb.append(port);
        }

+        EdgeUriFactory.urlencodePath(sb, path);
+
+        if (param != null) {
+            EdgeUriFactory.urlencodeQuery(sb, param);
+        }
+
+        return sb.toString();
+    }
+
+
+    public String toDisplayString() {
+        StringBuilder sb = new StringBuilder(256);
+
+        sb.append(proto);
+        sb.append("://");
+        sb.append(domain);
+
+        if (port != null) {
+            sb.append(':');
+            sb.append(port);
+        }
+
        sb.append(path);

        if (param != null) {
-            sb.append('?');
-            sb.append(param);
+            sb.append('?').append(param);
        }

        return sb.toString();
@@ -247,3 +214,244 @@ public class EdgeUrl implements Serializable {
    }

 }
+
+class EdgeUriFactory {
+    /** Parse a URL into a URI, dropping the fragment and urlencoding the path and query parts that need it. */
+    public static URI parseURILenient(String url) throws URISyntaxException {
+
+        if (shouldOmitUrlencodeRepair(url)) {
+            try {
+                return new URI(url);
+            }
+            catch (URISyntaxException ex) {
+                // ignore and run the lenient parser
+            }
+        }
+
+        var s = new StringBuilder(url.length()+8);
+
+        // We don't want the fragment, and multiple fragments breaks the Java URIParser for some reason
+        int end = url.indexOf("#");
+        if (end < 0) end = url.length();
+
+        // A '?' that only occurs inside the fragment does not start a query
+        int queryIdx = url.indexOf('?');
+        if (queryIdx < 0 || queryIdx > end) queryIdx = end;
+
+        int pathIdx = findPathIdx(url);
+        if (pathIdx < 0 || pathIdx > queryIdx) { // no path, e.g. http://marginalia.nu or http://marginalia.nu?q=a/b
+            s.append(url, 0, queryIdx).append('/');
+        } else {
+            s.append(url, 0, pathIdx);
+            urlencodePath(s, url.substring(pathIdx, queryIdx));
+        }
+        if (queryIdx < end) urlencodeQuery(s, url.substring(queryIdx + 1, end));
+        return new URI(s.toString());
+    }
+
+    /** Break apart the path element of an URI into its components, and then
+     * urlencode any component that needs it, and recombine it into a single
+     * path element again.
+     */
+    public static void urlencodePath(StringBuilder sb, String path) {
+        if (path == null || path.isEmpty()) {
+            return;
+        }
+
+        String[] pathParts = StringUtils.split(path, '/');
+        if (pathParts.length == 0) {
+            sb.append('/');
+            return;
+        }
+
+        boolean shouldUrlEncode = false;
+        for (String pathPart : pathParts) {
+            if (pathPart.isEmpty()) continue;
+
+            if (needsUrlEncode(pathPart)) {
+                shouldUrlEncode = true;
+                break;
+            }
+        }
+
+        for (String pathPart : pathParts) {
+            if (pathPart.isEmpty()) continue;
+
+            if (shouldUrlEncode) {
+                sb.append('/');
+                sb.append(URLEncoder.encode(pathPart, StandardCharsets.UTF_8).replace("+", "%20"));
+            } else {
+                sb.append('/');
+                sb.append(pathPart);
+            }
+        }
+
+        if (path.endsWith("/")) {
+            sb.append('/');
+        }
+
+    }
+
+    /** Break apart the query element of a URI into its components, and then
+     * urlencode any component that needs it, and recombine it into a single
+     * query element again.
+     */
+    public static void urlencodeQuery(StringBuilder sb, String param) {
+        if (param == null || param.isEmpty()) {
+            return;
+        }
+
+        String[] queryParts = StringUtils.split(param, '&');
+
+        boolean shouldUrlEncode = false;
+        for (String queryPart : queryParts) {
+            if (queryPart.isEmpty()) continue;
+
+            if (needsUrlEncode(queryPart)) {
+                shouldUrlEncode = true;
+                break;
+            }
+        }
+
+        boolean first = true;
+        for (String queryPart : queryParts) {
+            if (queryPart.isEmpty()) continue;
+
+            if (first) {
+                sb.append('?');
+                first = false;
+            } else {
+                sb.append('&');
+            }
+
+            if (shouldUrlEncode) {
+                int idx = queryPart.indexOf('=');
+                if (idx < 0) {
+                    sb.append(URLEncoder.encode(queryPart, StandardCharsets.UTF_8));
+                } else {
+                    sb.append(URLEncoder.encode(queryPart.substring(0, idx), StandardCharsets.UTF_8));
+                    sb.append('=');
+                    sb.append(URLEncoder.encode(queryPart.substring(idx + 1), StandardCharsets.UTF_8));
+                }
+            } else {
+                sb.append(queryPart);
+            }
+        }
+    }
+
+    /** Test if the url element needs URL encoding.
+     * <p></p>
+     * Note we may have been given an already encoded path element, so '+'
+     * and well-formed %XX escapes (two hex digits) count as good characters
+     */
+    static boolean needsUrlEncode(String urlElement) {
+        for (int i = 0; i < urlElement.length(); i++) {
+            char c = urlElement.charAt(i);
+
+            if (isUrlSafe(c)) continue;
+            if ("+".indexOf(c) >= 0) continue;
+            if (c == '%' && i + 2 < urlElement.length()) {
+                char c1 = urlElement.charAt(i + 1);
+                char c2 = urlElement.charAt(i + 2);
+                if (isHexDigit(c1) && isHexDigit(c2)) {
+                    i += 2;
+                    continue;
+                }
+            }
+
+            return true;
+        }
+
+        return false;
+    }
+
+
+    static boolean isUrlSafe(int c) {
+        if (c >= 'a' && c <= 'z') return true;
+        if (c >= 'A' && c <= 'Z') return true;
+        if (c >= '0' && c <= '9') return true;
+        if (c == '-' || c == '_' || c == '.' || c == '~') return true;
+
+        return false;
+    }
+
+    /** Test if the URL is a valid URL that does not need to be
+     * urlencoded.
+     * <p></p>
+     * This is a very simple heuristic test that does not guarantee
+     * that the URL is valid, but it will identify cases where we
+     * are fairly certain that the URL does not need encoding,
+     * so we can skip a bunch of allocations and string operations
+     * that would otherwise be needed to fix the URL.
+     * <p></p>
+     * It only answers true when the repair would leave the URL unchanged:
+     * URLs with a fragment, without a path, or with empty path or query
+     * parts are rewritten by the repair and are therefore rejected here.
+     */
+    static boolean shouldOmitUrlencodeRepair(String url) {
+        final int len = url.length();
+        if (len < 2) return false; // too short for the "//" probes below
+        int idx = 0;
+
+        // Validate the scheme
+        while (idx < len - 2) {
+            char c = url.charAt(idx++);
+            if (c == ':') break;
+            if (!isAsciiAlphabetic(c)) return false;
+        }
+        if (url.charAt(idx++) != '/') return false;
+        if (url.charAt(idx++) != '/') return false;
+
+        // Validate the authority, which must be terminated by the slash that starts the path
+        char prev = 0;
+        while (idx < len && prev != '/') {
+            prev = url.charAt(idx++);
+            if (prev == '/' || prev == ':' || prev == '@') continue;
+            if (!isUrlSafe(prev)) return false;
+        }
+        if (prev != '/') return false;
+
+        // Validate the path, up to and including any '?'
+        while (idx < len && prev != '?') {
+            char c = url.charAt(idx++);
+            if (c == '/' && prev == '/') return false;
+            if (c != '/' && c != '?' && !isUrlSafe(c)) return false;
+            prev = c;
+        }
+
+        // Validate the query
+        while (idx < len) {
+            char c = url.charAt(idx++);
+            if (c == '&' && (prev == '?' || prev == '&')) return false;
+            if (c != '&' && c != '=' && !isUrlSafe(c)) return false;
+            prev = c;
+        }
+
+        return prev != '?' && prev != '&';
+    }
+
+
+    /** Test if c is an ASCII letter, a-z or A-Z */
+    private static boolean isAsciiAlphabetic(int c) {
+        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+    }
+
+    private static boolean isHexDigit(int c) {
+        return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
+    }
+
+    /** Find the index of the path element in a URL, or -1 if it has none.
+     * <p></p>
+     * The path starts at the first slash found from three characters past the first
+     * colon, i.e. past the "//" of "scheme://"; throws if the colon or what follows is missing.
+     */
+    private static int findPathIdx(String url) throws URISyntaxException {
+        int colonIdx = url.indexOf(':');
+        if (colonIdx < 0 || colonIdx + 3 >= url.length()) {
+            throw new URISyntaxException(url, "Lacking scheme");
+        }
+        return url.indexOf('/', colonIdx + 3);
+    }
+
+
+}
--- a/code/common/model/java/nu/marginalia/model/crawl/HtmlFeature.java
+++ b/code/common/model/java/nu/marginalia/model/crawl/HtmlFeature.java
@@ -28,6 +28,8 @@ public enum HtmlFeature {

    GA_SPAM("special:gaspam"),

+    PDF("format:pdf"),
+
    /** For fingerprinting and ranking */
    OPENGRAPH("special:opengraph"),
    OPENGRAPH_IMAGE("special:opengraph:image"),
--- a/code/common/model/java/nu/marginalia/model/html/HtmlStandard.java
+++ b/code/common/model/java/nu/marginalia/model/html/HtmlStandard.java
@@ -1,22 +0,0 @@
-package nu.marginalia.model.html;
-
-// This class really doesn't belong anywhere, but will squat here for now
-public enum HtmlStandard {
-    PLAIN(0, 1),
-    UNKNOWN(0, 1),
-    HTML123(0, 1),
-    HTML4(-0.1, 1.05),
-    XHTML(-0.1, 1.05),
-    HTML5(0.5, 1.1);
-
-    /** Used to tune quality score */
-    public final double offset;
-    /** Used to tune quality score */
-    public final double scale;
-
-    HtmlStandard(double offset, double scale) {
-        this.offset = offset;
-        this.scale = scale;
-    }
-
-}
--- a/code/common/model/java/nu/marginalia/model/idx/DocumentFlags.java
+++ b/code/common/model/java/nu/marginalia/model/idx/DocumentFlags.java
@@ -9,7 +9,7 @@ public enum DocumentFlags {
    GeneratorForum,
    GeneratorWiki,
    Sideloaded,
-    Unused7,
+    PdfFile,
    Unused8,
    ;

--- a/code/common/model/test/nu/marginalia/model/EdgeDomainTest.java
+++ b/code/common/model/test/nu/marginalia/model/EdgeDomainTest.java
@@ -8,14 +8,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;

 class EdgeDomainTest {

-    @Test
-    public void testSkepdic() throws URISyntaxException {
-        var domain = new EdgeUrl("http://www.skepdic.com/astrology.html");
-        assertEquals("skepdic", domain.getDomain().getDomainKey());
-        var domain2 = new EdgeUrl("http://skepdic.com/astrology.html");
-        assertEquals("skepdic", domain2.getDomain().getDomainKey());
-    }
-
    @Test
    public void testHkDomain() throws URISyntaxException {
        var domain = new EdgeUrl("http://l7072i3.l7c.net");
--- a/code/common/model/test/nu/marginalia/model/EdgeUrlTest.java
+++ b/code/common/model/test/nu/marginalia/model/EdgeUrlTest.java
@@ -1,6 +1,6 @@
 package nu.marginalia.model;

-import nu.marginalia.model.EdgeUrl;
+import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;

 import java.net.URISyntaxException;
@@ -21,25 +21,70 @@ class EdgeUrlTest {
                new EdgeUrl("https://memex.marginalia.nu/#here")
        );
    }
+
    @Test
-    public void testParam() throws URISyntaxException {
-        System.out.println(new EdgeUrl("https://memex.marginalia.nu/index.php?id=1").toString());
-        System.out.println(new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
-    }
-    @Test
-    void urlencodeFixer() throws URISyntaxException {
-        System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/#heredoc"));
-        System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign"));
-        System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%22-sign"));
-        System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/\n \"huh\""));
+    void testUriFromString() throws URISyntaxException {
+        // We test these URLs several times as we perform URLEncode-fixing both when parsing the URL and when
+        // converting it back to a string, we want to ensure there is no changes along the way.
+
+        Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/").getPath());
+        Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/").toString());
+        Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/").toString());
+
+        Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").getPath());
+        Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").toString());
+        Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/#heredoc").toString());
+
+        Assertions.assertEquals("/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").getPath());
+        Assertions.assertEquals("https://www.example.com/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").toString());
+        Assertions.assertEquals("https://www.example.com/trailingslash/", new EdgeUrl("https://www.example.com/trailingslash/").toString());
+
+        Assertions.assertEquals("/%-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").getPath());
+        Assertions.assertEquals("https://www.example.com/%25-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").toString());
+        Assertions.assertEquals("https://www.example.com/%25-sign", new EdgeUrl("https://www.example.com/%-sign").toString());
+
+        Assertions.assertEquals("/%-sign/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com//%-sign/\"-sign").getPath());
+        Assertions.assertEquals("https://www.example.com/%25-sign/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com//%-sign/\"-sign").toString());
+        Assertions.assertEquals("https://www.example.com/%25-sign/%22-sign", new EdgeUrl("https://www.example.com//%-sign/\"-sign").toString());
+
+        Assertions.assertEquals("/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").getPath());
+        Assertions.assertEquals("https://www.example.com/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").toString());
+        Assertions.assertEquals("https://www.example.com/%22-sign", new EdgeUrl("https://www.example.com/%22-sign").toString());
+
+        Assertions.assertEquals("/\n \"huh\"", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").getPath());
+        Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").toString());
+        Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", new EdgeUrl("https://www.example.com/\n \"huh\"").toString());
+
+        Assertions.assertEquals("/wiki/Sámi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").getPath());
+        Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").toString());
+        Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", new EdgeUrl("https://en.wikipedia.org/wiki/Sámi").toString());
+
+        Assertions.assertEquals("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k", new EdgeUrl("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k").toString());
    }

    @Test
    void testParms() throws URISyntaxException {
-        System.out.println(new EdgeUrl("https://search.marginalia.nu/?id=123"));
-        System.out.println(new EdgeUrl("https://search.marginalia.nu/?t=123"));
-        System.out.println(new EdgeUrl("https://search.marginalia.nu/?v=123"));
-        System.out.println(new EdgeUrl("https://search.marginalia.nu/?m=123"));
-        System.out.println(new EdgeUrl("https://search.marginalia.nu/?follow=123"));
+        Assertions.assertEquals("id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").param);
+        Assertions.assertEquals("https://search.marginalia.nu/?id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").toString());
+
+        Assertions.assertEquals("t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").param);
+        Assertions.assertEquals("https://search.marginalia.nu/?t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").toString());
+
+        Assertions.assertEquals("v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").param);
+        Assertions.assertEquals("https://search.marginalia.nu/?v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").toString());
+
+        Assertions.assertEquals("id=1", new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").param);
+        Assertions.assertEquals("https://memex.marginalia.nu/showthread.php?id=1",
+                new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
+
+
+        Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").param);
+        Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").toString());
+
+        Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").param);
+        Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").toString());
+
+        Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?m=123").param);
+        Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?follow=123").param);
    }
 }
--- a/code/common/service/java/nu/marginalia/process/control/ProcessAdHocTaskHeartbeatImpl.java
+++ b/code/common/service/java/nu/marginalia/process/control/ProcessAdHocTaskHeartbeatImpl.java
@@ -59,16 +59,13 @@ public class ProcessAdHocTaskHeartbeatImpl implements AutoCloseable, ProcessAdHo
     */
    @Override
    public void progress(String step, int stepProgress, int stepCount) {
+        int lastProgress = this.progress;
        this.step = step;
-
-
-        // off by one since we calculate the progress based on the number of steps,
-        // and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
-        // final progress being 80% and not 100%)
-
        this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);

-        logger.info("ProcessTask {} progress: {}%", taskBase, progress);
+        if (this.progress / 10 != lastProgress / 10) {
+            logger.info("ProcessTask {} progress: {}%", taskBase, progress);
+        }
    }

    /** Wrap a collection to provide heartbeat progress updates as it's iterated through */
--- a/code/common/service/java/nu/marginalia/service/control/ServiceAdHocTaskHeartbeatImpl.java
+++ b/code/common/service/java/nu/marginalia/service/control/ServiceAdHocTaskHeartbeatImpl.java
@@ -57,16 +57,13 @@ public class ServiceAdHocTaskHeartbeatImpl implements AutoCloseable, ServiceAdHo
     */
    @Override
    public void progress(String step, int stepProgress, int stepCount) {
+        int lastProgress = this.progress;
        this.step = step;
-
-
-        // off by one since we calculate the progress based on the number of steps,
-        // and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
-        // final progress being 80% and not 100%)
-
        this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);

-        logger.info("ServiceTask {} progress: {}%", taskBase, progress);
+        if (this.progress / 10 != lastProgress / 10) {
+            logger.info("ServiceTask {} progress: {}%", taskBase, progress);
+        }
    }

    public void shutDown() {
--- a/code/common/service/java/nu/marginalia/service/module/ServiceConfigurationModule.java
+++ b/code/common/service/java/nu/marginalia/service/module/ServiceConfigurationModule.java
@@ -121,6 +121,7 @@ public class ServiceConfigurationModule extends AbstractModule {

        while (nets.hasMoreElements()) {
            NetworkInterface netif = nets.nextElement();
+            logger.info("Considering network interface {}:  Up? {},  Loopback? {}", netif.getDisplayName(), netif.isUp(), netif.isLoopback());
            if (!netif.isUp() || netif.isLoopback()) {
                continue;
            }
@@ -128,6 +129,7 @@ public class ServiceConfigurationModule extends AbstractModule {
            Enumeration<InetAddress> inetAddresses = netif.getInetAddresses();
            while (inetAddresses.hasMoreElements()) {
                InetAddress addr = inetAddresses.nextElement();
+                logger.info("Considering address {}: SiteLocal? {}, Loopback? {}", addr.getHostAddress(), addr.isSiteLocalAddress(), addr.isLoopbackAddress());
                if (addr.isSiteLocalAddress() && !addr.isLoopbackAddress()) {
                    return addr.getHostAddress();
                }
--- a/code/common/service/java/nu/marginalia/service/server/JoobyService.java
+++ b/code/common/service/java/nu/marginalia/service/server/JoobyService.java
@@ -122,6 +122,11 @@ public class JoobyService {
        // single digit percentage difference since HTML already compresses very well with level = 1.
        options.setCompressionLevel(1);

+        // Set a cap on the number of worker threads, as Jooby's default value does not seem to consider
+        // multi-tenant servers with high thread counts, and spins up an exorbitant number of threads in that
+        // scenario
+        options.setWorkerThreads(Math.min(128, options.getWorkerThreads()));
+

        jooby.setServerOptions(options);

--- a/code/common/service/java/nu/marginalia/service/server/MetricsServer.java
+++ b/code/common/service/java/nu/marginalia/service/server/MetricsServer.java
@@ -13,7 +13,7 @@ import java.net.InetSocketAddress;

 public class MetricsServer {

-    private static Logger logger = LoggerFactory.getLogger(MetricsServer.class);
+    private static final Logger logger = LoggerFactory.getLogger(MetricsServer.class);

    @Inject
    public MetricsServer(ServiceConfiguration configuration) {
@@ -30,6 +30,8 @@ public class MetricsServer {

            context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");

+            logger.info("MetricsServer listening on {}:{}", configuration.bindAddress(), configuration.metricsPort());
+
            server.start();
        }
        catch (Exception|NoSuchMethodError ex) {
--- a/code/common/service/java/nu/marginalia/service/server/RateLimiter.java
+++ b/code/common/service/java/nu/marginalia/service/server/RateLimiter.java
@@ -35,21 +35,8 @@ public class RateLimiter {
    }


-    public static RateLimiter forExpensiveRequest() {
-        return new RateLimiter(5, 10);
-    }
-
    public static RateLimiter custom(int perMinute) {
-        return new RateLimiter(perMinute, 60);
-    }
-
-    public static RateLimiter forSpamBots() {
-        return new RateLimiter(120, 3600);
-    }
-
-
-    public static RateLimiter forLogin() {
-        return new RateLimiter(3, 15);
+        return new RateLimiter(4 * perMinute, perMinute);
    }

    private void cleanIdleBuckets() {
@@ -62,7 +49,7 @@ public class RateLimiter {
    }

    private Bucket createBucket() {
-        var refill = Refill.greedy(1, Duration.ofSeconds(refillRate));
+        var refill = Refill.greedy(refillRate, Duration.ofSeconds(60));
        var bw = Bandwidth.classic(capacity, refill);
        return Bucket.builder().addLimit(bw).build();
    }
--- a/code/common/service/resources/log4j2-json.xml
+++ b/code/common/service/resources/log4j2-json.xml
@@ -3,8 +3,16 @@
        <Console name="Console" target="SYSTEM_OUT">
            <PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1}  --  %msg%n"/>
            <Filters>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ProcessConsole" target="SYSTEM_OUT">
+            <PatternLayout pattern="%style{P}{FG_Cyan} %msg%n"/>
+            <Filters>
+                <MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
            </Filters>
        </Console>
        <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
@@ -13,15 +21,31 @@
            <Filters>
                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
            </Filters>
            <SizeBasedTriggeringPolicy size="10MB" />
        </RollingFile>
+        <!-- Audit log for CRAWLER-marked events only; needs its own unique appender name so it does not replace LogToFile -->
+        <RollingFile name="CrawlerAuditLog" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
+                     ignoreExceptions="false">
+            <PatternLayout>
+                <Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
+            </PatternLayout>
+            <SizeBasedTriggeringPolicy size="100MB" />
+            <Filters>
+                <MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
+            </Filters>
+        </RollingFile>
    </Appenders>
    <Loggers>
        <Logger name="org.apache.zookeeper" level="WARN" />
-
+        <Logger name="org.apache.pdfbox" level="ERROR" />
+        <Logger name="org.apache.fontbox.ttf" level="ERROR" />
        <Root level="info">
            <AppenderRef ref="Console"/>
+            <AppenderRef ref="ProcessConsole"/>
            <AppenderRef ref="LogToFile"/>
+            <AppenderRef ref="CrawlerAuditLog"/>
        </Root>
    </Loggers>
--- a/code/common/service/resources/log4j2-prod.xml
+++ b/code/common/service/resources/log4j2-prod.xml
@@ -1,10 +1,49 @@
 <Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
    <Appenders>
-        <Console name="Console" target="SYSTEM_OUT">
-            <PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1}  --  %msg%n"/>
+        <Console name="ConsoleInfo" target="SYSTEM_OUT">
+            <PatternLayout pattern="- %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
            <Filters>
+                <LevelMatchFilter level="INFO" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleWarn" target="SYSTEM_OUT">
+            <PatternLayout pattern="⚠ %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="WARN" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleError" target="SYSTEM_OUT">
+            <PatternLayout pattern="🔥 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="ERROR" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleFatal" target="SYSTEM_OUT">
+            <PatternLayout pattern="💀 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="FATAL" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ProcessConsole" target="SYSTEM_OUT">
+            <PatternLayout pattern="%style{%msg%n}{FG_Cyan}"/>
+            <Filters>
+                <MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
            </Filters>
        </Console>
        <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
@@ -17,14 +56,32 @@
                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </RollingFile>
+        <!-- Audit log for CRAWLER-marked events only; needs its own unique appender name so it does not replace LogToFile -->
+        <RollingFile name="CrawlerAuditLog" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
+                     ignoreExceptions="false">
+            <PatternLayout>
+                <Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
+            </PatternLayout>
+            <SizeBasedTriggeringPolicy size="100MB" />
+            <Filters>
+                <MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
            </Filters>
        </RollingFile>
    </Appenders>
    <Loggers>
        <Logger name="org.apache.zookeeper" level="WARN" />
-
+        <Logger name="org.apache.pdfbox" level="ERROR" />
+        <Logger name="org.apache.fontbox.ttf" level="ERROR" />
        <Root level="info">
-            <AppenderRef ref="Console"/>
+            <AppenderRef ref="ConsoleInfo"/>
+            <AppenderRef ref="ConsoleWarn"/>
+            <AppenderRef ref="ConsoleError"/>
+            <AppenderRef ref="ConsoleFatal"/>
+            <AppenderRef ref="ProcessConsole"/>
            <AppenderRef ref="LogToFile"/>
+            <AppenderRef ref="CrawlerAuditLog"/>
        </Root>
    </Loggers>
--- a/code/common/service/resources/log4j2-test.xml
+++ b/code/common/service/resources/log4j2-test.xml
@@ -1,15 +1,50 @@
 <Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
    <Appenders>
-        <Console name="Console" target="SYSTEM_OUT">
-            <PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1}  --  %msg%n"/>
+        <Console name="ConsoleInfo" target="SYSTEM_OUT">
+            <PatternLayout pattern="- %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="INFO" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleWarn" target="SYSTEM_OUT">
+            <PatternLayout pattern="⚠ %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="WARN" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleError" target="SYSTEM_OUT">
+            <PatternLayout pattern="🔥 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="ERROR" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleFatal" target="SYSTEM_OUT">
+            <PatternLayout pattern="💀 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="FATAL" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ProcessConsole" target="SYSTEM_OUT">
+            <PatternLayout pattern="%style{%msg%n}{FG_Cyan}"/>
+            <Filters>
+                <MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
+            </Filters>
        </Console>
    </Appenders>
    <Loggers>
        <Logger name="org.apache.zookeeper" level="WARN" />
-
+        <Logger name="org.apache.pdfbox" level="ERROR" />
+        <Logger name="org.apache.fontbox.ttf" level="ERROR" />
        <Root level="info">
-            <AppenderRef ref="Console"/>
-            <AppenderRef ref="LogToFile"/>
+            <AppenderRef ref="ConsoleInfo"/>
+            <AppenderRef ref="ConsoleWarn"/>
+            <AppenderRef ref="ConsoleError"/>
+            <AppenderRef ref="ConsoleFatal"/>
+            <AppenderRef ref="ProcessConsole"/>
        </Root>
    </Loggers>
 </Configuration>
--- a/code/common/service/test/nu/marginalia/service/discovery/ZkServiceRegistryTest.java
+++ b/code/common/service/test/nu/marginalia/service/discovery/ZkServiceRegistryTest.java
@@ -25,7 +25,7 @@ import static org.mockito.Mockito.when;
 class ZkServiceRegistryTest {
    private static final int ZOOKEEPER_PORT = 2181;
    private static final GenericContainer<?> zookeeper =
-            new GenericContainer<>("zookeeper:3.8.0")
+            new GenericContainer<>("zookeeper:3.8")
                    .withExposedPorts(ZOOKEEPER_PORT);

    List<ZkServiceRegistry> registries = new ArrayList<>();
--- a/code/execution/api/java/nu/marginalia/executor/client/ExecutorExportClient.java
+++ b/code/execution/api/java/nu/marginalia/executor/client/ExecutorExportClient.java
@@ -48,12 +48,13 @@ public class ExecutorExportClient {
        return msgId;
    }

-    public void exportSampleData(int node, FileStorageId fid, int size, String name) {
+    public void exportSampleData(int node, FileStorageId fid, int size, String ctFilter, String name) {
        channelPool.call(ExecutorExportApiBlockingStub::exportSampleData)
                .forNode(node)
                .run(RpcExportSampleData.newBuilder()
                        .setFileStorageId(fid.id())
                        .setSize(size)
+                        .setCtFilter(ctFilter)
                        .setName(name)
                        .build());
    }
--- a/code/execution/api/src/main/protobuf/executor-api.proto
+++ b/code/execution/api/src/main/protobuf/executor-api.proto
@@ -100,6 +100,7 @@ message RpcExportSampleData {
  int64 fileStorageId = 1;
  int32 size = 2;
  string name = 3;
+  string ctFilter = 4;
 }
 message RpcDownloadSampleData {
  string sampleSet = 1;
--- a/code/execution/java/nu/marginalia/actor/task/DownloadSampleActor.java
+++ b/code/execution/java/nu/marginalia/actor/task/DownloadSampleActor.java
@@ -8,6 +8,7 @@ import nu.marginalia.actor.state.ActorResumeBehavior;
 import nu.marginalia.actor.state.ActorStep;
 import nu.marginalia.actor.state.Resume;
 import nu.marginalia.service.control.ServiceEventLog;
+import nu.marginalia.service.control.ServiceHeartbeat;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorage;
 import nu.marginalia.storage.model.FileStorageId;
@@ -19,6 +20,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.io.*;
+import java.net.HttpURLConnection;
 import java.net.MalformedURLException;
 import java.net.URI;
 import java.net.URL;
@@ -32,6 +34,7 @@ public class DownloadSampleActor extends RecordActorPrototype {

    private final FileStorageService storageService;
    private final ServiceEventLog eventLog;
+    private final ServiceHeartbeat heartbeat;
    private final Logger logger = LoggerFactory.getLogger(getClass());

    @Resume(behavior = ActorResumeBehavior.ERROR)
@@ -66,15 +69,48 @@ public class DownloadSampleActor extends RecordActorPrototype {

                Files.deleteIfExists(Path.of(tarFileName));

-                try (var is = new BufferedInputStream(new URI(downloadURI).toURL().openStream());
-                     var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
-                    is.transferTo(os);
+                HttpURLConnection urlConnection = (HttpURLConnection) new URI(downloadURI).toURL().openConnection();
+
+                try (var hb = heartbeat.createServiceAdHocTaskHeartbeat("Downloading sample")) {
+                    // -1 when the server does not announce a content length
+                    long size = urlConnection.getContentLengthLong();
+                    byte[] buffer = new byte[8192];
+
+                    try (var is = new BufferedInputStream(urlConnection.getInputStream());
+                         var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
+                        long copiedSize = 0;
+
+                        // With a known length, copy exactly 'size' bytes; otherwise copy until end of stream
+                        while (size < 0 || copiedSize < size) {
+                            int read = is.read(buffer);
+
+                            if (read < 0) {
+                                if (size < 0)
+                                    break; // Length was unknown, so end of stream is the normal exit
+
+                                // We've been promised a file of length 'size'
+                                throw new IOException("Unexpected end of stream");
+                            }
+
+                            os.write(buffer, 0, read);
+                            copiedSize += read;
+
+                            // Update progress bar (only possible when the total size is known)
+                            if (size > 0) {
+                                hb.progress(String.format("%d MB", copiedSize / 1024 / 1024), (int) (copiedSize / 1024), (int) (size / 1024));
+                            }
+                        }
+                    }
+
                }
                catch (Exception ex) {
                    eventLog.logEvent(DownloadSampleActor.class, "Error downloading sample");
                    logger.error("Error downloading sample", ex);
                    yield new Error();
                }
+                finally {
+                    urlConnection.disconnect();
+                }

                eventLog.logEvent(DownloadSampleActor.class, "Download complete");
                yield new Extract(fileStorageId, tarFileName);
@@ -170,11 +197,12 @@ public class DownloadSampleActor extends RecordActorPrototype {
    @Inject
    public DownloadSampleActor(Gson gson,
                               FileStorageService storageService,
-                               ServiceEventLog eventLog)
+                               ServiceEventLog eventLog, ServiceHeartbeat heartbeat)
    {
        super(gson);
        this.storageService = storageService;
        this.eventLog = eventLog;
+        this.heartbeat = heartbeat;
    }

 }
--- a/code/execution/java/nu/marginalia/actor/task/ExportSampleDataActor.java
+++ b/code/execution/java/nu/marginalia/actor/task/ExportSampleDataActor.java
@@ -26,32 +26,32 @@ public class ExportSampleDataActor extends RecordActorPrototype {
    private final MqOutbox exportTasksOutbox;
    private final Logger logger = LoggerFactory.getLogger(getClass());

-    public record Export(FileStorageId crawlId, int size, String name) implements ActorStep {}
-    public record Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) implements ActorStep {
-        public Run(FileStorageId crawlId, FileStorageId destId, int size, String name) {
-            this(crawlId, destId, size, name, -1);
+    public record Export(FileStorageId crawlId, int size, String ctFilter, String name) implements ActorStep {}
+    public record Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name, long msgId) implements ActorStep {
+        public Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name) {
+            this(crawlId, destId, size, ctFilter, name, -1); // msgId < 0: the export request has not been sent yet
        }
    }

    @Override
    public ActorStep transition(ActorStep self) throws Exception {
        return switch(self) {
-            case Export(FileStorageId crawlId, int size, String name) -> {
+            case Export(FileStorageId crawlId, int size, String ctFilter, String name) -> {
                var storage = storageService.allocateStorage(FileStorageType.EXPORT,
                        "crawl-sample-export",
                        "Crawl Data Sample " + name + "/" + size + " " + LocalDateTime.now()
                );

                if (storage == null) yield new Error("Bad storage id");
-                yield new Run(crawlId, storage.id(), size, name);
+                yield new Run(crawlId, storage.id(), size, ctFilter, name);
            }
-            case Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) when msgId < 0 -> {
+            case Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name, long msgId) when msgId < 0 -> {
                storageService.setFileStorageState(destId, FileStorageState.NEW);

-                long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, size, name));
-                yield new Run(crawlId, destId, size, name, newMsgId);
+                long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, ctFilter, size, name));
+                yield new Run(crawlId, destId, size, ctFilter, name, newMsgId);
            }
-            case Run(_, FileStorageId destId, _, _, long msgId) -> {
+            case Run(_, FileStorageId destId, _, _, _, long msgId) -> {
                var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);

                if (rsp.state() != MqMessageState.OK) {
@@ -70,7 +70,7 @@ public class ExportSampleDataActor extends RecordActorPrototype {

    @Override
    public String describe() {
-        return "Export RSS/Atom feeds from crawl data";
+        return "Export sample crawl data";
    }

    @Inject
--- a/code/execution/java/nu/marginalia/execution/ExecutorExportGrpcService.java
+++ b/code/execution/java/nu/marginalia/execution/ExecutorExportGrpcService.java
@@ -49,6 +49,7 @@ public class ExecutorExportGrpcService
                    new ExportSampleDataActor.Export(
                            FileStorageId.of(request.getFileStorageId()),
                            request.getSize(),
+                            request.getCtFilter(),
                            request.getName()
                    )
            );
--- a/code/functions/favicon/api/build.gradle
+++ b/code/functions/favicon/api/build.gradle
@@ -0,0 +1,47 @@
+plugins {
+    id 'java'
+
+    id "com.google.protobuf" version "0.9.4"
+    id 'jvm-test-suite'
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
+    }
+}
+
+jar.archiveBaseName = 'favicon-api'
+
+apply from: "$rootProject.projectDir/protobuf.gradle"
+apply from: "$rootProject.projectDir/srcsets.gradle"
+
+dependencies {
+    implementation project(':code:common:model')
+    implementation project(':code:common:config')
+    implementation project(':code:common:service')
+
+    implementation libs.bundles.slf4j
+
+    implementation libs.prometheus
+    implementation libs.notnull
+    implementation libs.guava
+    implementation dependencies.create(libs.guice.get()) {
+        exclude group: 'com.google.guava'
+    }
+    implementation libs.gson
+    implementation libs.bundles.protobuf
+    implementation libs.guava
+    libs.bundles.grpc.get().each {
+        implementation dependencies.create(it) {
+            exclude group: 'com.google.guava'
+        }
+    }
+
+
+
+    testImplementation libs.bundles.slf4j.test
+    testImplementation libs.bundles.junit
+    testImplementation libs.mockito
+
+}
--- a/code/functions/favicon/api/java/nu/marginalia/api/favicon/FaviconClient.java
+++ b/code/functions/favicon/api/java/nu/marginalia/api/favicon/FaviconClient.java
@@ -0,0 +1,39 @@
+package nu.marginalia.api.favicon;
+
+import com.google.inject.Inject;
+import nu.marginalia.service.client.GrpcChannelPoolFactory;
+import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
+import nu.marginalia.service.discovery.property.ServiceKey;
+import nu.marginalia.service.discovery.property.ServicePartition;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Optional;
+
+public class FaviconClient {
+    private static final Logger logger = LoggerFactory.getLogger(FaviconClient.class);
+
+    private final GrpcMultiNodeChannelPool<FaviconAPIGrpc.FaviconAPIBlockingStub> channelPool;
+
+    @Inject
+    public FaviconClient(GrpcChannelPoolFactory factory) {
+        this.channelPool = factory.createMulti(
+                ServiceKey.forGrpcApi(FaviconAPIGrpc.class, ServicePartition.multi()),
+                FaviconAPIGrpc::newBlockingStub);
+    }
+
+    public record FaviconData(byte[] bytes, String contentType) {}
+
+
+    public Optional<FaviconData> getFavicon(String domain, int node) {
+        RpcFaviconResponse rsp = channelPool.call(FaviconAPIGrpc.FaviconAPIBlockingStub::getFavicon)
+                .forNode(node)
+                .run(RpcFaviconRequest.newBuilder().setDomain(domain).build());
+
+        if (rsp.getData().isEmpty())
+            return Optional.empty();
+
+        return Optional.of(new FaviconData(rsp.getData().toByteArray(), rsp.getContentType()));
+    }
+
+}
--- a/code/functions/favicon/api/src/main/protobuf/favicon.proto
+++ b/code/functions/favicon/api/src/main/protobuf/favicon.proto
@@ -0,0 +1,20 @@
+syntax="proto3";
+package marginalia.api.favicon;
+
+option java_package="nu.marginalia.api.favicon";
+option java_multiple_files=true;
+
+service FaviconAPI {
+  /** Fetches the favicon for a domain; the response data is empty if none is available. */
+  rpc getFavicon(RpcFaviconRequest) returns (RpcFaviconResponse) {}
+}
+
+message RpcFaviconRequest {
+  string domain = 1;
+}
+
+message RpcFaviconResponse {
+  string domain = 1;
+  bytes data = 2;
+  string contentType = 3;
+}
--- a/code/functions/favicon/build.gradle
+++ b/code/functions/favicon/build.gradle
@@ -0,0 +1,49 @@
+plugins {
+    id 'java'
+
+    id 'application'
+    id 'jvm-test-suite'
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
+    }
+}
+
+apply from: "$rootProject.projectDir/srcsets.gradle"
+
+dependencies {
+    implementation project(':code:common:config')
+    implementation project(':code:common:service')
+    implementation project(':code:common:model')
+    implementation project(':code:common:db')
+    implementation project(':code:functions:favicon:api')
+    implementation project(':code:processes:crawling-process')
+
+    implementation libs.bundles.slf4j
+
+    implementation libs.prometheus
+    implementation libs.guava
+    libs.bundles.grpc.get().each {
+        implementation dependencies.create(it) {
+            exclude group: 'com.google.guava'
+        }
+    }
+
+
+    implementation libs.notnull
+    implementation libs.guava
+    implementation dependencies.create(libs.guice.get()) {
+        exclude group: 'com.google.guava'
+    }
+    implementation dependencies.create(libs.spark.get()) {
+        exclude group: 'org.eclipse.jetty'
+    }
+
+    testImplementation libs.bundles.slf4j.test
+    testImplementation libs.bundles.junit
+    testImplementation libs.mockito
+
+
+}
--- a/code/functions/favicon/java/nu/marginalia/functions/favicon/FaviconGrpcService.java
+++ b/code/functions/favicon/java/nu/marginalia/functions/favicon/FaviconGrpcService.java
@@ -0,0 +1,48 @@
+package nu.marginalia.functions.favicon;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import com.google.protobuf.ByteString;
+import io.grpc.stub.StreamObserver;
+import nu.marginalia.api.favicon.FaviconAPIGrpc;
+import nu.marginalia.api.favicon.RpcFaviconRequest;
+import nu.marginalia.api.favicon.RpcFaviconResponse;
+import nu.marginalia.crawl.DomainStateDb;
+import nu.marginalia.service.server.DiscoverableService;
+
+import java.util.Optional;
+
+@Singleton
+public class FaviconGrpcService extends FaviconAPIGrpc.FaviconAPIImplBase implements DiscoverableService {
+    private final DomainStateDb domainStateDb;
+
+    @Inject
+    public FaviconGrpcService(DomainStateDb domainStateDb) {
+        this.domainStateDb = domainStateDb;
+    }
+
+    public boolean shouldRegisterService() {
+        return domainStateDb.isAvailable();
+    }
+
+    @Override
+    public void getFavicon(RpcFaviconRequest request, StreamObserver<RpcFaviconResponse> responseObserver) {
+        Optional<DomainStateDb.FaviconRecord> icon = domainStateDb.getIcon(request.getDomain());
+
+        RpcFaviconResponse response;
+        if (icon.isEmpty()) {
+            response = RpcFaviconResponse.newBuilder().build();
+        }
+        else {
+            var iconRecord = icon.get();
+            response = RpcFaviconResponse.newBuilder()
+                            .setContentType(iconRecord.contentType())
+                            .setDomain(request.getDomain())
+                            .setData(ByteString.copyFrom(iconRecord.imageData()))
+                            .build();
+        }
+
+        responseObserver.onNext(response);
+        responseObserver.onCompleted();
+    }
+}
--- a/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java
+++ b/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java
@@ -33,6 +33,7 @@ import java.sql.SQLException;
 import java.time.*;
 import java.time.format.DateTimeFormatter;
 import java.util.*;
+import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -71,7 +72,7 @@ public class FeedFetcherService {
    public enum UpdateMode {
        CLEAN,
        REFRESH
-    };
+    }

    public void updateFeeds(UpdateMode updateMode) throws IOException {
        if (updating) // Prevent concurrent updates
@@ -87,6 +88,7 @@ public class FeedFetcherService {
                .followRedirects(HttpClient.Redirect.NORMAL)
                .version(HttpClient.Version.HTTP_2)
                .build();
+             ExecutorService fetchExecutor = Executors.newCachedThreadPool();
             FeedJournal feedJournal = FeedJournal.create();
             var heartbeat = serviceHeartbeat.createServiceAdHocTaskHeartbeat("Update Rss Feeds")
        ) {
@@ -131,7 +133,7 @@ public class FeedFetcherService {

                        FetchResult feedData;
                        try (DomainLocks.DomainLock domainLock = domainLocks.lockDomain(new EdgeDomain(feed.domain()))) {
-                            feedData = fetchFeedData(feed, client, ifModifiedSinceDate, ifNoneMatchTag);
+                            feedData = fetchFeedData(feed, client, fetchExecutor, ifModifiedSinceDate, ifNoneMatchTag);
                        } catch (Exception ex) {
                            feedData = new FetchResult.TransientError();
                        }
@@ -211,6 +213,7 @@ public class FeedFetcherService {

    private FetchResult fetchFeedData(FeedDefinition feed,
                                      HttpClient client,
+                                      ExecutorService executorService,
                                      @Nullable String ifModifiedSinceDate,
                                      @Nullable String ifNoneMatchTag)
    {
@@ -226,18 +229,36 @@ public class FeedFetcherService {
                    .timeout(Duration.ofSeconds(15))
                    ;

-            if (ifModifiedSinceDate != null) {
+            // Set the If-Modified-Since or If-None-Match headers if we have them
+            // though since there are certain idiosyncrasies in server implementations,
+            // we avoid setting both at the same time as that may turn a 304 into a 200.
+            if (ifNoneMatchTag != null) {
+                requestBuilder.header("If-None-Match", ifNoneMatchTag);
+            } else if (ifModifiedSinceDate != null) {
                requestBuilder.header("If-Modified-Since", ifModifiedSinceDate);
            }

-            if (ifNoneMatchTag != null) {
-                requestBuilder.header("If-None-Match", ifNoneMatchTag);
-            }

            HttpRequest getRequest = requestBuilder.build();

            for (int i = 0; i < 3; i++) {
-                HttpResponse<byte[]> rs = client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray());
+
+                /* Note we need to use an executor to time-limit the send() method in HttpClient, as
+                 * its support for timeouts only applies to the time until response starts to be received,
+                 * and does not catch the case when the server starts to send data but then hangs.
+                 */
+                var pendingResponse = executorService.submit(
+                        () -> client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray()));
+
+                HttpResponse<byte[]> rs;
+                try {
+                    rs = pendingResponse.get(15, TimeUnit.SECONDS);
+                }
+                finally {
+                    // Interrupt the request if it is still running (e.g. after a timeout) so a hung
+                    // transfer does not keep its thread busy; this is a no-op once the task has completed.
+                    pendingResponse.cancel(true);
+                }

                if (rs.statusCode() == 429) { // Too Many Requests
                    int retryAfter = Integer.parseInt(rs.headers().firstValue("Retry-After").orElse("2"));
--- a/code/functions/live-capture/java/nu/marginalia/rss/svc/SimpleFeedParser.java
+++ b/code/functions/live-capture/java/nu/marginalia/rss/svc/SimpleFeedParser.java
@@ -79,9 +79,17 @@ public class SimpleFeedParser {
                if (!link.isBlank())
                    break;
                var tag = element.getElementsByTag(attr).first();
+
                if (tag != null) {
-                    link = tag.text();
+                    String linkText = tag.text();
+
+                    if (linkText.isBlank()) {
+                        linkText = tag.attr("href");
+                    }
+
+                    link = linkText;
                }
+
            }

            ret.add(new ItemData(title, description, link, pubDate));
--- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java
+++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java
@@ -1,6 +1,7 @@
 package nu.marginalia.api.searchquery.model.results;

 import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import org.jetbrains.annotations.NotNull;

@@ -161,4 +162,14 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
    public String toString() {
        return "DecoratedSearchResultItem(rawIndexResult=" + this.getRawIndexResult() + ", url=" + this.getUrl() + ", title=" + this.getTitle() + ", description=" + this.getDescription() + ", urlQuality=" + this.getUrlQuality() + ", format=" + this.getFormat() + ", features=" + this.getFeatures() + ", pubYear=" + this.getPubYear() + ", dataHash=" + this.getDataHash() + ", wordsTotal=" + this.getWordsTotal() + ", bestPositions=" + this.getBestPositions() + ", rankingScore=" + this.getRankingScore() + ", resultsFromDomain=" + this.getResultsFromDomain() + ", rankingDetails=" + this.getRankingDetails() + ")";
    }
+
+    /** Returns the shortFormat of the DocumentFormat named by {@code format}; a null or unrecognized name maps to UNKNOWN. */
+    public String getShortFormat() {
+        try {
+            return (format == null ? DocumentFormat.UNKNOWN : DocumentFormat.valueOf(format)).shortFormat; // valueOf(null) would throw NPE
+        }
+        catch (IllegalArgumentException e) {
+            return DocumentFormat.UNKNOWN.shortFormat;
+        }
+    }
 }
--- a/code/libraries/blocking-thread-pool/java/nu/marginalia/util/SimpleBlockingThreadPool.java
+++ b/code/libraries/blocking-thread-pool/java/nu/marginalia/util/SimpleBlockingThreadPool.java
@@ -23,16 +23,33 @@ public class SimpleBlockingThreadPool {
    private final Logger logger = LoggerFactory.getLogger(SimpleBlockingThreadPool.class);

    public SimpleBlockingThreadPool(String name, int poolSize, int queueSize) {
+        this(name, poolSize, queueSize, ThreadType.PLATFORM);
+    }
+
+    public SimpleBlockingThreadPool(String name, int poolSize, int queueSize, ThreadType threadType) {
        tasks = new ArrayBlockingQueue<>(queueSize);

        for (int i = 0; i < poolSize; i++) {
-            Thread worker = new Thread(this::worker, name  + "[" + i + "]");
-            worker.setDaemon(true);
-            worker.start();
+            // Only platform threads need daemon(true); virtual threads are always daemon threads
+            Thread.Builder threadBuilder = switch (threadType) {
+                case VIRTUAL -> Thread.ofVirtual();
+                case PLATFORM -> Thread.ofPlatform().daemon(true);
+            };
+
+            Thread worker = threadBuilder
+                    .name(name  + "[" + i + "]")
+                    .start(this::worker);
+
            workers.add(worker);
        }

    }
+
+    public enum ThreadType {
+        VIRTUAL,
+        PLATFORM
+    }
+
    public void submit(Task task) throws InterruptedException {
        tasks.put(task);
    }
--- a/code/processes/converting-process/build.gradle
+++ b/code/processes/converting-process/build.gradle
@@ -62,6 +62,7 @@ dependencies {
    implementation libs.jwarc

    implementation libs.jsoup
+    implementation libs.pdfbox

    implementation libs.guava
    implementation dependencies.create(libs.guice.get()) {
@@ -87,6 +88,8 @@ dependencies {
    implementation libs.commons.compress
    implementation libs.sqlite

+    implementation libs.bundles.httpcomponents
+
    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
--- a/code/processes/converting-process/java/nu/marginalia/converting/model/ProcessedDocumentDetails.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/model/ProcessedDocumentDetails.java
@@ -1,8 +1,8 @@
 package nu.marginalia.converting.model;

+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.HtmlFeature;
-import nu.marginalia.model.html.HtmlStandard;
 import nu.marginalia.model.idx.DocumentMetadata;

 import javax.annotation.Nullable;
@@ -21,7 +21,7 @@ public class ProcessedDocumentDetails {
    public long hashCode;

    public Set<HtmlFeature> features;
-    public HtmlStandard standard;
+    public DocumentFormat format;

    public List<EdgeUrl> linksInternal;
    public List<EdgeUrl> linksExternal;
@@ -30,6 +30,6 @@ public class ProcessedDocumentDetails {
    public GeneratorType generator;

    public String toString() {
-        return "ProcessedDocumentDetails(title=" + this.title + ", description=" + this.description + ", pubYear=" + this.pubYear + ", length=" + this.length + ", quality=" + this.quality + ", hashCode=" + this.hashCode + ", features=" + this.features + ", standard=" + this.standard + ", linksInternal=" + this.linksInternal + ", linksExternal=" + this.linksExternal + ", metadata=" + this.metadata + ", generator=" + this.generator + ")";
+        return "ProcessedDocumentDetails(title=" + this.title + ", description=" + this.description + ", pubYear=" + this.pubYear + ", length=" + this.length + ", quality=" + this.quality + ", hashCode=" + this.hashCode + ", features=" + this.features + ", format=" + this.format + ", linksInternal=" + this.linksInternal + ", linksExternal=" + this.linksExternal + ", metadata=" + this.metadata + ", generator=" + this.generator + ")";
    }
 }
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentProcessor.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentProcessor.java
@@ -7,6 +7,7 @@ import nu.marginalia.converting.model.DisqualifiedException;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.processor.plugin.AbstractDocumentProcessorPlugin;
 import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
+import nu.marginalia.converting.processor.plugin.PdfDocumentProcessorPlugin;
 import nu.marginalia.converting.processor.plugin.PlainTextDocumentProcessorPlugin;
 import nu.marginalia.keyword.LinkTexts;
 import nu.marginalia.model.EdgeDomain;
@@ -33,7 +34,8 @@ public class DocumentProcessor {
    private static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
            "application/xhtml",
            "text/html",
-            "text/plain");
+            "text/plain",
+            "application/pdf");


    private final List<AbstractDocumentProcessorPlugin> processorPlugins = new ArrayList<>();
@@ -42,12 +44,14 @@ public class DocumentProcessor {
    @Inject
    public DocumentProcessor(HtmlDocumentProcessorPlugin htmlDocumentProcessorPlugin,
                             PlainTextDocumentProcessorPlugin plainTextDocumentProcessorPlugin,
+                             PdfDocumentProcessorPlugin pdfDocumentProcessorPlugin,
                             AnchorTextKeywords anchorTextKeywords)
    {
        this.anchorTextKeywords = anchorTextKeywords;

        processorPlugins.add(htmlDocumentProcessorPlugin);
        processorPlugins.add(plainTextDocumentProcessorPlugin);
+        processorPlugins.add(pdfDocumentProcessorPlugin);
    }

    public ProcessedDocument process(CrawledDocument crawledDocument,
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentValuator.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentValuator.java
@@ -2,9 +2,9 @@ package nu.marginalia.converting.processor.logic;

 import crawlercommons.utils.Strings;
 import nu.marginalia.converting.model.DisqualifiedException;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jetbrains.annotations.NotNull;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
@@ -17,7 +17,7 @@ import java.util.Set;
 public class DocumentValuator {

    public double getQuality(CrawledDocument crawledDocument,
-                             HtmlStandard htmlStandard,
+                             DocumentFormat htmlStandard,
                             Document parsedDocument,
                             int textLength) throws DisqualifiedException {

--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/HtmlStandardExtractor.java
@@ -1,7 +1,7 @@
 package nu.marginalia.converting.processor.logic;

 import com.google.common.base.Strings;
-import nu.marginalia.model.html.HtmlStandard;
+import nu.marginalia.model.DocumentFormat;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.DocumentType;
 import org.slf4j.Logger;
@@ -12,54 +12,54 @@ public class HtmlStandardExtractor {

    private static final Logger logger = LoggerFactory.getLogger(HtmlStandardExtractor.class);

-    public static HtmlStandard parseDocType(DocumentType docType) {
+    public static DocumentFormat parseDocType(DocumentType docType) {
        if (null == docType) {
-            return HtmlStandard.UNKNOWN;
+            return DocumentFormat.UNKNOWN;
        }

        String publicId = docType.publicId();
        if (Strings.isNullOrEmpty(publicId))
-            return HtmlStandard.HTML5;
+            return DocumentFormat.HTML5;

        publicId = publicId.toUpperCase();
        if (publicId.startsWith("-//SOFTQUAD SOFTWARE//DTD") && publicId.contains("HTML 4")) {
-            return HtmlStandard.HTML4;
+            return DocumentFormat.HTML4;
        }
        if (publicId.startsWith("-//SOFTQUAD SOFTWARE//DTD") && publicId.contains("HTML 3")) {
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
        }
        if (publicId.startsWith("-//INTERNET/RFC XXXX//EN"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
        if (publicId.startsWith("-//NETSCAPE COMM. CORP"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
        if (publicId.startsWith("-//SQ//DTD HTML 2"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
        if (publicId.startsWith("-//SOFTQUAD//DTD HTML 2"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
        if (publicId.startsWith("-//W3O//DTD W3 HTML 2"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
        if (publicId.startsWith("-//IETF//DTD HTML 2"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
        if (publicId.startsWith("-//IETF//DTD HTML//EN"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
        if (publicId.startsWith("-/W3C//DTD HTML 3"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
        if (publicId.startsWith("-/W3C/DTD HTML 3"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
        if (publicId.startsWith("-//IETF//DTD HTML 3"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
        if (publicId.startsWith("-//W3C//DTD XHTML"))
-            return HtmlStandard.XHTML;
+            return DocumentFormat.XHTML;
        if (publicId.startsWith("ISO/IEC 15445:2000//DTD"))
-            return HtmlStandard.XHTML;
+            return DocumentFormat.XHTML;
        if (publicId.startsWith("-//W3C//DTD HTML"))
-            return HtmlStandard.HTML4;
+            return DocumentFormat.HTML4;

        logger.debug("Unknown publicID standard {}", publicId);
-        return HtmlStandard.UNKNOWN;
+        return DocumentFormat.UNKNOWN;
    }

-    public static HtmlStandard sniffHtmlStandard(Document parsed) {
+    public static DocumentFormat sniffHtmlStandard(Document parsed) {
        int html4Attributes = 0;
        int html5Attributes = 0;

@@ -73,11 +73,11 @@ public class HtmlStandardExtractor {
            html4Attributes++;
        }
        if (html5Attributes > 0) {
-            return HtmlStandard.HTML5;
+            return DocumentFormat.HTML5;
        }
        if (html4Attributes > 0) {
-            return HtmlStandard.HTML4;
+            return DocumentFormat.HTML4;
        }
-        return HtmlStandard.HTML123;
+        return DocumentFormat.HTML123;
    }
 }
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java
@@ -7,11 +7,11 @@ import nu.marginalia.keyword.LinkTexts;
 import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
 import nu.marginalia.language.filter.LanguageFilter;
 import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.html.HtmlStandard;

 import javax.annotation.Nullable;
 import java.io.IOException;
@@ -73,7 +73,7 @@ public abstract class AbstractDocumentProcessorPlugin {
            return this;
        }

-        public MetaTagsBuilder addFormat(HtmlStandard standard) {
+        public MetaTagsBuilder addFormat(DocumentFormat standard) {

            add("format", standard);

--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
@@ -25,12 +25,12 @@ import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
 import nu.marginalia.link_parser.FeedExtractor;
 import nu.marginalia.link_parser.LinkParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.html.HtmlStandard;
 import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import org.jsoup.nodes.Document;
@@ -137,8 +137,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin


        final int length = getLength(doc);
-        final HtmlStandard standard = getHtmlStandard(doc);
-        final double quality = documentValuator.getQuality(crawledDocument, standard, doc, length);
+        final DocumentFormat format = getDocumentFormat(doc);
+        final double quality = documentValuator.getQuality(crawledDocument, format, doc, length);

        if (isDisqualified(documentClass, url, quality, doc.title())) {
            throw new DisqualifiedException(DisqualificationReason.QUALITY);
@@ -152,7 +152,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
        var ret = new ProcessedDocumentDetails();

        ret.length = length;
-        ret.standard = standard;
+        ret.format = format;
        ret.title = specialization.getTitle(doc, dld, crawledDocument.url);

        final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, documentHeaders, dld);
@@ -161,7 +161,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
        ret.quality = documentValuator.adjustQuality(quality, features);
        ret.hashCode = dld.localitySensitiveHashCode();

-        PubDate pubDate = pubDateSniffer.getPubDate(documentHeaders, url, doc, standard, true);
+        PubDate pubDate = pubDateSniffer.getPubDate(documentHeaders, url, doc, format, true);

        EnumSet<DocumentFlags> documentFlags = documentFlags(features, generatorParts.type());

@@ -180,7 +180,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
                .addPubDate(pubDate)
                .addUrl(url)
                .addFeatures(features)
-                .addFormat(standard)
+                .addFormat(format)
                .addGenerator(generatorParts.keywords())
                .build();

@@ -316,12 +316,12 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
        return linkTerms;
    }

-    private HtmlStandard getHtmlStandard(Document doc) {
-        HtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(doc.documentType());
-        if (HtmlStandard.UNKNOWN.equals(htmlStandard)) {
+    private DocumentFormat getDocumentFormat(Document doc) {
+        DocumentFormat format = HtmlStandardExtractor.parseDocType(doc.documentType());
+        if (DocumentFormat.UNKNOWN.equals(format)) {
            return HtmlStandardExtractor.sniffHtmlStandard(doc);
        }
-        return htmlStandard;
+        return format;
    }

    private int getLength(Document doc) {
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/PdfDocumentProcessorPlugin.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/PdfDocumentProcessorPlugin.java
@@ -0,0 +1,286 @@
+package nu.marginalia.converting.processor.plugin;
+
+import com.google.inject.Inject;
+import com.google.inject.name.Named;
+import nu.marginalia.converting.model.DisqualifiedException;
+import nu.marginalia.converting.model.ProcessedDocumentDetails;
+import nu.marginalia.converting.processor.DocumentClass;
+import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
+import nu.marginalia.converting.processor.plugin.specialization.DefaultSpecialization;
+import nu.marginalia.keyword.DocumentKeywordExtractor;
+import nu.marginalia.keyword.LinkTexts;
+import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
+import nu.marginalia.language.filter.LanguageFilter;
+import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
+import nu.marginalia.model.DocumentFormat;
+import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.crawl.HtmlFeature;
+import nu.marginalia.model.crawl.PubDate;
+import nu.marginalia.model.crawldata.CrawledDocument;
+import nu.marginalia.model.idx.DocumentFlags;
+import nu.marginalia.model.idx.DocumentMetadata;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.text.HeadingAwarePDFTextStripper;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.time.LocalDate;
+import java.util.*;
+
+
+public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {
+
+    private final int maxTitleLength;
+    private final DocumentKeywordExtractor keywordExtractor;
+    private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
+    private final DocumentLengthLogic documentLengthLogic;
+    private final DefaultSpecialization defaultSpecialization;
+
+    private static final Logger logger = LoggerFactory.getLogger(PdfDocumentProcessorPlugin.class);
+
+    /** Injected constructor; maxTitleLength is bound under the name "max-title-length" and caps the title set in createDetails. */
+    @Inject
+    public PdfDocumentProcessorPlugin(@Named("max-title-length") Integer maxTitleLength,
+                                      LanguageFilter languageFilter,
+                                      ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
+                                      DocumentKeywordExtractor keywordExtractor,
+                                      DocumentLengthLogic documentLengthLogic,
+                                      DefaultSpecialization defaultSpecialization)
+    {
+        super(languageFilter);
+        this.sentenceExtractorProvider = sentenceExtractorProvider;
+        this.documentLengthLogic = documentLengthLogic;
+        this.maxTitleLength = maxTitleLength;
+        this.keywordExtractor = keywordExtractor;
+        this.defaultSpecialization = defaultSpecialization;
+    }
+
+    /** Accepts the content type "application/pdf", with or without trailing ";"-separated parameters. */
+    @Override
+    public boolean isApplicable(CrawledDocument doc) {
+        // Locale.ROOT: a default locale such as Turkish lower-cases 'I' to a dotless i, which would break the match
+        String contentType = doc.contentType.toLowerCase(Locale.ROOT);
+
+        if (contentType.equals("application/pdf"))
+            return true;
+        // Parameters may follow the media type, e.g. "application/pdf; charset=blabla"
+        return contentType.startsWith("application/pdf;");
+    }
+
+    /** Builds details and keywords for a PDF by converting its bytes to an HTML rendering and running sentence extraction, language and length checks on it. */
+    @Override
+    public DetailsWithWords createDetails(CrawledDocument crawledDocument,
+                                          LinkTexts linkTexts,
+                                          DocumentClass documentClass)
+            throws DisqualifiedException, URISyntaxException, IOException {
+
+        String documentBody = crawledDocument.documentBody();
+        // NOTE(review): this check runs on the crawled body string, not on text extracted from the PDF — confirm that is intended
+        if (languageFilter.isBlockedUnicodeRange(documentBody)) {
+            throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE);
+        }
+
+        final EdgeUrl url = new EdgeUrl(crawledDocument.url);
+
+        Document doc;
+        try {
+            doc = convertPdfToHtml(crawledDocument.documentBodyBytes);
+        } catch (IOException e) {
+            logger.error("Failed to convert PDF file {} - {}", url, e.getMessage());
+            throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.ERROR);
+        }
+
+        DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(doc);
+
+        checkDocumentLanguage(dld);
+
+        documentLengthLogic.validateLength(dld, 1.0);
+
+        var ret = new ProcessedDocumentDetails();
+        // Length of the crawled body string, not of the converted HTML
+        ret.length = documentBody.length();
+
+        ret.format = DocumentFormat.PDF;
+        ret.title = StringUtils.truncate(defaultSpecialization.getTitle(doc, dld, url.toString()), maxTitleLength);
+        // Fixed quality for PDFs; its negation (5) is passed to DocumentMetadata below
+        ret.quality = -5;
+
+        ret.features = Set.of(HtmlFeature.PDF);
+        ret.description = getDescription(doc);
+        ret.hashCode = dld.localitySensitiveHashCode();
+        // NOTE(review): the publication date is hard-coded to 1993-01-01; nothing is read from the PDF — confirm
+        final PubDate pubDate = new PubDate(LocalDate.ofYearDay(1993, 1));
+
+        EnumSet<DocumentFlags> documentFlags = EnumSet.of(DocumentFlags.PdfFile);
+
+        ret.metadata = new DocumentMetadata(
+                documentLengthLogic.getEncodedAverageLength(dld),
+                pubDate.yearByte(),
+                (int) -ret.quality,
+                documentFlags);
+
+        DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, linkTexts, url);
+
+        var tagWords = new MetaTagsBuilder()
+                .addPubDate(pubDate)
+                .addUrl(url)
+                .addFeatures(ret.features)
+                .addFormat(ret.format)
+                .build();
+
+        words.addAllSyntheticTerms(tagWords);
+
+        if (pubDate.hasYear()) {
+            ret.pubYear = pubDate.year();
+        }
+
+        /* These are assumed to be populated */
+        ret.linksInternal = new ArrayList<>();
+        ret.linksExternal = new ArrayList<>();
+
+        return new DetailsWithWords(ret, words);
+    }
+
+    /** Returns the abstract (abbreviated to 255 characters) if one is found among the leading paragraphs, otherwise the default specialization's summary. */
+    private String getDescription(Document doc) {
+        int cnt = 0;
+        boolean useNext = false;
+        for (var ptag : doc.getElementsByTag("p")) {
+            String text = ptag.text();
+
+            // Many academic documents have an abstract at the start of the document,
+            // which makes a nice summary.  Though they tend to bleed into the text,
+            // so we check for the word "Abstract" at the start of the paragraph.
+
+            if (text.startsWith("Abstract ")) {
+                return StringUtils.abbreviate(text.substring("Abstract ".length()), "...", 255);
+            }
+            else if (text.equals("Abstract")) {
+                useNext = true;
+            }
+            else if (useNext) {
+                return StringUtils.abbreviate(text, "...", 255);
+            }
+
+            if (++cnt > 15) { // Don't scan the entire document
+                break;
+            }
+        }
+
+        // Fall back to the default specialization
+        return defaultSpecialization.getSummary(doc, Set.of());
+    }
+
+    /** Convert the provided PDF bytes into an HTML rendering that can be fed to the HTML processor.
+     * The title is the first h1 longer than two characters, else the first paragraph, else the metadata title.
+     */
+    Document convertPdfToHtml(byte[] pdfBytes) throws IOException {
+        try (var doc = Loader.loadPDF(pdfBytes)) {
+            String docMetaTitle = Objects.requireNonNullElse(doc.getDocumentInformation().getTitle(), "");
+
+            var stripper = new HeadingAwarePDFTextStripper();
+            stripper.setStartPage(1);
+            stripper.setSortByPosition(true);
+            stripper.setWordSeparator(" ");
+
+            // Increase the tolerance for line spacing to deal better with paragraphs.
+            stripper.setDropThreshold(5f);
+            // Emit page, paragraph and heading boundaries as HTML tags so the output can be parsed as a document
+            stripper.setPageStart("<div>");
+            stripper.setParagraphStart("<p>");
+            stripper.setParagraphEnd("</p>\n");
+            stripper.setPageEnd("</div>\n");
+            stripper.setHeadingStart("<h1>");
+            stripper.setHeadingEnd("</h1>\n");
+            stripper.setLineSeparator("\n");
+
+            String text = stripper.getText(doc);
+            // NOTE(review): text is embedded without HTML-escaping here; presumably '<' or '&' from the PDF reaches the parser as markup unless the stripper escapes it — confirm
+            StringBuilder htmlBuilder = new StringBuilder(text.length() + 1024);
+            htmlBuilder.append("<html><body>")
+                    .append(text)
+                    .append("</body></html>");
+
+            var parsed = Jsoup.parse(htmlBuilder.toString());
+
+            repairDOM(parsed);
+            // Title candidate 1: the first heading with more than two characters of text
+            for (var heading : parsed.getElementsByTag("h1")) {
+                String headingText = heading.text();
+                if (headingText.length() > 2) {
+                    parsed.title(headingText);
+                    break;
+                }
+            }
+
+
+            if (parsed.title().isEmpty()) {
+                // Prefer setting the title to the first paragraph in the
+                // document, as this is almost always correct.  Otherwise,
+                // we fall back on the metadata title, which is almost always
+                // useless
+
+                var firstP = parsed.getElementsByTag("p").first();
+                if (firstP != null) parsed.title(firstP.text());
+                else parsed.title(docMetaTitle);
+            }
+            return parsed;
+        }
+
+
+    }
+
+    /** Repair the DOM to remove some common issues with PDF conversion,
+     * including empty paragraphs, and multiline headers that are split into multiple
+     * consecutive h1 tags.
+     */
+    private void repairDOM(Document parsed) {
+
+        // <p><h1>...</h1></p> -> <h1>...</h1>
+        parsed.getElementsByTag("h1").forEach(h1 -> {
+            var parent = h1.parent();
+            if (parent == null || !"p".equals(parent.tagName())) {
+                return;
+            }
+
+            if (parent.childrenSize() == 1) {
+                parent.replaceWith(h1);
+            }
+        });
+
+        // Remove empty <p> tags
+        parsed.getElementsByTag("p").forEach(p -> {
+            if (p.childrenSize() == 0 && !p.hasText()) {
+                p.remove();
+            }
+        });
+
+        // <h1>...</h1><h1>...</h1> -> <h1>...</h1>
+        parsed.getElementsByTag("h1").forEach(h1 -> {
+            var nextSibling = h1.nextElementSibling();
+            if (nextSibling == null || !"h1".equals(nextSibling.tagName())) {
+                return; // Short-circuit to avoid unnecessary work
+            }
+
+            StringJoiner joiner = new StringJoiner(" ");
+            joiner.add(h1.text());
+
+            for (var sibling : h1.nextElementSiblings()) {
+                if (!"h1".equals(sibling.tagName()))
+                    break;
+                joiner.add(sibling.text());
+                sibling.remove();
+            }
+
+            h1.text(joiner.toString());
+        });
+
+    }
+
+}
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java
@@ -13,10 +13,10 @@ import nu.marginalia.keyword.LinkTexts;
 import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
 import nu.marginalia.language.filter.LanguageFilter;
 import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.html.HtmlStandard;
 import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import org.apache.commons.lang3.StringUtils;
@@ -91,7 +91,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP

        ret.length = documentBody.length();

-        ret.standard = HtmlStandard.PLAIN;
+        ret.format = DocumentFormat.PLAIN;
        ret.title = StringUtils.truncate(plainTextLogic.getTitle(url, firstFewLines), maxTitleLength);

        ret.quality = -1;
@@ -113,7 +113,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
                .addPubDate(pubDate)
                .addUrl(url)
                .addFeatures(ret.features)
-                .addFormat(ret.standard)
+                .addFormat(ret.format)
                .build();

        words.addAllSyntheticTerms(tagWords);
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateFromHtmlStandard.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateFromHtmlStandard.java
@@ -1,12 +1,13 @@
 package nu.marginalia.converting.processor.pubdate;

-import nu.marginalia.model.html.HtmlStandard;
+import nu.marginalia.model.DocumentFormat;

 public class PubDateFromHtmlStandard {
    /** Used to bias pub date heuristics */
-    public static int blindGuess(HtmlStandard standard) {
-        return switch (standard) {
+    public static int blindGuess(DocumentFormat format) {
+        return switch (format) {
            case PLAIN -> 1993;
+            case PDF -> 2010;
            case HTML123 -> 1997;
            case HTML4, XHTML -> 2006;
            case HTML5 -> 2018;
@@ -21,8 +22,8 @@ public class PubDateFromHtmlStandard {
     * Discovering publication year involves a lot of guesswork, this helps
     * keep the guesses relatively sane.
     */
-    public static boolean isGuessPlausible(HtmlStandard standard, int year) {
-        switch (standard) {
+    public static boolean isGuessPlausible(DocumentFormat format, int year) {
+        switch (format) {
            case HTML123:
                return year <= 2000;
            case XHTML:
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateHeuristic.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateHeuristic.java
@@ -1,14 +1,14 @@
 package nu.marginalia.converting.processor.pubdate;

 import nu.marginalia.converting.model.DocumentHeaders;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;

 import java.util.Optional;

 public interface PubDateHeuristic {

-    Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard);
+    Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard);
 }
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateParser.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateParser.java
@@ -1,7 +1,7 @@
 package nu.marginalia.converting.processor.pubdate;

+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;

 import java.time.DateTimeException;
 import java.time.LocalDate;
@@ -26,7 +26,7 @@ public class PubDateParser {
                .filter(PubDateParser::validateDate);
    }

-    public static Optional<PubDate> attemptParseDate(String date, HtmlStandard standard) {
+    public static Optional<PubDate> attemptParseDate(String date, DocumentFormat standard) {
        return Optional.ofNullable(date)
                .filter(str -> str.length() >= 4 && str.length() < 32)
                .flatMap(str ->
@@ -81,7 +81,7 @@ public class PubDateParser {
    }


-    public static Optional<PubDate> dateFromHighestYearLookingSubstringWithGuess(String maybe, HtmlStandard standard) {
+    public static Optional<PubDate> dateFromHighestYearLookingSubstringWithGuess(String maybe, DocumentFormat standard) {
        int guess = PubDateFromHtmlStandard.blindGuess(standard);

        var matcher = yearPattern.matcher(maybe);
@@ -135,7 +135,7 @@ public class PubDateParser {
        return (max + min) / 2;
    }

-    public static int guessYear(HtmlStandard standard) {
+    public static int guessYear(DocumentFormat standard) {
        // Create some jitter to avoid having documents piling up in the same four years
        // as this would make searching in those years disproportionately useless

--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateSniffer.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateSniffer.java
@@ -2,9 +2,9 @@ package nu.marginalia.converting.processor.pubdate;

 import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.heuristic.*;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;

 import java.util.ArrayList;
@@ -38,7 +38,7 @@ public class PubDateSniffer {
        heuristics.add(new PubDateHeuristicGuessFromHtmlStandard());
    }

-    public PubDate getPubDate(DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard, boolean runExpensive) {
+    public PubDate getPubDate(DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard, boolean runExpensive) {
        final PubDateEffortLevel effortLevel = runExpensive ? PubDateEffortLevel.HIGH : PubDateEffortLevel.LOW;

        for (var heuristic : heuristics) {
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jetbrains.annotations.NotNull;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
@@ -19,7 +19,7 @@ import java.util.Optional;
 public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {

    @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        if (effortLevel == PubDateEffortLevel.LOW)
            return Optional.empty();

@@ -33,9 +33,9 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {

    private static class DateExtractingNodeVisitorPass implements NodeFilter {
        public PubDate pubDate;
-        private final HtmlStandard htmlStandard;
+        private final DocumentFormat htmlStandard;

-        private DateExtractingNodeVisitorPass(HtmlStandard htmlStandard) {
+        private DateExtractingNodeVisitorPass(DocumentFormat htmlStandard) {
            this.htmlStandard = htmlStandard;
        }

@@ -135,7 +135,7 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
        }

        private void parse(String text) {
-            if (htmlStandard == HtmlStandard.UNKNOWN) {
+            if (htmlStandard == DocumentFormat.UNKNOWN) {
                PubDateParser
                        .dateFromHighestYearLookingSubstring(text)
                        .ifPresent(this::setPubDate);
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java
@@ -5,9 +5,9 @@ import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateFromHtmlStandard;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jetbrains.annotations.NotNull;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Node;
@@ -19,7 +19,7 @@ import java.util.Optional;
 public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {

    @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        if (effortLevel == PubDateEffortLevel.LOW)
            return Optional.empty();

@@ -33,9 +33,9 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {

    private static class DateExtractingNodeVisitor implements NodeFilter {
        public PubDate pubDate;
-        private final HtmlStandard htmlStandard;
+        private final DocumentFormat htmlStandard;

-        private DateExtractingNodeVisitor(HtmlStandard htmlStandard) {
+        private DateExtractingNodeVisitor(DocumentFormat htmlStandard) {
            this.htmlStandard = htmlStandard;
        }

@@ -73,7 +73,7 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
        }

        private void parse(String text) {
-            if (htmlStandard == HtmlStandard.UNKNOWN) {
+            if (htmlStandard == DocumentFormat.UNKNOWN) {
                PubDateParser
                        .dateFromHighestYearLookingSubstring(text)
                        .ifPresent(this::setPubDate);
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;

 import java.util.Optional;
@@ -14,8 +14,8 @@ import java.util.Optional;
 public class PubDateHeuristicGuessFromHtmlStandard implements PubDateHeuristic {

    @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
-        if (htmlStandard == HtmlStandard.UNKNOWN)
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
+        if (htmlStandard == DocumentFormat.UNKNOWN)
            return Optional.empty();

        return Optional.of(new PubDate(null, PubDateParser.guessYear(htmlStandard)));
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;

 import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
 public class PubDateHeuristicHtml5AnyTimeTag implements PubDateHeuristic {

    @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        // HTML5, alternative approach
        for (var tag : document.select("time")) {
            var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;

 import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
 public class PubDateHeuristicHtml5ArticleDateTag implements PubDateHeuristic {

    @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        // HTML5
        for (var tag : document.select("time[pubdate=\"pubdate\"]")) {
            var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;

 import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
 public class PubDateHeuristicHtml5ItempropDateTag implements PubDateHeuristic {

    @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        for (var tag : document.select("time[itemprop=\"datePublished\"]")) {
            var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
            if (maybeDate.isPresent()) {
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicJSONLD.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicJSONLD.java
@@ -8,9 +8,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;

 import java.util.Collections;
@@ -21,7 +21,7 @@ import java.util.Optional;
 public class PubDateHeuristicJSONLD implements PubDateHeuristic {

    @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        for (var tag : document.select("script[type=\"application/ld+json\"]")) {
            var maybeDate = parseLdJson(tag.data())
                    .flatMap(PubDateParser::attemptParseDate);
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicLastModified.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicLastModified.java
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;

 import java.util.List;
@@ -15,7 +15,7 @@ import java.util.Optional;
 public class PubDateHeuristicLastModified implements PubDateHeuristic {

    @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        List<String> lastModified = headers.get("last-modified");
        if (lastModified.isEmpty())
            return Optional.empty();
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicMicrodata.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicMicrodata.java
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;

 import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
 public class PubDateHeuristicMicrodata implements PubDateHeuristic {

    @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {

        for (var tag : document.select("meta[itemprop=\"datePublished\"]")) {
            var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicOpenGraph.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicOpenGraph.java
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;

 import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
 public class PubDateHeuristicOpenGraph implements PubDateHeuristic {

    @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        // OG
        for (var tag : document.select("meta[property=\"article:published_time\"]")) {
            var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicRDFaTag.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicRDFaTag.java
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;

 import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
 public class PubDateHeuristicRDFaTag implements PubDateHeuristic {

    @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        for (var tag : document.select("meta[property=\"datePublished\"]")) {
            var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
            if (maybeDate.isPresent()) {
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;

 import java.util.Optional;
@@ -21,7 +21,7 @@ public class PubDateHeuristicUrlPatternPass1 implements PubDateHeuristic {
    private static final int MIN_URL_PATTERN_YEAR = 2000;

    @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        final String urlString = url.path;

        var matcher = yearUrlPattern.matcher(urlString);
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;

 import java.util.Optional;
@@ -19,7 +19,7 @@ public class PubDateHeuristicUrlPatternPass2 implements PubDateHeuristic {

    @Override
    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url,
-                                   Document document, HtmlStandard htmlStandard) {
+                                   Document document, DocumentFormat htmlStandard) {
        final String urlString = url.path;

        var matcher = yearUrlPattern.matcher(urlString);
--- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java
@@ -8,12 +8,12 @@ import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.processor.DocumentClass;
 import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
 import nu.marginalia.keyword.LinkTexts;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.model.crawl.UrlIndexingState;
 import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.html.HtmlStandard;
 import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.model.idx.WordFlags;
@@ -83,7 +83,7 @@ public class SideloaderProcessing {
            // that we can't get from the sideloaded data since it's
            // so stripped down

-            ret.details.standard = HtmlStandard.HTML5;
+            ret.details.format = DocumentFormat.HTML5;
            ret.details.pubYear = pubYear;
            ret.details.features.add(HtmlFeature.JS);
            ret.details.features.add(HtmlFeature.TRACKING);
--- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java
@@ -9,13 +9,13 @@ import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb;
 import nu.marginalia.keyword.DocumentKeywordExtractor;
 import nu.marginalia.keyword.LinkTexts;
 import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.model.crawl.UrlIndexingState;
-import nu.marginalia.model.html.HtmlStandard;
 import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.model.idx.WordFlags;
@@ -165,7 +165,7 @@ public class StackexchangeSideloader implements SideloadSource {
            ret.details.description = StringUtils.truncate(doc.body().text(), 255);
            ret.details.length = 128;

-            ret.details.standard = HtmlStandard.HTML5;
+            ret.details.format = DocumentFormat.HTML5;
            ret.details.linksExternal = List.of();
            ret.details.linksInternal = List.of();
            ret.state = UrlIndexingState.OK;
--- a/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java
@@ -124,7 +124,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
                    document.details.title,
                    document.details.description,
                    HtmlFeature.encode(document.details.features),
-                    document.details.standard.name(),
+                    document.details.format.name(),
                    document.details.length,
                    document.details.hashCode,
                    (float) document.details.quality,
--- a/code/processes/converting-process/java/org/apache/pdfbox/text/HeadingAwarePDFTextStripper.java
+++ b/code/processes/converting-process/java/org/apache/pdfbox/text/HeadingAwarePDFTextStripper.java
--- a/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java
+++ b/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java
@@ -6,6 +6,7 @@ import com.google.inject.Injector;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.io.SerializableCrawlDataStream;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.model.crawl.PubDate;
@@ -13,7 +14,6 @@ import nu.marginalia.model.crawl.UrlIndexingState;
 import nu.marginalia.model.crawldata.CrawledDocument;
 import nu.marginalia.model.crawldata.CrawledDomain;
 import nu.marginalia.model.crawldata.SerializableCrawlData;
-import nu.marginalia.model.html.HtmlStandard;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Tag;
 import org.junit.jupiter.api.Test;
@@ -91,7 +91,7 @@ public class ConvertingIntegrationTest {

            assertTrue(details.title.length() > 4);
            assertTrue(details.description.length() > 4);
-            assertEquals(HtmlStandard.HTML5, details.standard);
+            assertEquals(DocumentFormat.HTML5, details.format);

        }
    }
@@ -125,7 +125,7 @@ public class ConvertingIntegrationTest {
            assertTrue(details.metadata.size() > 0);
            assertTrue(details.title.length() > 4);
            assertTrue(details.description.length() > 4);
-            assertEquals(HtmlStandard.HTML5, details.standard);
+            assertEquals(DocumentFormat.HTML5, details.format);
        }
    }

--- a/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
+++ b/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
@@ -8,7 +8,6 @@ import nu.marginalia.converting.model.ProcessedDomain;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.crawl.DomainStateDb;
-import nu.marginalia.crawl.fetcher.Cookies;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
@@ -247,7 +246,7 @@ public class CrawlingThenConvertingIntegrationTest {
    private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws Exception {
        List<SerializableCrawlData> data = new ArrayList<>();

-        try (var recorder = new WarcRecorder(fileName, new Cookies());
+        try (var recorder = new WarcRecorder(fileName);
             var db = new DomainStateDb(dbTempFile))
        {
            new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, db, recorder).crawlDomain();
--- a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/PdfDocumentProcessorPluginTest.java
+++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/PdfDocumentProcessorPluginTest.java
@@ -0,0 +1,95 @@
+package nu.marginalia.converting.processor.plugin;
+
+import nu.marginalia.WmsaHome;
+import nu.marginalia.converting.processor.DocumentClass;
+import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
+import nu.marginalia.converting.processor.logic.TitleExtractor;
+import nu.marginalia.converting.processor.plugin.specialization.DefaultSpecialization;
+import nu.marginalia.converting.processor.summary.SummaryExtractor;
+import nu.marginalia.converting.processor.summary.heuristic.*;
+import nu.marginalia.keyword.DocumentKeywordExtractor;
+import nu.marginalia.keyword.LinkTexts;
+import nu.marginalia.language.filter.LanguageFilter;
+import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
+import nu.marginalia.model.crawldata.CrawledDocument;
+import nu.marginalia.term_frequency_dict.TermFrequencyDict;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Tag;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.net.HttpURLConnection;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.time.Instant;
+
+@Tag("flaky")
+class PdfDocumentProcessorPluginTest {
+    static PdfDocumentProcessorPlugin plugin;
+
+    @BeforeAll
+    static void setUpBeforeClass() throws Exception {
+        var lm = WmsaHome.getLanguageModels();
+        plugin = new PdfDocumentProcessorPlugin(255,
+                new LanguageFilter(lm),
+                new ThreadLocalSentenceExtractorProvider(lm),
+                new DocumentKeywordExtractor(new TermFrequencyDict(lm)),
+                new DocumentLengthLogic(100),
+                new DefaultSpecialization(new SummaryExtractor(
+                        255,
+                        new DomFilterHeuristic(255),
+                        new TagDensityHeuristic(255),
+                        new OpenGraphDescriptionHeuristic(),
+                        new MetaDescriptionHeuristic(),
+                        new FallbackHeuristic()
+                ),
+                        new TitleExtractor(255)
+                        ));
+    }
+    public AbstractDocumentProcessorPlugin.DetailsWithWords testPdfFile(byte[] pdfBytes) throws Exception {
+        var doc = new CrawledDocument("test", "https://www.example.com/sample.pdf", "application/pdf", Instant.now().toString(), 200, "OK", "OK", "", pdfBytes, false, null, null);
+        return plugin.createDetails(doc, new LinkTexts(), DocumentClass.NORMAL);
+    }
+
+    public AbstractDocumentProcessorPlugin.DetailsWithWords testPdfFile(Path file) throws Exception {
+        return testPdfFile(Files.readAllBytes(file));
+    }
+
+    private byte[] downloadPDF(String url) throws IOException, URISyntaxException {
+        HttpURLConnection conn = (HttpURLConnection) new URI(url).toURL().openConnection();
+        try {
+            return conn.getInputStream().readAllBytes();
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        } finally {
+            conn.disconnect();
+        }
+    }
+
+
+    @Disabled
+    @Test
+    void testingTool() throws Exception {
+        System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample.pdf")).details().title);
+        System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample2.pdf")).details().title);
+        System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample3.pdf")).details().title);
+        System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample4.pdf")).details().title);
+        System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample5.pdf")).details().title);
+        System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample6.pdf")).details().title);
+    }
+
+    @Disabled
+    @Test
+    void testingTool2() throws Exception {
+        System.out.println(plugin.convertPdfToHtml(Files.readAllBytes(Path.of("/home/st_work/Work/sample6.pdf"))));
+    }
+
+    @Test
+    void testMarginaliaSample() throws Exception {
+        var doc = plugin.convertPdfToHtml(downloadPDF("https://www.marginalia.nu/junk/test.pdf"));
+        System.out.println(doc.html());
+    }
+}
--- a/code/processes/converting-process/test/nu/marginalia/converting/processor/pubdate/PubDateSnifferTest.java
+++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/pubdate/PubDateSnifferTest.java
@@ -3,8 +3,8 @@ package nu.marginalia.converting.processor.pubdate;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.heuristic.PubDateHeuristicDOMParsingPass2;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.Test;

@@ -74,7 +74,7 @@ class PubDateSnifferTest {
                        <time pubdate="pubdate" datetime="2022-08-24">time</time>
                        Wow, sure lor 'em boss
                        </article>
-                        """), HtmlStandard.UNKNOWN, true);
+                        """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertEquals("2022-08-24", ret.dateIso8601());
@@ -90,7 +90,7 @@ class PubDateSnifferTest {
                        <time>2022-08-24</time>
                        Wow, sure lor 'em boss
                        </article>
-                        """), HtmlStandard.UNKNOWN, true);
+                        """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertEquals("2022-08-24", ret.dateIso8601());
@@ -106,7 +106,7 @@ class PubDateSnifferTest {
                        <time class="published" datetime="July 13, 2006">July 13, 2006</time>
                        Wow, sure lor 'em boss
                        </article>
-                        """), HtmlStandard.UNKNOWN, true);
+                        """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertEquals(2006, ret.year());
@@ -116,14 +116,14 @@ class PubDateSnifferTest {
    public void testProblemCases() throws IOException, URISyntaxException {
        var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
                new EdgeUrl("https://www.example.com/"),
-                Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/The Switch to Linux Begins .html"))), HtmlStandard.HTML5, true);
+                Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/The Switch to Linux Begins .html"))), DocumentFormat.HTML5, true);

        assertFalse(ret.isEmpty());
        assertEquals(2006, ret.year());

        ret = dateSniffer.getPubDate(new DocumentHeaders(""),
                new EdgeUrl("https://www.example.com/"),
-                Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), HtmlStandard.XHTML, true);
+                Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), DocumentFormat.XHTML, true);

        assertFalse(ret.isEmpty());
        assertEquals(2010, ret.year());
@@ -146,7 +146,7 @@ class PubDateSnifferTest {
                        <!doctype html>
                        <html>
                        <meta itemprop="datePublished" content="2022-08-24" />
-                        """), HtmlStandard.UNKNOWN, true);
+                        """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertEquals("2022-08-24", ret.dateIso8601());
@@ -160,7 +160,7 @@ class PubDateSnifferTest {
                        <!doctype html>
                        <html>
                        <meta property="datePublished" content="2022-08-24" />
-                        """), HtmlStandard.UNKNOWN, true);
+                        """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertEquals("2022-08-24", ret.dateIso8601());
@@ -174,7 +174,7 @@ class PubDateSnifferTest {
                        <!doctype html>
                        <html>
                        <script type="application/ld+json">{"@context":"https:\\/\\/schema.org","@type":"Article","name":"In the Year 2525","url":"https:\\/\\/en.wikipedia.org\\/wiki\\/In_the_Year_2525","sameAs":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","mainEntity":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2004-08-24T14:39:14Z","dateModified":"2022-10-20T11:54:37Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/4\\/4a\\/In_the_Year_2525_by_Zager_and_Evans_US_vinyl_Side-A_RCA_release.png","headline":"song written and compsoed by Rick Evans, originally recorded by Zager and Evans and released in 1969"}</script><script type="application/ld+json">{"@context":"https:\\/\\/schema.org","@type":"Article","name":"In the Year 2525","url":"https:\\/\\/en.wikipedia.org\\/wiki\\/In_the_Year_2525","sameAs":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","mainEntity":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2004-08-24T14:39:14Z","dateModified":"2022-10-20T11:54:37Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/4\\/4a\\/In_the_Year_2525_by_Zager_and_Evans_US_vinyl_Side-A_RCA_release.png","headline":"song written and compsoed by Rick Evans, originally recorded by Zager and Evans and released in 1969"}</script>
-                        """), HtmlStandard.UNKNOWN, true);
+                        """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertEquals("2004-08-24", ret.dateIso8601());
@@ -188,7 +188,7 @@ class PubDateSnifferTest {
                        <!doctype html>
                        <html>
                        <script type="application/ld+json" class="aioseop-schema">{"@context":"https://schema.org","@graph":[{"@type":"Organization","@id":"https://socialnomics.net/#organization","url":"https://socialnomics.net/","name":"Socialnomics","sameAs":[]},{"@type":"WebSite","@id":"https://socialnomics.net/#website","url":"https://socialnomics.net/","name":"Socialnomics","publisher":{"@id":"https://socialnomics.net/#organization"}},{"@type":"WebPage","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage","url":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","inLanguage":"en-US","name":"3 Reasons Why You Should Adopt Java-based Technology For Your Business","isPartOf":{"@id":"https://socialnomics.net/#website"},"breadcrumb":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#breadcrumblist"},"datePublished":"2016-12-27T21:01:36-06:00","dateModified":"2016-12-22T21:02:32-06:00"},{"@type":"Article","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#article","isPartOf":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage"},"author":{"@id":"https://socialnomics.net/author/rahis-saifi/#author"},"headline":"3 Reasons Why You Should Adopt Java-based Technology For Your Business","datePublished":"2016-12-27T21:01:36-06:00","dateModified":"2016-12-22T21:02:32-06:00","commentCount":0,"mainEntityOfPage":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage"},"publisher":{"@id":"https://socialnomics.net/#organization"},"articleSection":"Business, business, java, Java Developers, programming 
languages"},{"@type":"Person","@id":"https://socialnomics.net/author/rahis-saifi/#author","name":"Rahis Saifi","sameAs":["https://www.facebook.com/RahisSaifiOfficial","https://www.twitter.com/57rahis"],"image":{"@type":"ImageObject","@id":"https://socialnomics.net/#personlogo","url":"https://secure.gravatar.com/avatar/e67f630f0b8bc87e59e111d5e955961d?s=96&d=mm&r=g","width":96,"height":96,"caption":"Rahis Saifi"}},{"@type":"BreadcrumbList","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#breadcrumblist","itemListElement":[{"@type":"ListItem","position":1,"item":{"@type":"WebPage","@id":"https://socialnomics.net/","url":"https://socialnomics.net/","name":"Socialnomics Blog"}},{"@type":"ListItem","position":2,"item":{"@type":"WebPage","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","url":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","name":"3 Reasons Why You Should Adopt Java-based Technology For Your Business"}}]}]}</script>
-                        """), HtmlStandard.UNKNOWN, true);
+                        """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertEquals("2016-12-27", ret.dateIso8601());
@@ -202,7 +202,7 @@ class PubDateSnifferTest {
                        <!doctype html>
                        <html>
                        <title>No date in the HTML</title>
-                        """), HtmlStandard.UNKNOWN, true);
+                        """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertNull(ret.dateIso8601());
@@ -217,7 +217,7 @@ class PubDateSnifferTest {
                        <!doctype html>
                        <html>
                        <title>No date in the HTML</title>
-                        """), HtmlStandard.UNKNOWN, true);
+                        """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertEquals("2022-02-03", ret.dateIso8601());
@@ -232,7 +232,7 @@ class PubDateSnifferTest {
                        <!doctype html>
                        <html>
                        <p>Published 2003, updated 2022</p>
-                        """), HtmlStandard.HTML5, true);
+                        """), DocumentFormat.HTML5, true);

        assertFalse(ret.isEmpty());
        assertNull(ret.dateIso8601());
@@ -258,7 +258,7 @@ class PubDateSnifferTest {
                        <!doctype html>
                        <html>
                        <div style="float: left;">&nbsp;<b>Post subject:</b> Keyboards.</div><div style="float: right;"><span class="postdetails"><b><img src="./styles/subsilver2/imageset/icon_post_target.gif" width="12" height="9" alt="Post" title="Post" /> <a  href="./viewtopic.php?p=34580&amp;sid=cf0c13dedebb4fea1f03fa73e510cd9f#p34580">#1</a></b></span>&nbsp;<b>Posted:</b> Sun Oct 03, 2010 5:37 pm&nbsp;</div>
-                        """), HtmlStandard.UNKNOWN, true);
+                        """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertNull(ret.dateIso8601());
--- a/code/processes/crawling-process/build.gradle
+++ b/code/processes/crawling-process/build.gradle
@@ -60,10 +60,12 @@ dependencies {
    implementation libs.fastutil

    implementation libs.bundles.mariadb
+    implementation libs.bundles.httpcomponents

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
+    testImplementation libs.wiremock

    testImplementation project(':code:processes:test-data')
 }
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
@@ -43,6 +43,7 @@ import java.nio.file.StandardCopyOption;
 import java.security.Security;
 import java.util.*;
 import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;

@@ -66,6 +67,8 @@ public class CrawlerMain extends ProcessMainClass {

    private final Map<String, CrawlTask> pendingCrawlTasks = new ConcurrentHashMap<>();

+    private final LinkedBlockingQueue<CrawlTask> retryQueue = new LinkedBlockingQueue<>();
+
    private final AtomicInteger tasksDone = new AtomicInteger(0);
    private final HttpFetcherImpl fetcher;

@@ -103,9 +106,18 @@ public class CrawlerMain extends ProcessMainClass {
        this.blacklist = blacklist;
        this.node = processConfiguration.node();

+        SimpleBlockingThreadPool.ThreadType threadType;
+        if (Boolean.getBoolean("crawler.useVirtualThreads")) {
+            threadType = SimpleBlockingThreadPool.ThreadType.VIRTUAL;
+        }
+        else {
+            threadType = SimpleBlockingThreadPool.ThreadType.PLATFORM;
+        }
+
        pool = new SimpleBlockingThreadPool("CrawlerPool",
                Integer.getInteger("crawler.poolSize", 256),
-                1);
+                1,
+                threadType);


        // Wait for the blacklist to be loaded before starting the crawl
@@ -221,10 +233,7 @@ public class CrawlerMain extends ProcessMainClass {

        logger.info("Loaded {} domains", crawlSpecRecords.size());

-        // Shuffle the domains to ensure we get a good mix of domains in each crawl,
-        // so that e.g. the big domains don't get all crawled at once, or we end up
-        // crawling the same server in parallel from different subdomains...
-        Collections.shuffle(crawlSpecRecords);
+        crawlSpecRecords.sort(crawlSpecArrangement(crawlSpecRecords));

        // First a validation run to ensure the file is all good to parse
        if (crawlSpecRecords.isEmpty()) {
@@ -248,44 +257,51 @@ public class CrawlerMain extends ProcessMainClass {
            // List of deferred tasks used to ensure beneficial scheduling of domains with regard to DomainLocks,
            // merely shuffling the domains tends to lead to a lot of threads being blocked waiting for a semphore,
            // this will more aggressively attempt to schedule the jobs to avoid blocking
-            List<CrawlTask> deferredTasks = new LinkedList<>();
+            List<CrawlTask> taskList = new ArrayList<>();

-            // Create crawl tasks and submit them to the pool for execution
+            // Create crawl tasks
            for (CrawlSpecRecord crawlSpec : crawlSpecRecords) {
-                if (workLog.isJobFinished(crawlSpec.domain()))
+                if (workLog.isJobFinished(crawlSpec.domain))
                    continue;

-                // Add to the end of the deferral list
-                deferredTasks.addLast(new CrawlTask(
-                        crawlSpec,
-                        anchorTagsSource,
-                        outputDir,
-                        warcArchiver,
-                        domainStateDb,
-                        workLog));
+                var task = new CrawlTask(crawlSpec, anchorTagsSource, outputDir, warcArchiver, domainStateDb, workLog);

-                // Start every task we currently can from the deferral list
-                deferredTasks.removeIf(task -> {
-                    if (task.canRun()) {
-                        if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) != null) {
-                            return true; // task has already run, duplicate in crawl specs
-                        }
+                // Try to run immediately, to avoid unnecessarily keeping the entire work set in RAM
+                if (!trySubmitDeferredTask(task)) {

-                        // This blocks the caller when the pool is full
-                        pool.submitQuietly(task);
-                        return true;
-                    }
+                    // Drain the retry queue to the taskList, and try to submit any tasks that are in the retry queue
+                    retryQueue.drainTo(taskList);
+                    taskList.removeIf(this::trySubmitDeferredTask);

-                    return false;
-                });
+                    // Then add this new task to the task list, so it is retried on a later pass
+                    taskList.add(task);
+                }
            }

-            // Schedule any lingering tasks for immediate execution
-            for (var task : deferredTasks) {
-                if (pendingCrawlTasks.putIfAbsent(task.domain, task) != null)
-                    continue;
+            // Keep scheduling viable tasks until the task list, retry queue and pool have been seen idle 300 times
+            for (int emptyRuns = 0;emptyRuns < 300;) {
+                boolean hasTasks = !taskList.isEmpty();

-                pool.submitQuietly(task);
+                // The order of these checks is very important to avoid a race condition
+                // where we miss a task that is put into the retry queue
+                boolean hasRunningTasks = pool.getActiveCount() > 0;
+                boolean hasRetryTasks = !retryQueue.isEmpty();
+
+                if (hasTasks || hasRetryTasks || hasRunningTasks) {
+                    retryQueue.drainTo(taskList);
+
+                    // Try to submit any tasks that are in the retry queue (this will block if the pool is full)
+                    taskList.removeIf(this::trySubmitDeferredTask);
+
+                    // Add a small pause here to avoid busy looping toward the end of the execution cycle when
+                    // we might have no new viable tasks to run for hours on end
+                    TimeUnit.MILLISECONDS.sleep(5);
+                } else {
+                    // We have no tasks to run, and no tasks in the retry queue
+                    // but we wait a bit to see if any new tasks come in via the retry queue
+                    emptyRuns++;
+                    TimeUnit.SECONDS.sleep(1);
+                }
            }

            logger.info("Shutting down the pool, waiting for tasks to complete...");
@@ -312,6 +328,52 @@ public class CrawlerMain extends ProcessMainClass {
        }
    }

+    /** Create a comparator that sorts the crawl specs in a way that is beneficial for the crawl,
+     * we want to enqueue domains that have common top domains first, but otherwise have a random
+     * order.
+     * <p></p>
+     * Note, we can't use hash codes for randomization as it is not desirable to have the same order
+     * every time the process is restarted (and CrawlSpecRecord is a record, which defines equals and
+     * hashcode based on the fields).
+     * */
+    private Comparator<CrawlSpecRecord> crawlSpecArrangement(List<CrawlSpecRecord> records) {
+        Random r = new Random();
+        Map<String, Integer> topDomainCounts = new HashMap<>(4 + (int) Math.sqrt(records.size()));
+        Map<String, Integer> randomOrder = new HashMap<>(records.size());
+
+        for (var spec : records) {
+            topDomainCounts.merge(EdgeDomain.getTopDomain(spec.domain), 1, Integer::sum);
+            randomOrder.put(spec.domain, r.nextInt());
+        }
+
+        return Comparator.comparing((CrawlSpecRecord spec) -> topDomainCounts.getOrDefault(EdgeDomain.getTopDomain(spec.domain), 0) >= 8)
+                .reversed()
+                .thenComparing(spec -> randomOrder.get(spec.domain))
+                .thenComparing(Record::hashCode); // tie-breaker for specs that share the same random order value
+    }
+
+    /** Submit a task for execution if it can be run, returns true if it was submitted
+     * or if it can be discarded */
+    private boolean trySubmitDeferredTask(CrawlTask task) {
+        if (!task.canRun()) {
+            return false;
+        }
+
+        if (pendingCrawlTasks.putIfAbsent(task.domain, task) != null) {
+            return true; // task has already run, duplicate in crawl specs
+        }
+
+        try {
+            // This blocks the caller when the pool is full
+            pool.submitQuietly(task);
+            return true;
+        }
+        catch (RuntimeException ex) {
+            logger.error("Failed to submit task " + task.domain, ex);
+            return false;
+        }
+    }
+
    public void runForSingleDomain(String targetDomainName, FileStorageId fileStorageId) throws Exception {
        runForSingleDomain(targetDomainName, fileStorageService.getStorage(fileStorageId).asPath());
    }
@@ -371,72 +433,87 @@ public class CrawlerMain extends ProcessMainClass {
        /** Best effort indicator whether we could start this now without getting stuck in
         * DomainLocks purgatory */
        public boolean canRun() {
-            return domainLocks.canLock(new EdgeDomain(domain));
+            return domainLocks.isLockableHint(new EdgeDomain(domain));
        }

        @Override
        public void run() throws Exception {

-            Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
-            Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
-            Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
-
-            // Move the WARC file to a temp file if it exists, so we can resume the crawl using the old data
-            // while writing to the same file name as before
-            if (Files.exists(newWarcFile)) {
-                Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING);
-            }
-            else {
-                Files.deleteIfExists(tempFile);
+            if (workLog.isJobFinished(domain)) { // No-Op
+                logger.info("Omitting task {}, as it is already run", domain);
+                return;
            }

-            try (var warcRecorder = new WarcRecorder(newWarcFile, fetcher); // write to a temp file for now
-                 var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
-                 CrawlDataReference reference = getReference()
-            )
-            {
-                // Resume the crawl if it was aborted
-                if (Files.exists(tempFile)) {
-                    retriever.syncAbortedRun(tempFile);
-                    Files.delete(tempFile);
-                }
-
-                DomainLinks domainLinks = anchorTagsSource.getAnchorTags(domain);
-
-                int size;
-                try (var lock = domainLocks.lockDomain(new EdgeDomain(domain))) {
-                    size = retriever.crawlDomain(domainLinks, reference);
-                }
-
-                // Delete the reference crawl data if it's not the same as the new one
-                // (mostly a case when migrating from legacy->warc)
-                reference.delete();
-
-                // Convert the WARC file to Parquet
-                SlopCrawlDataRecord
-                        .convertWarc(domain, userAgent, newWarcFile, slopFile);
-
-                // Optionally archive the WARC file if full retention is enabled,
-                // otherwise delete it:
-                warcArchiver.consumeWarc(newWarcFile, domain);
-
-                // Mark the domain as finished in the work log
-                workLog.setJobToFinished(domain, slopFile.toString(), size);
-
-                // Update the progress bar
-                heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks);
-
-                logger.info("Fetched {}", domain);
-            } catch (Exception e) {
-                logger.error("Error fetching domain " + domain, e);
-            }
-            finally {
-                // We don't need to double-count these; it's also kept int he workLog
+            Optional<DomainLocks.DomainLock> lock = domainLocks.tryLockDomain(new EdgeDomain(domain));
+            // We don't have a lock, so we can't run this task
+            // we return to avoid blocking the pool for too long
+            if (lock.isEmpty()) {
                pendingCrawlTasks.remove(domain);
-                Thread.currentThread().setName("[idle]");
+                retryQueue.put(this);
+                return;
+            }
+            DomainLocks.DomainLock domainLock = lock.get();

-                Files.deleteIfExists(newWarcFile);
-                Files.deleteIfExists(tempFile);
+            try (domainLock) {
+                Thread.currentThread().setName("crawling:" + domain);
+
+                Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
+                Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
+                Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
+
+                // Move the WARC file to a temp file if it exists, so we can resume the crawl using the old data
+                // while writing to the same file name as before
+                if (Files.exists(newWarcFile)) {
+                    Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING);
+                }
+                else {
+                    Files.deleteIfExists(tempFile);
+                }
+
+                try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
+                     var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
+                     CrawlDataReference reference = getReference())
+                {
+                    // Resume the crawl if it was aborted
+                    if (Files.exists(tempFile)) {
+                        retriever.syncAbortedRun(tempFile);
+                        Files.delete(tempFile);
+                    }
+
+                    DomainLinks domainLinks = anchorTagsSource.getAnchorTags(domain);
+
+                    int size = retriever.crawlDomain(domainLinks, reference);
+
+                    // Delete the reference crawl data if it's not the same as the new one
+                    // (mostly a case when migrating from legacy->warc)
+                    reference.delete();
+
+                    // Convert the WARC file to Slop
+                    SlopCrawlDataRecord
+                            .convertWarc(domain, userAgent, newWarcFile, slopFile);
+
+                    // Optionally archive the WARC file if full retention is enabled,
+                    // otherwise delete it:
+                    warcArchiver.consumeWarc(newWarcFile, domain);
+
+                    // Mark the domain as finished in the work log
+                    workLog.setJobToFinished(domain, slopFile.toString(), size);
+
+                    // Update the progress bar
+                    heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks);
+
+                    logger.info("Fetched {}", domain);
+                } catch (Exception e) {
+                    logger.error("Error fetching domain " + domain, e);
+                }
+                finally {
+                    // We don't need to double-count these; it's also kept in the workLog
+                    pendingCrawlTasks.remove(domain);
+                    Thread.currentThread().setName("[idle]");
+
+                    Files.deleteIfExists(newWarcFile);
+                    Files.deleteIfExists(tempFile);
+                }
            }
        }

@@ -453,7 +530,7 @@ public class CrawlerMain extends ProcessMainClass {
                    return new CrawlDataReference(slopPath);
                }

-            } catch (IOException e) {
+            } catch (Exception e) {
                logger.debug("Failed to read previous crawl data for {}", specification.domain());
            }

@@ -522,7 +599,7 @@ public class CrawlerMain extends ProcessMainClass {
    //
    // This must be synchronized as chewing through parquet files in parallel leads to enormous memory overhead
    private synchronized Path migrateParquetData(Path inputPath, String domain, Path crawlDataRoot) throws IOException {
-        if (!inputPath.endsWith(".parquet")) {
+        if (!inputPath.toString().endsWith(".parquet")) {
            return inputPath;
        }

--- a/code/processes/crawling-process/java/nu/marginalia/crawl/DomainStateDb.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/DomainStateDb.java
@@ -1,5 +1,8 @@
 package nu.marginalia.crawl;

+import com.google.inject.Inject;
+import nu.marginalia.storage.FileStorageService;
+import nu.marginalia.storage.model.FileStorageType;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -8,6 +11,7 @@ import java.nio.file.Path;
 import java.sql.Connection;
 import java.sql.DriverManager;
 import java.sql.SQLException;
+import java.time.Duration;
 import java.time.Instant;
 import java.util.Objects;
 import java.util.Optional;
@@ -21,6 +25,17 @@ public class DomainStateDb implements AutoCloseable {

    private final Connection connection;

+
+    public record CrawlMeta(
+            String domainName,
+            Instant lastFullCrawl,
+            Duration recrawlTime,
+            Duration crawlTime,
+            int recrawlErrors,
+            int crawlChanges,
+            int totalCrawlSize
+    ) {}
+
    public record SummaryRecord(
            String domainName,
            Instant lastUpdated,
@@ -63,7 +78,29 @@ public class DomainStateDb implements AutoCloseable {

    public record FaviconRecord(String contentType, byte[] imageData) {}

-    public DomainStateDb(Path filename) throws SQLException {
+    @Inject
+    public DomainStateDb(FileStorageService fileStorageService) throws SQLException {
+        this(findFilename(fileStorageService));
+    }
+
+    private static Path findFilename(FileStorageService fileStorageService) throws SQLException {
+        // Resolve domainstate.db inside the only active CRAWL_DATA storage, if there is one
+        var activeStorageId = fileStorageService.getOnlyActiveFileStorage(FileStorageType.CRAWL_DATA);
+        if (!activeStorageId.isPresent()) {
+            // A null path makes the Path-based constructor run without a database connection
+            return null;
+        }
+
+        var storage = fileStorageService.getStorage(activeStorageId.get());
+        return storage.asPath().resolve("domainstate.db");
+    }
+
+    public DomainStateDb(@Nullable Path filename) throws SQLException {
+        if (null == filename) {
+            connection = null;
+            return;
+        }
+
        String sqliteDbString = "jdbc:sqlite:" + filename.toString();
        connection = DriverManager.getConnection(sqliteDbString);

@@ -77,6 +114,17 @@ public class DomainStateDb implements AutoCloseable {
                        feedUrl TEXT
                    )
                    """);
+            stmt.executeUpdate("""
+                    CREATE TABLE IF NOT EXISTS crawl_meta (
+                        domain TEXT PRIMARY KEY,
+                        lastFullCrawlEpochMs LONG NOT NULL,
+                        recrawlTimeMs LONG NOT NULL,
+                        recrawlErrors INTEGER NOT NULL,
+                        crawlTimeMs LONG NOT NULL,
+                        crawlChanges INTEGER NOT NULL,
+                        totalCrawlSize INTEGER NOT NULL
+                    )
+                    """);
            stmt.executeUpdate("""
                    CREATE TABLE IF NOT EXISTS favicon (
                        domain TEXT PRIMARY KEY,
@@ -90,11 +138,18 @@ public class DomainStateDb implements AutoCloseable {

    @Override
    public void close() throws SQLException {
-        connection.close();
+        if (connection != null) {
+            connection.close();
+        }
    }

+    public boolean isAvailable() {
+        return connection != null;
+    }

    public void saveIcon(String domain, FaviconRecord faviconRecord) {
+        if (connection == null) throw new IllegalStateException("No connection to domainstate db");
+
        try (var stmt = connection.prepareStatement("""
                INSERT OR REPLACE INTO favicon (domain, contentType, icon)
                       VALUES(?, ?, ?)
@@ -110,6 +165,9 @@ public class DomainStateDb implements AutoCloseable {
    }

    public Optional<FaviconRecord> getIcon(String domain) {
+        if (connection == null)
+            return Optional.empty();
+
        try (var stmt = connection.prepareStatement("SELECT contentType, icon FROM favicon WHERE DOMAIN = ?")) {
            stmt.setString(1, domain);
            var rs = stmt.executeQuery();
@@ -129,7 +187,29 @@ public class DomainStateDb implements AutoCloseable {
        return Optional.empty();
    }

+    public void save(CrawlMeta crawlMeta) {
+        if (connection == null) throw new IllegalStateException("No connection to domainstate db");
+
+        try (var stmt = connection.prepareStatement("""
+                INSERT OR REPLACE INTO crawl_meta (domain, lastFullCrawlEpochMs, recrawlTimeMs, recrawlErrors, crawlTimeMs, crawlChanges, totalCrawlSize)
+                VALUES (?, ?, ?, ?, ?, ?, ?)
+                """)) {
+            stmt.setString(1, crawlMeta.domainName());
+            stmt.setLong(2, crawlMeta.lastFullCrawl.toEpochMilli());
+            stmt.setLong(3, crawlMeta.recrawlTime.toMillis());
+            stmt.setInt(4, crawlMeta.recrawlErrors);
+            stmt.setLong(5, crawlMeta.crawlTime.toMillis());
+            stmt.setInt(6, crawlMeta.crawlChanges);
+            stmt.setInt(7, crawlMeta.totalCrawlSize);
+            stmt.executeUpdate();
+        } catch (SQLException e) {
+            logger.error("Failed to insert crawl meta record", e);
+        }
+    }
+
    public void save(SummaryRecord record) {
+        if (connection == null) throw new IllegalStateException("No connection to domainstate db");
+
        try (var stmt = connection.prepareStatement("""
                INSERT OR REPLACE INTO summary (domain, lastUpdatedEpochMs, state, stateDesc, feedUrl)
                VALUES (?, ?, ?, ?, ?)
@@ -145,7 +225,38 @@ public class DomainStateDb implements AutoCloseable {
        }
    }

-    public Optional<SummaryRecord> get(String domainName) {
+    public Optional<CrawlMeta> getMeta(String domainName) {
+        if (connection == null)
+            return Optional.empty();
+
+        try (var stmt = connection.prepareStatement("""
+                SELECT domain, lastFullCrawlEpochMs, recrawlTimeMs, recrawlErrors, crawlTimeMs, crawlChanges, totalCrawlSize
+                FROM crawl_meta
+                WHERE domain = ?
+                """)) {
+            stmt.setString(1, domainName);
+            var rs = stmt.executeQuery();
+            if (rs.next()) {
+                return Optional.of(new CrawlMeta(
+                        rs.getString("domain"),
+                        Instant.ofEpochMilli(rs.getLong("lastFullCrawlEpochMs")),
+                        Duration.ofMillis(rs.getLong("recrawlTimeMs")),
+                        Duration.ofMillis(rs.getLong("crawlTimeMs")),
+                        rs.getInt("recrawlErrors"),
+                        rs.getInt("crawlChanges"),
+                        rs.getInt("totalCrawlSize")
+                ));
+            }
+        } catch (SQLException ex) {
+            logger.error("Failed to get crawl meta record", ex);
+        }
+        return Optional.empty();
+    }
+
+    public Optional<SummaryRecord> getSummary(String domainName) {
+        if (connection == null)
+            return Optional.empty();
+
        try (var stmt = connection.prepareStatement("""
                SELECT domain, lastUpdatedEpochMs, state, stateDesc, feedUrl
                FROM summary
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/ContentTags.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/ContentTags.java
@@ -1,6 +1,6 @@
 package nu.marginalia.crawl.fetcher;

-import java.net.http.HttpRequest;
+import org.apache.hc.client5.http.classic.methods.HttpGet;

 /** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */
 public record ContentTags(String etag, String lastMod) {
@@ -17,14 +17,16 @@ public record ContentTags(String etag, String lastMod) {
    }

    /** Paints the tags onto the request builder. */
-    public void paint(HttpRequest.Builder getBuilder) {
+    public void paint(HttpGet request) {
+
+        // Paint the ETag header if present,
+        // otherwise paint the Last-Modified header
+        // (but not both at the same time due to some servers not liking it)

        if (etag != null) {
-            getBuilder.header("If-None-Match", etag);
-        }
-
-        if (lastMod != null) {
-            getBuilder.header("If-Modified-Since", lastMod);
+            request.addHeader("If-None-Match", etag);
+        } else if (lastMod != null) {
+            request.addHeader("If-Modified-Since", lastMod);
        }
    }
 }
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/Cookies.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/Cookies.java
@@ -1,34 +0,0 @@
-package nu.marginalia.crawl.fetcher;
-
-import java.io.IOException;
-import java.net.CookieHandler;
-import java.net.URI;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.ConcurrentHashMap;
-
-public class Cookies extends CookieHandler {
-    final ThreadLocal<ConcurrentHashMap<String, List<String>>> cookieJar = ThreadLocal.withInitial(ConcurrentHashMap::new);
-
-    public void clear() {
-        cookieJar.get().clear();
-    }
-
-    public boolean hasCookies() {
-        return !cookieJar.get().isEmpty();
-    }
-
-    public List<String> getCookies() {
-        return cookieJar.get().values().stream().flatMap(List::stream).toList();
-    }
-
-    @Override
-    public Map<String, List<String>> get(URI uri, Map<String, List<String>> requestHeaders) throws IOException {
-        return cookieJar.get();
-    }
-
-    @Override
-    public void put(URI uri, Map<String, List<String>> responseHeaders) throws IOException {
-        cookieJar.get().putAll(responseHeaders);
-    }
-}
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/DomainCookies.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/DomainCookies.java
@@ -0,0 +1,56 @@
+package nu.marginalia.crawl.fetcher;
+
+import org.apache.hc.client5.http.classic.methods.HttpUriRequestBase;
+import org.apache.hc.core5.http.ClassicHttpRequest;
+import org.apache.hc.core5.http.HttpResponse;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.StringJoiner;
+
+/** Minimal cookie jar: stores name/value pairs seen in Set-Cookie headers and replays them as a Cookie header. */
+public class DomainCookies {
+    private final Map<String, String> cookies = new HashMap<>();
+
+    public boolean hasCookies() {
+        return !cookies.isEmpty();
+    }
+
+    /** Stores the cookie carried by each Set-Cookie header of the response. */
+    public void updateCookieStore(HttpResponse response) {
+        for (var header : response.getHeaders()) {
+            if (header.getName().equalsIgnoreCase("Set-Cookie")) {
+                parseCookieHeader(header.getValue());
+            }
+        }
+    }
+
+    /** Extracts the leading name=value pair of a Set-Cookie value; attributes after the first ';' are ignored. */
+    private void parseCookieHeader(String value) {
+        String cookie = value.split(";", 2)[0].trim();
+        // Split on the first '=' only: "name=" (an emptied cookie) must yield an empty value rather
+        // than a one-element array, and '=' inside the value (e.g. base64 padding) must be preserved.
+        String[] cookieParts = cookie.split("=", 2);
+        if (cookieParts.length == 2) {
+            cookies.put(cookieParts[0].trim(), cookieParts[1].trim());
+        }
+    }
+
+    /** Adds a Cookie header with all stored cookies; the header is added even when none are stored. */
+    public void paintRequest(HttpUriRequestBase request) {
+        request.addHeader("Cookie", createCookieHeader());
+    }
+
+    public void paintRequest(ClassicHttpRequest request) {
+        request.addHeader("Cookie", createCookieHeader());
+    }
+
+    /** Joins the stored cookies as "name=value" pairs separated by "; ". */
+    private String createCookieHeader() {
+        StringJoiner sj = new StringJoiner("; ");
+        for (var cookie : cookies.entrySet()) {
+            sj.add(cookie.getKey() + "=" + cookie.getValue());
+        }
+        return sj.toString();
+    }
+}
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcher.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcher.java
@@ -8,6 +8,7 @@ import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;
 import nu.marginalia.model.crawldata.CrawlerDomainStatus;
+import org.apache.hc.client5.http.cookie.CookieStore;

 import java.util.List;

@@ -15,20 +16,17 @@ import java.util.List;
 public interface HttpFetcher extends AutoCloseable {
    void setAllowAllContentTypes(boolean allowAllContentTypes);

-    Cookies getCookies();
+    CookieStore getCookies();
    void clearCookies();

    DomainProbeResult probeDomain(EdgeUrl url);

-    ContentTypeProbeResult probeContentType(
-                                EdgeUrl url,
-                                WarcRecorder recorder,
-                                ContentTags tags) throws HttpFetcherImpl.RateLimitException;
-
    HttpFetchResult fetchContent(EdgeUrl url,
                                 WarcRecorder recorder,
+                                 DomainCookies cookies,
+                                 CrawlDelayTimer timer,
                                 ContentTags tags,
-                                 ProbeType probeType) throws Exception;
+                                 ProbeType probeType);

    List<EdgeUrl> fetchSitemapUrls(String rootSitemapUrl, CrawlDelayTimer delayTimer);

@@ -46,6 +44,7 @@ public interface HttpFetcher extends AutoCloseable {

        /** This domain redirects to another domain */
        record Redirect(EdgeDomain domain) implements DomainProbeResult {}
+        record RedirectSameDomain_Internal(EdgeUrl domain) implements DomainProbeResult {}

        /** If the retrieval of the probed url was successful, return the url as it was fetched
         * (which may be different from the url we probed, if we attempted another URL schema).
@@ -56,7 +55,10 @@ public interface HttpFetcher extends AutoCloseable {
    }

    sealed interface ContentTypeProbeResult {
+        record NoOp() implements ContentTypeProbeResult {}
        record Ok(EdgeUrl resolvedUrl) implements ContentTypeProbeResult { }
+        record HttpError(int statusCode, String message) implements ContentTypeProbeResult { }
+        record Redirect(EdgeUrl location) implements ContentTypeProbeResult { }
        record BadContentType(String contentType, int statusCode) implements ContentTypeProbeResult { }
        record Timeout(java.lang.Exception ex) implements ContentTypeProbeResult { }
        record Exception(java.lang.Exception ex) implements ContentTypeProbeResult { }
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java
@@ -5,67 +5,169 @@ import com.google.inject.Singleton;
 import crawlercommons.robots.SimpleRobotRules;
 import crawlercommons.robots.SimpleRobotRulesParser;
 import nu.marginalia.UserAgent;
-import nu.marginalia.crawl.fetcher.socket.NoSecuritySSL;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.CrawlDelayTimer;
+import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.ContentTypeLogic;
 import nu.marginalia.model.body.DocumentBodyExtractor;
 import nu.marginalia.model.body.HttpFetchResult;
 import nu.marginalia.model.crawldata.CrawlerDomainStatus;
+import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
+import org.apache.hc.client5.http.HttpRequestRetryStrategy;
+import org.apache.hc.client5.http.classic.HttpClient;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
+import org.apache.hc.client5.http.config.ConnectionConfig;
+import org.apache.hc.client5.http.config.RequestConfig;
+import org.apache.hc.client5.http.cookie.BasicCookieStore;
+import org.apache.hc.client5.http.cookie.CookieStore;
+import org.apache.hc.client5.http.cookie.StandardCookieSpec;
+import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
+import org.apache.hc.client5.http.impl.classic.HttpClients;
+import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager;
+import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManagerBuilder;
+import org.apache.hc.client5.http.ssl.DefaultClientTlsStrategy;
+import org.apache.hc.core5.http.*;
+import org.apache.hc.core5.http.io.HttpClientResponseHandler;
+import org.apache.hc.core5.http.io.SocketConfig;
+import org.apache.hc.core5.http.io.entity.EntityUtils;
+import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
+import org.apache.hc.core5.http.message.MessageSupport;
+import org.apache.hc.core5.http.protocol.HttpContext;
+import org.apache.hc.core5.pool.PoolStats;
+import org.apache.hc.core5.util.TimeValue;
+import org.apache.hc.core5.util.Timeout;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.parser.Parser;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import org.slf4j.Marker;
+import org.slf4j.MarkerFactory;

+import javax.net.ssl.SSLContext;
+import javax.net.ssl.SSLException;
 import java.io.IOException;
-import java.io.InputStream;
+import java.net.SocketTimeoutException;
 import java.net.URISyntaxException;
-import java.net.http.HttpClient;
-import java.net.http.HttpRequest;
-import java.net.http.HttpResponse;
-import java.net.http.HttpTimeoutException;
+import java.net.UnknownHostException;
+import java.security.NoSuchAlgorithmException;
 import java.time.Duration;
+import java.time.Instant;
 import java.util.*;
-import java.util.concurrent.Executors;
-import java.util.zip.GZIPInputStream;
+import java.util.concurrent.Semaphore;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;


@Singleton
-public class HttpFetcherImpl implements HttpFetcher {
+public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {

    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final String userAgentString;
    private final String userAgentIdentifier;
-    private final Cookies cookies = new Cookies();
+
+    private final CookieStore cookies = new BasicCookieStore();

    private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
    private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
+    private final Marker crawlerAuditMarker = MarkerFactory.getMarker("CRAWLER");

-    private final Duration requestTimeout = Duration.ofSeconds(10);
-    private final Duration probeTimeout = Duration.ofSeconds(30);
-
+    private final LinkParser linkParser = new LinkParser();
    @Override
    public void setAllowAllContentTypes(boolean allowAllContentTypes) {
        contentTypeLogic.setAllowAllContentTypes(allowAllContentTypes);
    }

-    private final HttpClient client;
+    private final CloseableHttpClient client;
+    private PoolingHttpClientConnectionManager connectionManager;

-    private HttpClient createClient() {
-        return HttpClient.newBuilder()
-                .sslContext(NoSecuritySSL.buildSslContext())
-                .cookieHandler(cookies)
-                .followRedirects(HttpClient.Redirect.NORMAL)
-                .connectTimeout(Duration.ofSeconds(8))
-                .executor(Executors.newCachedThreadPool())
+    public PoolStats getPoolStats() {
+        return connectionManager.getTotalStats();
+    }
+
+    private CloseableHttpClient createClient() throws NoSuchAlgorithmException {
+        final ConnectionConfig connectionConfig = ConnectionConfig.custom()
+                .setSocketTimeout(10, TimeUnit.SECONDS)
+                .setConnectTimeout(30, TimeUnit.SECONDS)
+                .setValidateAfterInactivity(TimeValue.ofSeconds(5))
+                .build();
+
+        connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
+                .setMaxConnPerRoute(2)
+                .setMaxConnTotal(5000)
+                .setDefaultConnectionConfig(connectionConfig)
+                .setTlsSocketStrategy(new DefaultClientTlsStrategy(SSLContext.getDefault()))
+                .build();
+
+        connectionManager.setDefaultSocketConfig(SocketConfig.custom()
+                .setSoLinger(TimeValue.ofSeconds(-1))
+                .setSoTimeout(Timeout.ofSeconds(10))
+                .build()
+        );
+
+        Thread.ofPlatform().daemon(true).start(() -> {
+            try {
+                for (;;) {
+                    TimeUnit.SECONDS.sleep(15);
+                    logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
+                }
+            }
+            catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+            }
+        });
+
+        final RequestConfig defaultRequestConfig = RequestConfig.custom()
+                .setCookieSpec(StandardCookieSpec.RELAXED)
+                .setResponseTimeout(10, TimeUnit.SECONDS)
+                .setConnectionRequestTimeout(5, TimeUnit.MINUTES)
+                .build();
+
+        return HttpClients.custom()
+                .setDefaultCookieStore(cookies)
+                .setConnectionManager(connectionManager)
+                .setRetryStrategy(this)
+                .setKeepAliveStrategy(new ConnectionKeepAliveStrategy() {
+                    // Default keep-alive duration is 3 minutes, but this is too long for us,
+                    // as we are either going to re-use it fairly quickly or close it for a long time.
+                    //
+                    // So we set it to 30 seconds or clamp the server-provided value to a minimum of 10 seconds.
+                    private static final TimeValue defaultValue = TimeValue.ofSeconds(30);
+
+                    @Override
+                    public TimeValue getKeepAliveDuration(HttpResponse response, HttpContext context) {
+                        final Iterator<HeaderElement> it = MessageSupport.iterate(response, HeaderElements.KEEP_ALIVE);
+
+                        while (it.hasNext()) {
+                            final HeaderElement he = it.next();
+                            final String param = he.getName();
+                            final String value = he.getValue();
+
+                            if (value == null)
+                                continue;
+                            if (!"timeout".equalsIgnoreCase(param))
+                                continue;
+                            // Use the server-provided timeout, clamped to [10 s, defaultValue]
+                            try {
+                                long timeout = Long.parseLong(value);
+                                timeout = Math.clamp(timeout, 10, defaultValue.toSeconds());
+                                return TimeValue.ofSeconds(timeout);
+                            } catch (final NumberFormatException ignore) {
+                                break;
+                            }
+                        }
+                        return defaultValue;
+                    }
+                })
+                .disableRedirectHandling()
+                .setDefaultRequestConfig(defaultRequestConfig)
                .build();
    }

    @Override
-    public Cookies getCookies() {
+    public CookieStore getCookies() {
        return cookies;
    }

@@ -77,19 +179,27 @@ public class HttpFetcherImpl implements HttpFetcher {
    @Inject
    public HttpFetcherImpl(UserAgent userAgent)
    {
-        this.client = createClient();
+        try {
+            this.client = createClient();
+        } catch (NoSuchAlgorithmException e) {
+            throw new RuntimeException(e);
+        }
        this.userAgentString = userAgent.uaString();
        this.userAgentIdentifier = userAgent.uaIdentifier();
    }

    public HttpFetcherImpl(String userAgent) {
-        this.client = createClient();
+        try {
+            this.client = createClient();
+        } catch (NoSuchAlgorithmException e) {
+            throw new RuntimeException(e);
+        }
        this.userAgentString = userAgent;
        this.userAgentIdentifier = userAgent;
    }

    // Not necessary in prod, but useful in test
-    public void close() {
+    public void close() throws IOException {
        client.close();
    }

@@ -102,34 +212,94 @@ public class HttpFetcherImpl implements HttpFetcher {
     */
    @Override
    public DomainProbeResult probeDomain(EdgeUrl url) {
-        HttpRequest head;
-        try {
-            head = HttpRequest.newBuilder()
-                    .HEAD()
-                    .uri(url.asURI())
-                    .header("User-agent", userAgentString)
-                    .timeout(probeTimeout)
-                    .build();
-        } catch (URISyntaxException e) {
-            return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
-        }
+        List<EdgeUrl> urls = new ArrayList<>();
+        urls.add(url);

-        for (int tries = 0;; tries++) {
+        int redirects = 0;
+        AtomicBoolean tryGet = new AtomicBoolean(false);
+
+        while (!urls.isEmpty() && ++redirects < 5) {
+            ClassicHttpRequest request;
+
+            EdgeUrl topUrl = urls.removeFirst();
            try {
-                var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
-                EdgeUrl rspUri = new EdgeUrl(rsp.uri());
-
-                if (!Objects.equals(rspUri.domain, url.domain)) {
-                    return new DomainProbeResult.Redirect(rspUri.domain);
+                if (tryGet.get()) {
+                    request = ClassicRequestBuilder.get(topUrl.asURI())
+                                .addHeader("User-Agent", userAgentString)
+                                .addHeader("Accept-Encoding", "gzip")
+                                .addHeader("Range", "bytes=0-255")
+                                .build();
+                } else {
+                    request = ClassicRequestBuilder.head(topUrl.asURI())
+                                .addHeader("User-Agent", userAgentString)
+                                .addHeader("Accept-Encoding", "gzip")
+                                .build();
                }
-                return new DomainProbeResult.Ok(rspUri);
-            } catch (Exception ex) {
-                if (tries > 3) {
-                    return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
-                }
-                // else try again ...
+            } catch (URISyntaxException e) {
+                return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
            }
+
+            try {
+                var result = SendLock.wrapSend(client, request, response -> {
+                    EntityUtils.consume(response.getEntity());
+                    // Report topUrl (the URL actually requested), so same-domain redirects are reflected in the result
+                    return switch (response.getCode()) {
+                        case 200 -> new DomainProbeResult.Ok(topUrl);
+                        case 405 -> {
+                            if (!tryGet.get()) {
+                                tryGet.set(true);
+                                yield new DomainProbeResult.RedirectSameDomain_Internal(topUrl);
+                            }
+                            else {
+                                yield new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "HTTP status 405, tried HEAD and GET?!");
+                            }
+                        }
+                        case 301, 302, 307 -> {
+                            var location = response.getFirstHeader("Location");
+
+                            if (location != null) {
+                                Optional<EdgeUrl> newUrl = linkParser.parseLink(topUrl, location.getValue());
+                                if (newUrl.isEmpty()) {
+                                    yield new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid location header on redirect");
+                                }
+                                EdgeUrl newEdgeUrl = newUrl.get();
+                                if (newEdgeUrl.domain.equals(topUrl.domain)) {
+                                    yield new DomainProbeResult.RedirectSameDomain_Internal(newEdgeUrl);
+                                }
+                                else {
+                                    yield new DomainProbeResult.Redirect(newEdgeUrl.domain);
+                                }
+                            }
+
+                            yield new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "No location header on redirect");
+
+                        }
+                        default ->
+                                new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "HTTP status " + response.getCode());
+                    };
+                });
+
+                if (result instanceof DomainProbeResult.RedirectSameDomain_Internal(EdgeUrl redirUrl)) {
+                    urls.add(redirUrl);
+                }
+                else {
+                    return result;
+                }
+
+                // We don't have robots.txt yet, so we'll assume a request delay of 1 second
+                TimeUnit.SECONDS.sleep(1);
+            }
+            catch (SocketTimeoutException ex) {
+                return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Timeout during domain probe");
+            }
+            catch (Exception ex) {
+                if (ex instanceof InterruptedException) Thread.currentThread().interrupt(); // keep the interrupt visible to callers
+                return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Error during domain probe");
+            }
        }
+
+        return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Failed to resolve domain root");
+
    }

    /** Perform a HEAD request to fetch the content type of a URL.
@@ -140,70 +310,73 @@ public class HttpFetcherImpl implements HttpFetcher {
     * recorded in the WARC file on failure.
     */
    public ContentTypeProbeResult probeContentType(EdgeUrl url,
-                                                   WarcRecorder warcRecorder,
-                                                   ContentTags tags) throws RateLimitException {
-        if (tags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) {
-
-            try {
-                var headBuilder = HttpRequest.newBuilder()
-                    .HEAD()
-                    .uri(url.asURI())
-                    .header("User-Agent", userAgentString)
-                    .header("Accept-Encoding", "gzip")
-                    .timeout(requestTimeout)
-                    ;
-
-                var rsp = client.send(headBuilder.build(), HttpResponse.BodyHandlers.discarding());
-                var headers = rsp.headers();
-
-                var contentTypeHeader = headers.firstValue("Content-Type").orElse(null);
-
-                if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
-                    warcRecorder.flagAsFailedContentTypeProbe(url, contentTypeHeader, rsp.statusCode());
-
-                    return new ContentTypeProbeResult.BadContentType(contentTypeHeader, rsp.statusCode());
-                }
-
-                // Update the URL to the final URL of the HEAD request, otherwise we might end up doing
-
-                // HEAD 301 url1 -> url2
-                // HEAD 200 url2
-                // GET 301 url1 -> url2
-                // GET 200 url2
-
-                // which is not what we want. Overall we want to do as few requests as possible to not raise
-                // too many eyebrows when looking at the logs on the target server.  Overall it's probably desirable
-                // that it looks like the traffic makes sense, as opposed to looking like a broken bot.
-
-                var redirectUrl = new EdgeUrl(rsp.uri());
-                EdgeUrl ret;
-
-                if (Objects.equals(redirectUrl.domain, url.domain)) ret = redirectUrl;
-                else ret = url;
-
-                // Intercept rate limiting
-                if (rsp.statusCode() == 429) {
-                    throw new HttpFetcherImpl.RateLimitException(headers.firstValue("Retry-After").orElse("1"));
-                }
-
-                return new ContentTypeProbeResult.Ok(ret);
-            }
-            catch (HttpTimeoutException ex) {
-                warcRecorder.flagAsTimeout(url);
-                return new ContentTypeProbeResult.Timeout(ex);
-            }
-            catch (RateLimitException ex) {
-                throw ex;
-            }
-            catch (Exception ex) {
-                logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
-
-                warcRecorder.flagAsError(url, ex);
-
-                return new ContentTypeProbeResult.Exception(ex);
-            }
+                                                   DomainCookies cookies,
+                                                   CrawlDelayTimer timer,
+                                                   ContentTags tags) {
+        if (!tags.isEmpty() || !contentTypeLogic.isUrlLikeBinary(url)) {
+            return new ContentTypeProbeResult.NoOp();
+        }
+
+        try {
+            ClassicHttpRequest head = ClassicRequestBuilder.head(url.asURI())
+                    .addHeader("User-Agent", userAgentString)
+                    .addHeader("Accept-Encoding", "gzip")
+                    .build();
+
+            cookies.paintRequest(head);
+
+            return SendLock.wrapSend(client, head, (rsp) -> {
+                cookies.updateCookieStore(rsp);
+                EntityUtils.consume(rsp.getEntity());
+                int statusCode = rsp.getCode();
+
+                // Handle redirects
+                if (statusCode == 301 || statusCode == 302 || statusCode == 307) {
+                    var location = rsp.getFirstHeader("Location");
+                    if (location != null) {
+                        Optional<EdgeUrl> newUrl = linkParser.parseLink(url, location.getValue());
+                        if (newUrl.isEmpty())
+                            return new ContentTypeProbeResult.HttpError(statusCode, "Invalid location header on redirect");
+                        return new ContentTypeProbeResult.Redirect(newUrl.get());
+                    }
+                }
+
+                if (statusCode == 405) {
+                    // If we get a 405, we can't probe the content type with HEAD, so we'll just say it's ok
+                    return new ContentTypeProbeResult.Ok(url);
+                }
+
+                // Handle errors
+                if (statusCode < 200 || statusCode > 300) {
+                    return new ContentTypeProbeResult.HttpError(statusCode, "Bad status code");
+                }
+
+                // Handle missing content type
+                var ctHeader = rsp.getFirstHeader("Content-Type");
+                if (ctHeader == null) {
+                    return new ContentTypeProbeResult.HttpError(statusCode, "Missing Content-Type header");
+                }
+                var contentType = ctHeader.getValue();
+
+                // Check if the content type is allowed
+                if (contentTypeLogic.isAllowableContentType(contentType)) {
+                    return new ContentTypeProbeResult.Ok(url);
+                } else {
+                    return new ContentTypeProbeResult.BadContentType(contentType, statusCode);
+                }
+            });
+        }
+        catch (SocketTimeoutException ex) {
+
+            return new ContentTypeProbeResult.Timeout(ex);
+        }
+        catch (Exception ex) {
+            logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
+            return new ContentTypeProbeResult.Exception(ex);
+        }
+        finally {
+            timer.waitFetchDelay();
        }
-        return new ContentTypeProbeResult.Ok(url);
    }

    /** Fetch the content of a URL, and record it in a WARC file,
@@ -213,37 +386,87 @@ public class HttpFetcherImpl implements HttpFetcher {
    @Override
    public HttpFetchResult fetchContent(EdgeUrl url,
                                           WarcRecorder warcRecorder,
+                                           DomainCookies cookies,
+                                           CrawlDelayTimer timer,
                                           ContentTags contentTags,
                                           ProbeType probeType)
-        throws Exception
    {
-        var getBuilder = HttpRequest.newBuilder()
-                .GET()
-                .uri(url.asURI())
-                .header("User-Agent", userAgentString)
-                .header("Accept-Encoding", "gzip")
-                .header("Accept-Language", "en,*;q=0.5")
-                .header("Accept", "text/html, application/xhtml+xml, text/*;q=0.8")
-                .timeout(requestTimeout)
-                ;
+        try {
+            if (probeType == HttpFetcher.ProbeType.FULL) {
+                try {
+                    var probeResult = probeContentType(url, cookies, timer, contentTags);

-        contentTags.paint(getBuilder);
+                    switch (probeResult) {
+                        case HttpFetcher.ContentTypeProbeResult.NoOp():
+                            break; //
+                        case HttpFetcher.ContentTypeProbeResult.Ok(EdgeUrl resolvedUrl):
+                            logger.info(crawlerAuditMarker, "Probe result OK for {}", url);
+                            url = resolvedUrl; // If we were redirected while probing, use the final URL for fetching
+                            break;
+                        case ContentTypeProbeResult.BadContentType badContentType:
+                            warcRecorder.flagAsFailedContentTypeProbe(url, badContentType.contentType(), badContentType.statusCode());
+                            logger.info(crawlerAuditMarker, "Probe result Bad ContenType ({}) for {}", badContentType.contentType(), url);
+                            return new HttpFetchResult.ResultNone();
+                        case ContentTypeProbeResult.BadContentType.Timeout(Exception ex):
+                            logger.info(crawlerAuditMarker, "Probe result Timeout for {}", url);
+                            warcRecorder.flagAsTimeout(url);
+                            return new HttpFetchResult.ResultException(ex);
+                        case ContentTypeProbeResult.Exception(Exception ex):
+                            logger.info(crawlerAuditMarker, "Probe result Exception({}) for {}", ex.getClass().getSimpleName(), url);
+                            warcRecorder.flagAsError(url, ex);
+                            return new HttpFetchResult.ResultException(ex);
+                        case ContentTypeProbeResult.HttpError httpError:
+                            logger.info(crawlerAuditMarker, "Probe result HTTP Error ({}) for {}", httpError.statusCode(), url);
+                            return new HttpFetchResult.ResultException(new HttpException("HTTP status code " + httpError.statusCode() + ": " + httpError.message()));
+                        case ContentTypeProbeResult.Redirect redirect:
+                            logger.info(crawlerAuditMarker, "Probe result redirect for {} -> {}", url, redirect.location());
+                            return new HttpFetchResult.ResultRedirect(redirect.location());
+                    }
+                } catch (Exception ex) {
+                    logger.warn("Failed to fetch {}", url, ex);
+                    return new HttpFetchResult.ResultException(ex);
+                }

-        HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build());
-
-        if (result instanceof HttpFetchResult.ResultOk ok) {
-            if (ok.statusCode() == 429) {
-                throw new RateLimitException(Objects.requireNonNullElse(ok.header("Retry-After"), "1"));
            }
-            if (ok.statusCode() == 304) {
-                return new HttpFetchResult.Result304Raw();
-            }
-            if (ok.statusCode() == 200) {
-                return ok;
+
+            HttpGet request = new HttpGet(url.asURI());
+            request.addHeader("User-Agent", userAgentString);
+            request.addHeader("Accept-Encoding", "gzip");
+            request.addHeader("Accept-Language", "en,*;q=0.5");
+            request.addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8");
+
+            contentTags.paint(request);
+
+            try (var sl = new SendLock()) {
+                Instant start = Instant.now();
+                HttpFetchResult result = warcRecorder.fetch(client, cookies, request);
+
+                Duration fetchDuration = Duration.between(start, Instant.now());
+
+                if (result instanceof HttpFetchResult.ResultOk ok) {
+                    if (ok.statusCode() == 304) {
+                        result = new HttpFetchResult.Result304Raw();
+                    }
+                }
+
+                switch (result) {
+                    case HttpFetchResult.ResultOk ok -> logger.info(crawlerAuditMarker, "Fetch result OK {} for {} ({} ms)", ok.statusCode(), url, fetchDuration.toMillis());
+                    case HttpFetchResult.ResultRedirect redirect -> logger.info(crawlerAuditMarker, "Fetch result redirect: {}  for {}", redirect.url(), url);
+                    case HttpFetchResult.ResultNone none -> logger.info(crawlerAuditMarker, "Fetch result none for {}", url);
+                    case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception for {}", url, ex.ex());
+                    case HttpFetchResult.Result304Raw raw -> logger.info(crawlerAuditMarker, "Fetch result: 304 Raw for {}", url);
+                    case HttpFetchResult.Result304ReplacedWithReference ref -> logger.info(crawlerAuditMarker, "Fetch result: 304 With reference for {}", url);
+                }
+
+                return result;
            }
        }
+        catch (Exception ex) {
+            logger.error(crawlerAuditMarker, "Fetch result exception for {}", url, ex);
+
+            return new HttpFetchResult.ResultException(ex);
+        }

-        return result;
    }

    @Override
@@ -307,62 +530,66 @@ public class HttpFetcherImpl implements HttpFetcher {
    }


-    private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
-        HttpRequest getRequest = HttpRequest.newBuilder()
-                .GET()
-                .uri(sitemapUrl.asURI())
-                .header("Accept-Encoding", "gzip")
-                .header("Accept", "text/*, */*;q=0.9")
-                .header("User-Agent", userAgentString)
-                .timeout(requestTimeout)
-                .build();
+    private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException {
+        HttpGet getRequest = new HttpGet(sitemapUrl.asURI());

-        var response = client.send(getRequest, HttpResponse.BodyHandlers.ofInputStream());
-        if (response.statusCode() != 200) {
-            return new SitemapResult.SitemapError();
+        // Fetch one sitemap/feed document; addHeader() appends, so each header is added exactly once
+        getRequest.addHeader("User-Agent", userAgentString);
+        getRequest.addHeader("Accept-Encoding", "gzip");
+        getRequest.addHeader("Accept", "text/*, */*;q=0.9");
+
+        try (var sl = new SendLock()) {
+            return client.execute(getRequest, response -> {
+                try {
+                    if (response.getCode() != 200) {
+                        return new SitemapResult.SitemapError();
+                    }
+                    // Parse the body as XML; the root tag decides which elements hold the URLs
+                    Document parsedSitemap = Jsoup.parse(
+                            EntityUtils.toString(response.getEntity()),
+                            sitemapUrl.toString(),
+                            Parser.xmlParser()
+                    );
+
+                    if (parsedSitemap.childrenSize() == 0) {
+                        return new SitemapResult.SitemapError();
+                    }
+
+                    String rootTagName = parsedSitemap.child(0).tagName();
+
+                    return switch (rootTagName.toLowerCase()) {
+                        case "sitemapindex" -> {
+                            List<String> references = new ArrayList<>();
+                            for (var locTag : parsedSitemap.getElementsByTag("loc")) {
+                                references.add(locTag.text().trim());
+                            }
+                            yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
+                        }
+                        case "urlset" -> {
+                            List<String> urls = new ArrayList<>();
+                            for (var locTag : parsedSitemap.select("url > loc")) {
+                                urls.add(locTag.text().trim());
+                            }
+                            yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
+                        }
+                        case "rss", "atom" -> {
+                            List<String> urls = new ArrayList<>();
+                            for (var locTag : parsedSitemap.select("link, url")) {
+                                urls.add(locTag.text().trim());
+                            }
+                            yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
+                        }
+                        default -> new SitemapResult.SitemapError();
+                    };
+                }
+                finally {
+                    EntityUtils.consume(response.getEntity());
+                }
+            });
         }
-
-        try (InputStream inputStream = response.body()) {
-
-            InputStream parserStream;
-            if (sitemapUrl.path.endsWith(".gz")) {
-                parserStream = new GZIPInputStream(inputStream);
-            }
-            else {
-                parserStream = inputStream;
-            }
-
-            Document parsedSitemap = Jsoup.parse(parserStream, "UTF-8", sitemapUrl.toString(), Parser.xmlParser());
-            if (parsedSitemap.childrenSize() == 0) {
-                return new SitemapResult.SitemapError();
-            }
-
-            String rootTagName = parsedSitemap.child(0).tagName();
-
-            return switch (rootTagName.toLowerCase()) {
-                case "sitemapindex" -> {
-                    List<String> references = new ArrayList<>();
-                    for (var locTag : parsedSitemap.getElementsByTag("loc")) {
-                        references.add(locTag.text().trim());
-                    }
-                    yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
-                }
-                case "urlset" -> {
-                    List<String> urls = new ArrayList<>();
-                    for (var locTag : parsedSitemap.select("url > loc")) {
-                        urls.add(locTag.text().trim());
-                    }
-                    yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
-                }
-                case "rss", "atom" -> {
-                    List<String> urls = new ArrayList<>();
-                    for (var locTag : parsedSitemap.select("link, url")) {
-                        urls.add(locTag.text().trim());
-                    }
-                    yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
-                }
-                default -> new SitemapResult.SitemapError();
-            };
+        catch (Exception ex) {
+            logger.warn("Error while fetching sitemap {}: {} ({})", sitemapUrl, ex.getClass().getSimpleName(), ex.getMessage());
+            return new SitemapResult.SitemapError();
         }
     }

@@ -386,16 +613,14 @@ public class HttpFetcherImpl implements HttpFetcher {
    }

    private Optional<SimpleRobotRules> fetchAndParseRobotsTxt(EdgeUrl url, WarcRecorder recorder) {
-        try {
-            var getRequest = HttpRequest.newBuilder()
-                    .GET()
-                    .uri(url.asURI())
-                    .header("Accept-Encoding", "gzip")
-                    .header("Accept", "text/*, */*;q=0.9")
-                    .header("User-Agent", userAgentString)
-                    .timeout(requestTimeout);
+        try (var sl = new SendLock()) {

-            HttpFetchResult result = recorder.fetch(client, getRequest.build());
+            HttpGet request = new HttpGet(url.asURI());
+            request.addHeader("User-Agent", userAgentString);
+            request.addHeader("Accept-Encoding", "gzip");
+            request.addHeader("Accept", "text/*, */*;q=0.9");
+
+            HttpFetchResult result = recorder.fetch(client, new DomainCookies(), request);

            return DocumentBodyExtractor.asBytes(result).mapOpt((contentType, body) ->
                robotsParser.parseContent(url.toString(),
@@ -409,6 +634,57 @@ public class HttpFetcherImpl implements HttpFetcher {
        }
    }

+    @Override
+    public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
+        return switch (exception) {
+            case SocketTimeoutException ste -> false;
+            case SSLException ssle -> false;
+            case UnknownHostException uhe -> false;
+            default -> executionCount <= 3;
+        };
+    }
+
+    @Override
+    public boolean retryRequest(HttpResponse response, int executionCount, HttpContext context) {
+        return switch (response.getCode()) {
+            case 500, 503 -> executionCount <= 2;
+            case 429 -> executionCount <= 3;
+            default -> false;
+        };
+    }
+
+    @Override
+    public TimeValue getRetryInterval(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
+        return TimeValue.ofSeconds(1);
+    }
+
+    @Override
+    public TimeValue getRetryInterval(HttpResponse response, int executionCount, HttpContext context) {
+        // Delay before the next attempt, chosen from the response status (default: 2 seconds)
+        int statusCode = response.getCode();
+
+        // Give 503 a bit more time
+        if (statusCode == 503) return TimeValue.ofSeconds(5);
+
+        if (statusCode == 429) {
+            // Retry-After is optional: getFirstHeader() returns null when the header is absent
+            var retryAfterHeader = response.getFirstHeader("Retry-After");
+            if (retryAfterHeader == null || retryAfterHeader.getValue() == null) {
+                return TimeValue.ofSeconds(2);
+            }
+            String retryAfter = retryAfterHeader.getValue();
+            try {
+                int retryAfterTime = Integer.parseInt(retryAfter);
+                retryAfterTime = Math.clamp(retryAfterTime, 1, 5);
+
+                return TimeValue.ofSeconds(retryAfterTime);
+            } catch (NumberFormatException e) {
+                logger.warn("Invalid Retry-After header: {}", retryAfter);
+            }
+        }
+
+        return TimeValue.ofSeconds(2);
+    }

    public static class RateLimitException extends Exception {
        private final String retryAfter;
@@ -429,5 +705,31 @@ public class HttpFetcherImpl implements HttpFetcher {
            }
        }
    }
+
+}
+
+class SendLock implements AutoCloseable {
+    /** Permits shared by all instances; sized by the crawler.maxConcurrentRequests system property (default 512) */
+    private static final Semaphore maxConcurrentRequests = new Semaphore(Integer.getInteger("crawler.maxConcurrentRequests", 512));
+    boolean closed = false;
+    /** Acquires one permit, blocking until one is available; the wait is not interruptible */
+    public SendLock() {
+        maxConcurrentRequests.acquireUninterruptibly();
+    }
+    /** Runs client.execute(request, responseHandler) while holding a permit, released on return or exception */
+    public static <T> T wrapSend(HttpClient client, final ClassicHttpRequest request,
+                                               final HttpClientResponseHandler<? extends T> responseHandler) throws IOException {
+        try (var lock = new SendLock()) {
+            return client.execute(request, responseHandler);
+        }
+    }
+    /** Returns the permit; the closed flag turns repeated calls into no-ops */
+    @Override
+    public void close() {
+        if (!closed) {
+            maxConcurrentRequests.release();
+            closed = true;
+        }
+    }
 }

--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcInputBuffer.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcInputBuffer.java
@@ -1,15 +1,20 @@
 package nu.marginalia.crawl.fetcher.warc;

+import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.BOMInputStream;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
+import org.apache.hc.core5.http.ClassicHttpResponse;
+import org.apache.hc.core5.http.Header;
 import org.netpreserve.jwarc.WarcTruncationReason;

 import java.io.*;
-import java.net.http.HttpHeaders;
-import java.net.http.HttpResponse;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.Map;
-import java.util.zip.GZIPInputStream;
+import java.time.Duration;
+import java.time.Instant;
+import java.util.Arrays;
+
+import static nu.marginalia.crawl.fetcher.warc.ErrorBuffer.suppressContentEncoding;

 /** Input buffer for temporary storage of a HTTP response
 *  This may be in-memory or on-disk, at the discretion of
@@ -17,9 +22,9 @@ import java.util.zip.GZIPInputStream;
 * */
 public abstract class WarcInputBuffer implements AutoCloseable {
    protected WarcTruncationReason truncationReason = WarcTruncationReason.NOT_TRUNCATED;
-    protected HttpHeaders headers;
+    protected Header[] headers;

-    WarcInputBuffer(HttpHeaders headers) {
+    WarcInputBuffer(Header[] headers) {
        this.headers = headers;
    }

@@ -31,7 +36,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {

    public final WarcTruncationReason truncationReason() { return truncationReason; }

-    public final HttpHeaders headers() { return headers; }
+    public final Header[] headers() { return headers; }

    /** Create a buffer for a response.
     *  If the response is small and not compressed, it will be stored in memory.
@@ -39,34 +44,70 @@ public abstract class WarcInputBuffer implements AutoCloseable {
     *  and suppressed from the headers.
     *  If an error occurs, a buffer will be created with no content and an error status.
     */
-    static WarcInputBuffer forResponse(HttpResponse<InputStream> rsp) {
-        if (rsp == null)
+    static WarcInputBuffer forResponse(ClassicHttpResponse response,
+                                       HttpGet request,
+                                       Duration timeLimit) throws IOException {
+        if (response == null)
            return new ErrorBuffer();

-        var headers = rsp.headers();

-        try (var is = rsp.body()) {
-            int contentLength = (int) headers.firstValueAsLong("Content-Length").orElse(-1L);
-            String contentEncoding = headers.firstValue("Content-Encoding").orElse(null);
+        var entity = response.getEntity();

-            if (contentEncoding == null && contentLength > 0 && contentLength < 8192) {
+        if (null == entity) {
+            return new ErrorBuffer();
+        }
+
+        Instant start = Instant.now();
+        InputStream is = null;
+        try {
+            is = entity.getContent();
+            long length = entity.getContentLength();
+
+            if (length > 0 && length < 8192) {
                // If the content is small and not compressed, we can just read it into memory
-                return new MemoryBuffer(headers, is, contentLength);
-            }
-            else {
+                return new MemoryBuffer(response.getHeaders(), request, timeLimit, is, (int) length);
+            } else {
                // Otherwise, we unpack it into a file and read it from there
-                return new FileBuffer(headers, is);
+                return new FileBuffer(response.getHeaders(), request, timeLimit, is);
            }
        }
-        catch (Exception ex) {
-            return new ErrorBuffer();
+        finally {
+            // We're required to consume the stream to avoid leaking connections,
+            // but we also don't want to get stuck on slow or malicious connections
+            // forever, so we set a time limit on this phase and call abort() if it's exceeded.
+            try {
+                while (is != null) {
+                    // Consume some data
+                    if (is.skip(65536) == 0) {
+                        // Note that skip may return 0 if the stream is empty
+                        // or for other unspecified reasons, so we need to check
+                        // with read() as well to determine if the stream is done
+                        if (is.read() == -1)
+                            is = null;
+                    }
+                    // Check if the time limit has been exceeded
+                    else if (Duration.between(start, Instant.now()).compareTo(timeLimit) > 0) {
+                        request.abort();
+                        is = null;
+                    }
+                }
+            }
+            catch (IOException e) {
+                // Ignore the exception
+            }
+            finally {
+                // Close the input stream
+                IOUtils.closeQuietly(is);
+            }
        }

+
    }

    /** Copy an input stream to an output stream, with a maximum size and time limit */
-    protected void copy(InputStream is, OutputStream os) {
-        long startTime = System.currentTimeMillis();
+    protected void copy(InputStream is, HttpGet request, OutputStream os, Duration timeLimit) {
+        Instant start = Instant.now();
+        Instant timeout = start.plus(timeLimit);
        long size = 0;

        byte[] buffer = new byte[8192];
@@ -76,24 +117,105 @@ public abstract class WarcInputBuffer implements AutoCloseable {

        while (true) {
            try {
+                Duration remaining = Duration.between(Instant.now(), timeout);
+                if (remaining.isNegative()) {
+                    truncationReason = WarcTruncationReason.TIME;
+                    // Abort the request if the time limit is exceeded
+                    // so we don't keep the connection open forever or are forced to consume
+                    // the stream to the end
+
+                    request.abort();
+                    break;
+                }
+
                int n = is.read(buffer);
+
                if (n < 0) break;
                size += n;
-                os.write(buffer, 0, n);

-                if (size > WarcRecorder.MAX_SIZE) {
+                // Even if we've exceeded the max length,
+                // we keep consuming the stream up until the end or a timeout,
+                // as closing the stream means resetting the connection, and
+                // that's generally not desirable.
+
+                if (size < WarcRecorder.MAX_SIZE) {
+                    os.write(buffer, 0, n);
+                }
+                else if (truncationReason != WarcTruncationReason.LENGTH) {
                    truncationReason = WarcTruncationReason.LENGTH;
                    break;
                }

-                if (System.currentTimeMillis() - startTime > WarcRecorder.MAX_TIME) {
-                    truncationReason = WarcTruncationReason.TIME;
-                    break;
-                }
            } catch (IOException e) {
-                throw new RuntimeException(e);
+                truncationReason = WarcTruncationReason.UNSPECIFIED;
            }
        }
+
+    }
+
+    /** Takes a Content-Range header and checks if it is complete.
+     *  A complete range is one that covers the entire resource.
+     *  For example, "bytes 0-2047/2048" is a complete range, as is a missing or empty header.
+     *  "bytes 0-1023/2048" and "bytes 0-1023/*" are not complete ranges.
+     */
+    public boolean isRangeComplete(Header[] headers) {
+        // Find the Content-Range header
+        String contentRangeHeader = null;
+        for (var header : headers) {
+            if ("Content-Range".equalsIgnoreCase(header.getName())) {
+                contentRangeHeader = header.getValue();
+                break;
+            }
+        }
+
+        // Return true if header is null or empty
+        if (contentRangeHeader == null || contentRangeHeader.isEmpty()) {
+            return true;
+        }
+
+        try {
+            // Content-Range format: "bytes range-start-range-end/size"
+            // e.g., "bytes 0-1023/2048" or "bytes 0-1023/*"
+
+            // Get the part after "bytes "
+            String[] parts = contentRangeHeader.split(" ", 2);
+            if (parts.length < 2) {
+                return false;
+            }
+
+            // Get the range and size parts (e.g., "0-1023/2048")
+            String rangeAndSize = parts[1];
+            String[] rangeAndSizeParts = rangeAndSize.split("/", 2);
+            if (rangeAndSizeParts.length < 2) {
+                return false;
+            }
+
+            // Get the range (e.g., "0-1023")
+            String range = rangeAndSizeParts[0];
+            String[] rangeParts = range.split("-", 2);
+            if (rangeParts.length < 2) {
+                return false;
+            }
+
+            // Get the size (e.g., "2048" or "*")
+            String size = rangeAndSizeParts[1];
+
+            // If size is "*", we don't know the total size, so return false
+            if ("*".equals(size)) {
+                return false;
+            }
+
+            // Parse as long to handle large files
+            long rangeStart = Long.parseLong(rangeParts[0]);
+            long rangeEnd = Long.parseLong(rangeParts[1]);
+            long totalSize = Long.parseLong(size);
+
+            // Check if the range covers the entire resource
+            return rangeStart == 0 && rangeEnd == totalSize - 1;
+
+        } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
+            return false;
+        }
    }

 }
@@ -101,7 +223,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
 /** Pseudo-buffer for when we have an error */
 class ErrorBuffer extends WarcInputBuffer {
    public ErrorBuffer() {
-        super(HttpHeaders.of(Map.of(), (k,v)->false));
+        super(new Header[0]);

        truncationReason = WarcTruncationReason.UNSPECIFIED;
    }
@@ -118,17 +240,29 @@ class ErrorBuffer extends WarcInputBuffer {

    @Override
    public void close() throws Exception {}
+
+
+    /** Returns a new array holding the given headers in their original order,
+     *  minus any header named Content-Encoding (compared case-insensitively).
+     *  The input array is not modified.
+     */
+    static Header[] suppressContentEncoding(Header[] headers) {
+        Header[] retained = new Header[headers.length];
+        int count = 0;
+
+        for (Header candidate : headers) {
+            if ("Content-Encoding".equalsIgnoreCase(candidate.getName())) {
+                continue;
+            }
+            retained[count++] = candidate;
+        }
+
+        // Trim the scratch array down to the headers that were actually kept
+        return Arrays.copyOf(retained, count);
+    }
+
 }

 /** Buffer for when we have the response in memory */
 class MemoryBuffer extends WarcInputBuffer {
    byte[] data;
-    public MemoryBuffer(HttpHeaders headers, InputStream responseStream, int size) {
-        super(headers);
+    public MemoryBuffer(Header[] headers, HttpGet request, Duration timeLimit, InputStream responseStream, int size) {
+        super(suppressContentEncoding(headers));
+
+        if (!isRangeComplete(headers)) {
+            truncationReason = WarcTruncationReason.LENGTH;
+        } else {
+            truncationReason = WarcTruncationReason.NOT_TRUNCATED;
+        }

        var outputStream = new ByteArrayOutputStream(size);

-        copy(responseStream, outputStream);
+        copy(responseStream, request, outputStream, timeLimit);

        data = outputStream.toByteArray();
    }
@@ -152,40 +286,25 @@ class MemoryBuffer extends WarcInputBuffer {
 class FileBuffer extends WarcInputBuffer {
    private final Path tempFile;

-    public FileBuffer(HttpHeaders headers, InputStream responseStream) throws IOException {
+    public FileBuffer(Header[] headers, HttpGet request, Duration timeLimit, InputStream responseStream) throws IOException {
        super(suppressContentEncoding(headers));

+        if (!isRangeComplete(headers)) {
+            truncationReason = WarcTruncationReason.LENGTH;
+        } else {
+            truncationReason = WarcTruncationReason.NOT_TRUNCATED;
+        }
+
        this.tempFile = Files.createTempFile("rsp", ".html");

-
-        if ("gzip".equalsIgnoreCase(headers.firstValue("Content-Encoding").orElse(""))) {
-            try (var out = Files.newOutputStream(tempFile)) {
-                copy(new GZIPInputStream(responseStream), out);
-            }
-            catch (Exception ex) {
-                truncationReason = WarcTruncationReason.UNSPECIFIED;
-            }
+        try (var out = Files.newOutputStream(tempFile)) {
+            copy(responseStream, request, out, timeLimit);
        }
-        else {
-            try (var out = Files.newOutputStream(tempFile)) {
-                copy(responseStream, out);
-            }
-            catch (Exception ex) {
-                truncationReason = WarcTruncationReason.UNSPECIFIED;
-            }
+        catch (Exception ex) {
+            truncationReason = WarcTruncationReason.UNSPECIFIED;
        }
    }

-    private static HttpHeaders suppressContentEncoding(HttpHeaders headers) {
-        return HttpHeaders.of(headers.map(), (k, v) -> {
-            if ("Content-Encoding".equalsIgnoreCase(k)) {
-                return false;
-            }
-            return !"Transfer-Encoding".equalsIgnoreCase(k);
-        });
-    }
-
-
    public InputStream read() throws IOException {
        return Files.newInputStream(tempFile);
    }
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcProtocolReconstructor.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcProtocolReconstructor.java
@@ -1,6 +1,8 @@
 package nu.marginalia.crawl.fetcher.warc;

 import org.apache.commons.lang3.StringUtils;
+import org.apache.hc.core5.http.ClassicHttpResponse;
+import org.apache.hc.core5.http.Header;

 import java.net.URI;
 import java.net.URLEncoder;
@@ -17,7 +19,7 @@ import java.util.stream.Collectors;
 public class WarcProtocolReconstructor {

    static String getHttpRequestString(String method,
-                                       Map<String, List<String>> mainHeaders,
+                                       Header[] mainHeaders,
                                       Map<String, List<String>> extraHeaders,
                                       URI uri) {
        StringBuilder requestStringBuilder = new StringBuilder();
@@ -34,12 +36,13 @@ public class WarcProtocolReconstructor {

        Set<String> addedHeaders = new HashSet<>();

-        mainHeaders.forEach((k, values) -> {
-            for (var value : values) {
-                addedHeaders.add(k);
-                requestStringBuilder.append(capitalizeHeader(k)).append(": ").append(value).append("\r\n");
-            }
-        });
+        for (var header : mainHeaders) {
+            String k = header.getName();
+            String v = header.getValue();
+
+            addedHeaders.add(k);
+            requestStringBuilder.append(capitalizeHeader(k)).append(": ").append(v).append("\r\n");
+        }

        extraHeaders.forEach((k, values) -> {
            if (!addedHeaders.contains(k)) {
@@ -87,6 +90,12 @@ public class WarcProtocolReconstructor {
        return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
    }

+    /** Reconstructs the status line and header section of the given response
+     * as a string, terminated by a blank line.  The headers are rendered via
+     * getHeadersAsString with {@code size} as the Content-Length value.
+     */
+    static String getResponseHeader(ClassicHttpResponse response, long size) {
+        final String headerBlock = getHeadersAsString(response.getHeaders(), size);
+
+        final String statusLine = response.getVersion().format()
+                + " " + response.getCode()
+                + " " + response.getReasonPhrase();
+
+        return statusLine + "\r\n" + headerBlock + "\r\n\r\n";
+    }
+
    private static final Map<Integer, String> STATUS_CODE_MAP = Map.ofEntries(
            Map.entry(200, "OK"),
            Map.entry(201, "Created"),
@@ -149,6 +158,37 @@ public class WarcProtocolReconstructor {
        return joiner.toString();
    }

+
+
+    /** Renders the response headers as CRLF-separated "Name: value" lines.
+     * <p>
+     * Header names are passed through capitalizeHeader.  Headers whose capitalized
+     * name starts with X-Marginalia (pseudoheaders injected by the crawler itself),
+     * as well as Transfer-Encoding, Content-Encoding and Content-Length, are left out.
+     * A single Content-Length line carrying {@code responseSize} is appended last,
+     * since gzip is decoded transparently and the header has to reflect the actual
+     * size of the response body.
+     */
+    static private String getHeadersAsString(Header[] headers, long responseSize) {
+        StringJoiner lines = new StringJoiner("\r\n");
+
+        for (Header entry : headers) {
+            String name = capitalizeHeader(entry.getName());
+
+            boolean omitted = name.startsWith("X-Marginalia")
+                    || name.equals("Transfer-Encoding")
+                    || name.equals("Content-Encoding")
+                    || name.equals("Content-Length");
+
+            if (!omitted) {
+                lines.add(name + ": " + entry.getValue());
+            }
+        }
+
+        // The Content-Length line is always written last, using the supplied size
+        lines.add("Content-Length: " + responseSize);
+
+        return lines.toString();
+    }
+
    static private String getHeadersAsString(HttpHeaders headers, long responseSize) {
        StringJoiner joiner = new StringJoiner("\r\n");

--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java
@@ -1,11 +1,16 @@
 package nu.marginalia.crawl.fetcher.warc;

 import nu.marginalia.crawl.fetcher.ContentTags;
-import nu.marginalia.crawl.fetcher.Cookies;
+import nu.marginalia.crawl.fetcher.DomainCookies;
+import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
+import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;
+import org.apache.hc.client5.http.classic.HttpClient;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
+import org.apache.hc.core5.http.NameValuePair;
 import org.jetbrains.annotations.Nullable;
 import org.netpreserve.jwarc.*;
 import org.slf4j.Logger;
@@ -14,10 +19,9 @@ import org.slf4j.LoggerFactory;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.InetAddress;
+import java.net.SocketTimeoutException;
 import java.net.URI;
 import java.net.URISyntaxException;
-import java.net.http.HttpClient;
-import java.net.http.HttpResponse;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -37,7 +41,7 @@ public class WarcRecorder implements AutoCloseable {
    static final int MAX_TIME = 30_000;

    /** Maximum (decompressed) size we'll save */
-    static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 10 * 1024 * 1024);
+    static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 32 * 1024 * 1024);

    private final WarcWriter writer;
    private final Path warcFile;
@@ -48,22 +52,15 @@ public class WarcRecorder implements AutoCloseable {
    // Affix a version string in case we need to change the format in the future
    // in some way
    private final String warcRecorderVersion = "1.0";
-    private final Cookies cookies;
+    private final LinkParser linkParser = new LinkParser();
    /**
     * Create a new WarcRecorder that will write to the given file
     *
     * @param warcFile The file to write to
     */
-    public WarcRecorder(Path warcFile, HttpFetcherImpl fetcher) throws IOException {
+    public WarcRecorder(Path warcFile) throws IOException {
        this.warcFile = warcFile;
        this.writer = new WarcWriter(warcFile);
-        this.cookies = fetcher.getCookies();
-    }
-
-    public WarcRecorder(Path warcFile, Cookies cookies) throws IOException {
-        this.warcFile = warcFile;
-        this.writer = new WarcWriter(warcFile);
-        this.cookies = cookies;
    }

    /**
@@ -73,16 +70,25 @@ public class WarcRecorder implements AutoCloseable {
    public WarcRecorder() throws IOException {
        this.warcFile = Files.createTempFile("warc", ".warc.gz");
        this.writer = new WarcWriter(this.warcFile);
-        this.cookies = new Cookies();

        temporaryFile = true;
    }

    public HttpFetchResult fetch(HttpClient client,
-                                 java.net.http.HttpRequest request)
+                                 DomainCookies cookies,
+                                 HttpGet request)
            throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
    {
-        URI requestUri = request.uri();
+        return fetch(client, cookies, request, Duration.ofMillis(MAX_TIME));
+    }
+
+    public HttpFetchResult fetch(HttpClient client,
+                                 DomainCookies cookies,
+                                 HttpGet request,
+                                 Duration timeout)
+            throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
+    {
+        URI requestUri = request.getUri();

        WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
        WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
@@ -90,121 +96,151 @@ public class WarcRecorder implements AutoCloseable {
        Instant date = Instant.now();

        // Not entirely sure why we need to do this, but keeping it due to Chesterton's Fence
-        Map<String, List<String>> extraHeaders = new HashMap<>(request.headers().map());
+        Map<String, List<String>> extraHeaders = new HashMap<>(request.getHeaders().length);

-        HttpResponse<InputStream> response;
+        // Inject a range header to attempt to limit the size of the response
+        // to the maximum size we want to store, if the server supports it.
+        request.addHeader("Range", "bytes=0-"+MAX_SIZE);
+        cookies.paintRequest(request);
        try {
-            response = client.send(request, java.net.http.HttpResponse.BodyHandlers.ofInputStream());
-        }
-        catch (Exception ex) {
-            logger.warn("Failed to fetch URL {}:  {}", requestUri, ex.getMessage());
+            return client.execute(request,response -> {
+
+                try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, request, timeout);
+                     InputStream inputStream = inputBuffer.read()) {
+
+                    cookies.updateCookieStore(response);
+
+                    // Build and write the request
+
+                    WarcDigestBuilder requestDigestBuilder = new WarcDigestBuilder();
+
+                    byte[] httpRequestString = WarcProtocolReconstructor
+                            .getHttpRequestString(
+                                    request.getMethod(),
+                                    request.getHeaders(),
+                                    extraHeaders,
+                                    requestUri)
+                            .getBytes();
+
+                    requestDigestBuilder.update(httpRequestString);
+
+                    WarcRequest warcRequest = new WarcRequest.Builder(requestUri)
+                            .blockDigest(requestDigestBuilder.build())
+                            .date(date)
+                            .body(MediaType.HTTP_REQUEST, httpRequestString)
+                            .build();
+
+                    warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
+                    writer.write(warcRequest);
+
+
+                    if (cookies.hasCookies()) {
+                        response.addHeader("X-Has-Cookies", 1);
+                    }
+
+                    byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
+
+                    ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);
+
+                    responseDataBuffer.put(responseHeaders);
+                    responseDataBuffer.updateDigest(responseDigestBuilder, 0, responseHeaders.length);
+
+                    int dataStart = responseDataBuffer.pos();
+
+                    for (;;) {
+                        int remainingLength = responseDataBuffer.remaining();
+                        if (remainingLength == 0)
+                            break;
+
+                        int startPos = responseDataBuffer.pos();
+
+                        int n = responseDataBuffer.readFrom(inputStream, remainingLength);
+                        if (n < 0)
+                            break;
+
+                        responseDataBuffer.updateDigest(responseDigestBuilder, startPos, n);
+                        responseDataBuffer.updateDigest(payloadDigestBuilder, startPos, n);
+                    }
+
+                    // with some http client libraries, that resolve redirects transparently, this might be different
+                    // from the request URI, but currently we don't have transparent redirect resolution so it's always
+                    // the same (though let's keep the variables separate in case this changes)
+                    final URI responseUri = requestUri;
+
+                    WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
+                            .blockDigest(responseDigestBuilder.build())
+                            .date(date)
+                            .concurrentTo(warcRequest.id())
+                            .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
+
+                    InetAddress inetAddress = InetAddress.getByName(responseUri.getHost());
+                    responseBuilder.ipAddress(inetAddress);
+                    responseBuilder.payloadDigest(payloadDigestBuilder.build());
+                    responseBuilder.truncated(inputBuffer.truncationReason());
+
+                    // Build and write the response
+
+                    var warcResponse = responseBuilder.build();
+                    warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
+                    writer.write(warcResponse);
+
+                    if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
+                            && inputBuffer.size() < 2048
+                            && !requestUri.getPath().endsWith("robots.txt")) // don't bail on robots.txt
+                    {
+                        // Fast detection and mitigation of crawler traps that respond with slow
+                        // small responses, with a high branching factor
+
+                        // Note we bail *after* writing the warc records, this will effectively only
+                        // prevent link extraction from the document.
+
+                        logger.warn("URL {} took too long to fetch ({}s) and was too small for the effort ({}b)",
+                                requestUri,
+                                Duration.between(date, Instant.now()).getSeconds(),
+                                inputBuffer.size()
+                        );
+
+                        return new HttpFetchResult.ResultException(new IOException("Likely crawler trap"));
+                    }
+
+                    if (response.getCode() == 301 || response.getCode() == 302 || response.getCode() == 307) {
+                        // If the server responds with a redirect, resolve the Location header
+                        // against the request URI and report the target to the caller
+                        EdgeUrl redirectLocation = Optional.ofNullable(response.getFirstHeader("Location"))
+                                                           .map(NameValuePair::getValue)
+                                .flatMap(location -> linkParser.parseLink(new EdgeUrl(requestUri), location))
+                                .orElse(null);
+                        if (redirectLocation != null) {
+                            // The redirect location resolved to a valid URL; return it as a redirect result
+                            return new HttpFetchResult.ResultRedirect(redirectLocation);
+                        } else {
+                            // The Location header is missing or could not be parsed; return an error result
+                            return new HttpFetchResult.ResultException(new IOException("Invalid redirect location: " + response.getFirstHeader("Location")));
+                        }
+                    }
+
+
+                    return new HttpFetchResult.ResultOk(responseUri,
+                            response.getCode(),
+                            inputBuffer.headers(),
+                            inetAddress.getHostAddress(),
+                            responseDataBuffer.data,
+                            dataStart,
+                            responseDataBuffer.length() - dataStart);
+                } catch (Exception ex) {
+                    flagAsError(new EdgeUrl(requestUri), ex); // write a WARC record to indicate the error
+                    logger.warn("Failed to fetch URL {}:  {}", requestUri, ex.getMessage());
+                    return new HttpFetchResult.ResultException(ex);
+                }
+            });
+        // the client.execute() method will throw an exception if the request times out
+        // or on other IO exceptions, so we need to catch those here as well as having
+        // exception handling in the response handler
+        } catch (SocketTimeoutException ex) {
+            flagAsTimeout(new EdgeUrl(requestUri)); // write a WARC record to indicate the timeout
            return new HttpFetchResult.ResultException(ex);
-        }
-
-
-        try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response);
-             InputStream inputStream = inputBuffer.read())
-        {
-            if (cookies.hasCookies()) {
-                extraHeaders.put("X-Has-Cookies", List.of("1"));
-            }
-
-            byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
-
-            ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);
-
-            responseDataBuffer.put(responseHeaders);
-            responseDataBuffer.updateDigest(responseDigestBuilder, 0, responseHeaders.length);
-
-            int dataStart = responseDataBuffer.pos();
-
-            for (;;) {
-                int remainingLength = responseDataBuffer.remaining();
-                if (remainingLength == 0)
-                    break;
-
-                int startPos = responseDataBuffer.pos();
-
-                int n = responseDataBuffer.readFrom(inputStream, remainingLength);
-                if (n < 0)
-                    break;
-
-                responseDataBuffer.updateDigest(responseDigestBuilder, startPos, n);
-                responseDataBuffer.updateDigest(payloadDigestBuilder, startPos, n);
-            }
-
-            // It looks like this might be the same as requestUri, but it's not;
-            // it's the URI after resolving redirects.
-            final URI responseUri = response.uri();
-
-            WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
-                    .blockDigest(responseDigestBuilder.build())
-                    .date(date)
-                    .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
-
-            InetAddress inetAddress = InetAddress.getByName(responseUri.getHost());
-            responseBuilder.ipAddress(inetAddress);
-            responseBuilder.payloadDigest(payloadDigestBuilder.build());
-            responseBuilder.truncated(inputBuffer.truncationReason());
-
-            // Build and write the response
-
-            var warcResponse = responseBuilder.build();
-            warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
-            writer.write(warcResponse);
-
-            // Build and write the request
-
-            WarcDigestBuilder requestDigestBuilder = new WarcDigestBuilder();
-
-            byte[] httpRequestString = WarcProtocolReconstructor
-                    .getHttpRequestString(
-                            response.request().method(),
-                            response.request().headers().map(),
-                            extraHeaders,
-                            requestUri)
-                    .getBytes();
-
-            requestDigestBuilder.update(httpRequestString);
-
-            WarcRequest warcRequest = new WarcRequest.Builder(requestUri)
-                    .blockDigest(requestDigestBuilder.build())
-                    .date(date)
-                    .body(MediaType.HTTP_REQUEST, httpRequestString)
-                    .concurrentTo(warcResponse.id())
-                    .build();
-
-            warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
-            writer.write(warcRequest);
-
-            if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
-                    && inputBuffer.size() < 2048
-                    && !request.uri().getPath().endsWith("robots.txt")) // don't bail on robots.txt
-            {
-                // Fast detection and mitigation of crawler traps that respond with slow
-                // small responses, with a high branching factor
-
-                // Note we bail *after* writing the warc records, this will effectively only
-                // prevent link extraction from the document.
-
-                logger.warn("URL {} took too long to fetch ({}s) and was too small for the effort ({}b)",
-                        requestUri,
-                        Duration.between(date, Instant.now()).getSeconds(),
-                        inputBuffer.size()
-                );
-
-                return new HttpFetchResult.ResultException(new IOException("Likely crawler trap"));
-            }
-
-            return new HttpFetchResult.ResultOk(responseUri,
-                    response.statusCode(),
-                    inputBuffer.headers(),
-                    inetAddress.getHostAddress(),
-                    responseDataBuffer.data,
-                    dataStart,
-                    responseDataBuffer.length() - dataStart);
-        }
-        catch (Exception ex) {
+        } catch (IOException ex) {
+            flagAsError(new EdgeUrl(requestUri), ex); // write a WARC record to indicate the error
            logger.warn("Failed to fetch URL {}:  {}", requestUri, ex.getMessage());
            return new HttpFetchResult.ResultException(ex);
        }
@@ -214,7 +250,7 @@ public class WarcRecorder implements AutoCloseable {
        writer.write(item);
    }

-    private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags contentTags) {
+    private void saveOldResponse(EdgeUrl url, DomainCookies domainCookies, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags contentTags) {
        try {
            WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
            WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
@@ -275,7 +311,7 @@ public class WarcRecorder implements AutoCloseable {
                    .date(Instant.now())
                    .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());

-            if (cookies.hasCookies()) {
+            if (domainCookies.hasCookies() || (headers != null && headers.contains("Set-Cookie:"))) {
                builder.addHeader("X-Has-Cookies", "1");
            }

@@ -295,8 +331,8 @@ public class WarcRecorder implements AutoCloseable {
     * an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified.  In this
     * scenario we want to record the data as it was in the previous crawl, but not re-fetch it.
     */
-    public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags ctags) {
-        saveOldResponse(url, contentType, statusCode, documentBody, headers, ctags);
+    public void writeReferenceCopy(EdgeUrl url, DomainCookies cookies, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags ctags) {
+        saveOldResponse(url, cookies, contentType, statusCode, documentBody, headers, ctags);
    }

    public void writeWarcinfoHeader(String ip, EdgeDomain domain, HttpFetcherImpl.DomainProbeResult result) throws IOException {
@@ -316,6 +352,9 @@ public class WarcRecorder implements AutoCloseable {
            case HttpFetcherImpl.DomainProbeResult.Ok ok:
                fields.put("X-WARC-Probe-Status", List.of("OK"));
                break;
+            case HttpFetcher.DomainProbeResult.RedirectSameDomain_Internal redirectSameDomain:
+                fields.put("X-WARC-Probe-Status", List.of("REDIR-INTERNAL"));
+                break;
        }

        var warcinfo = new Warcinfo.Builder()
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/logic/DomainLocks.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/logic/DomainLocks.java
@@ -3,6 +3,7 @@ package nu.marginalia.crawl.logic;
 import nu.marginalia.model.EdgeDomain;

 import java.util.Map;
+import java.util.Optional;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.Semaphore;

@@ -19,8 +20,22 @@ public class DomainLocks {
     * and may be held by another thread.  The caller is responsible for locking and  releasing the lock.
     */
    public DomainLock lockDomain(EdgeDomain domain) throws InterruptedException {
-        return new DomainLock(domain.toString(),
-                locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits));
+        var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
+
+        sem.acquire();
+
+        return new DomainLock(sem);
+    }
+
+    /** Attempts to take a permit for the domain's top domain without waiting.
+     *
+     * @return a DomainLock wrapping the semaphore the permit was taken from,
+     *         or an empty Optional if no permit was available at the time of the call
+     */
+    public Optional<DomainLock> tryLockDomain(EdgeDomain domain) {
+        Semaphore permits = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
+
+        if (!permits.tryAcquire(1)) {
+            // Nothing was acquired, so there is nothing for the caller to release
+            return Optional.empty();
+        }
+
+        return Optional.of(new DomainLock(permits));
+    }

    private Semaphore defaultPermits(String topDomain) {
@@ -28,23 +43,27 @@ public class DomainLocks {
            return new Semaphore(16);
        if (topDomain.equals("blogspot.com"))
            return new Semaphore(8);
-
+        if (topDomain.equals("tumblr.com"))
+            return new Semaphore(8);
        if (topDomain.equals("neocities.org"))
-            return new Semaphore(4);
+            return new Semaphore(8);
        if (topDomain.equals("github.io"))
-            return new Semaphore(4);
+            return new Semaphore(8);

+        // Substack really dislikes broad-scale crawlers, so we need to be careful
+        // to not get blocked.
        if (topDomain.equals("substack.com")) {
            return new Semaphore(1);
        }
-        if (topDomain.endsWith(".edu")) {
-            return new Semaphore(1);
-        }

        return new Semaphore(2);
    }

-    public boolean canLock(EdgeDomain domain) {
+    /** Returns true if the domain is lockable, i.e. if it is not already locked by another thread.
+     * (this is just a hint, and does not guarantee that the domain is actually lockable any time
+     * after this method returns true)
+     */
+    public boolean isLockableHint(EdgeDomain domain) {
        Semaphore sem = locks.get(domain.topDomain.toLowerCase());
        if (null == sem)
            return true;
@@ -53,22 +72,16 @@ public class DomainLocks {
    }

    public static class DomainLock implements AutoCloseable {
-        private final String domainName;
        private final Semaphore semaphore;

-        DomainLock(String domainName, Semaphore semaphore) throws InterruptedException {
-            this.domainName = domainName;
+        DomainLock(Semaphore semaphore) {
            this.semaphore = semaphore;
-
-            Thread.currentThread().setName("crawling:" + domainName + " [await domain lock]");
-            semaphore.acquire();
-            Thread.currentThread().setName("crawling:" + domainName);
        }

        @Override
        public void close() throws Exception {
            semaphore.release();
-            Thread.currentThread().setName("crawling:" + domainName + " [wrapping up]");
+            Thread.currentThread().setName("[idle]");
        }
    }
 }
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java
@@ -3,6 +3,7 @@ package nu.marginalia.crawl.retreival;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;

 import java.time.Duration;
+import java.util.concurrent.ThreadLocalRandom;

 import static java.lang.Math.max;
 import static java.lang.Math.min;
@@ -50,15 +51,20 @@ public class CrawlDelayTimer {
        waitFetchDelay(0);
    }

+    /** Convenience overload of waitFetchDelay(long) that takes the time already
+     * spent as a Duration and converts it to milliseconds.
+     */
+    public void waitFetchDelay(Duration spentTime) {
+        long spentMillis = spentTime.toMillis();
+
+        waitFetchDelay(spentMillis);
+    }
+
    public void waitFetchDelay(long spentTime) {
        long sleepTime = delayTime;

+        long jitter = ThreadLocalRandom.current().nextLong(0, 150);
        try {
            if (sleepTime >= 1) {
                if (spentTime > sleepTime)
                    return;

-                Thread.sleep(min(sleepTime - spentTime, 5000));
+                Thread.sleep(min(sleepTime - spentTime, 5000) + jitter);
            } else {
                // When no crawl delay is specified, lean toward twice the fetch+process time,
                // within sane limits. This means slower servers get slower crawling, and faster
@@ -71,17 +77,17 @@ public class CrawlDelayTimer {
                if (spentTime > sleepTime)
                    return;

-                Thread.sleep(sleepTime - spentTime);
+                Thread.sleep(sleepTime - spentTime + jitter);
            }

            if (slowDown) {
                // Additional delay when the server is signalling it wants slower requests
-                Thread.sleep(DEFAULT_CRAWL_DELAY_MIN_MS);
+                Thread.sleep(DEFAULT_CRAWL_DELAY_MIN_MS + jitter);
            }
        }
        catch (InterruptedException e) {
            Thread.currentThread().interrupt();
-            throw new RuntimeException();
+            throw new RuntimeException("Interrupted", e);
        }
    }
 }
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerConnectionThrottle.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerConnectionThrottle.java
@@ -0,0 +1,52 @@
+package nu.marginalia.crawl.retreival;
+
+import java.time.Duration;
+import java.time.Instant;
+import java.util.concurrent.Semaphore;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * This class is used to stagger the rate at which connections are created.
+ * <p>
+ * It is used to ensure that we do not create too many connections at once,
+ * which can lead to network congestion and other issues.  Since the connections
+ * tend to be very long-lived, we can afford to wait a bit before creating the next
+ * even if it adds a bit of build-up time when the crawl starts.
+ */
+public class CrawlerConnectionThrottle {
+    // Only read and written while the single launchSemaphore permit is held
+    private Instant lastCrawlStart = Instant.EPOCH;
+    private final Semaphore launchSemaphore = new Semaphore(1);
+
+    private final Duration launchInterval;
+
+    /** @param launchInterval minimum time between two successive granted permissions */
+    public CrawlerConnectionThrottle(Duration launchInterval) {
+        this.launchInterval = launchInterval;
+    }
+
+    /**
+     * Blocks the calling thread until at least launchInterval has elapsed since the
+     * previous caller was let through, then records the current time as the latest launch.
+     *
+     * @throws InterruptedException if interrupted while waiting for the semaphore or while sleeping
+     */
+    public void waitForConnectionPermission() throws InterruptedException {
+        // Acquire outside the try block: if acquire() is interrupted no permit is held,
+        // and releasing one in the finally block would inflate the permit count
+        launchSemaphore.acquire();
+        try {
+            Instant now = Instant.now();
+            Instant nextPermittedLaunch = lastCrawlStart.plus(launchInterval);
+
+            if (nextPermittedLaunch.isAfter(now)) {
+                long waitTime = Duration.between(now, nextPermittedLaunch).toMillis();
+                TimeUnit.MILLISECONDS.sleep(waitTime);
+            }
+
+            lastCrawlStart = Instant.now();
+        }
+        finally {
+            launchSemaphore.release();
+        }
+    }
+}
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
@@ -6,8 +6,8 @@ import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.crawl.DomainStateDb;
 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
-import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.logic.LinkFilterSelector;
 import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
@@ -26,14 +26,16 @@ import java.io.IOException;
 import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.nio.file.Path;
+import java.time.Duration;
+import java.time.Instant;
 import java.util.List;
+import java.util.Objects;
 import java.util.Optional;
 import java.util.concurrent.TimeUnit;

 public class CrawlerRetreiver implements AutoCloseable {

    private static final int MAX_ERRORS = 20;
-    private static final int HTTP_429_RETRY_LIMIT = 1; // Retry 429s once

    private final HttpFetcher fetcher;

@@ -50,6 +52,11 @@ public class CrawlerRetreiver implements AutoCloseable {
    private final DomainStateDb domainStateDb;
    private final WarcRecorder warcRecorder;
    private final CrawlerRevisitor crawlerRevisitor;
+    private final DomainCookies cookies = new DomainCookies();
+
+    private static final CrawlerConnectionThrottle connectionThrottle = new CrawlerConnectionThrottle(
+            Duration.ofSeconds(1) // pace the connections to avoid network congestion at startup
+    );

    int errorCount = 0;

@@ -90,6 +97,11 @@ public class CrawlerRetreiver implements AutoCloseable {

    public int crawlDomain(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
        try (oldCrawlData) {
+
+            // Wait for permission to open a connection to avoid network congestion
+            // from hundreds/thousands of TCP handshakes
+            connectionThrottle.waitForConnectionPermission();
+
            // Do an initial domain probe to determine the root URL
            var probeResult = probeRootUrl();

@@ -108,15 +120,24 @@ public class CrawlerRetreiver implements AutoCloseable {
                    DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(probedUrl, delayTimer);
                    domainStateDb.save(summaryRecord);

+                    if (Thread.interrupted()) {
+                        // There's a small chance we're interrupted during the sniffing portion
+                        throw new InterruptedException();
+                    }
+
+                    Instant recrawlStart = Instant.now();
+                    CrawlerRevisitor.RecrawlMetadata recrawlMetadata = crawlerRevisitor.recrawl(oldCrawlData, cookies, robotsRules, delayTimer);
+                    Duration recrawlTime = Duration.between(recrawlStart, Instant.now());
+
                    // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
-                    if (crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer) > 0) {
+                    if (recrawlMetadata.size() > 0) {
                        // If we have reference data, we will always grow the crawl depth a bit
                        crawlFrontier.increaseDepth(1.5, 2500);
                    }

                    oldCrawlData.close(); // proactively close the crawl data reference here to not hold onto expensive resources

-                    yield crawlDomain(probedUrl, robotsRules, delayTimer, domainLinks);
+                    yield crawlDomain(probedUrl, robotsRules, delayTimer, domainLinks, recrawlMetadata, recrawlTime);
                }
                case HttpFetcher.DomainProbeResult.Redirect(EdgeDomain domain1) -> {
                    domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, "Redirect", domain1.toString()));
@@ -126,6 +147,10 @@ public class CrawlerRetreiver implements AutoCloseable {
                    domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, status.toString(), desc));
                    yield 1;
                }
+                default -> {
+                    logger.error("Unexpected domain probe result {}", probeResult);
+                    yield 1;
+                }
            };

        }
@@ -138,17 +163,29 @@ public class CrawlerRetreiver implements AutoCloseable {
    private int crawlDomain(EdgeUrl rootUrl,
                            SimpleRobotRules robotsRules,
                            CrawlDelayTimer delayTimer,
-                            DomainLinks domainLinks) {
+                            DomainLinks domainLinks,
+                            CrawlerRevisitor.RecrawlMetadata recrawlMetadata,
+                            Duration recrawlTime) {

+        Instant crawlStart = Instant.now();

        // Add external links to the crawl frontier
        crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));

        // Fetch sitemaps
        for (var sitemap : robotsRules.getSitemaps()) {
-            crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
+
+            // Validate the sitemap URL and check that it belongs to the same domain as the root URL
+            if (EdgeUrl.parse(sitemap)
+                    .map(url -> url.getDomain().equals(rootUrl.domain))
+                    .orElse(false)) {
+
+                crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
+            }
        }

+        int crawlerAdditions = 0;
+
        while (!crawlFrontier.isEmpty()
            && !crawlFrontier.isCrawlDepthReached()
            && errorCount < MAX_ERRORS
@@ -180,7 +217,11 @@ public class CrawlerRetreiver implements AutoCloseable {
                continue;

            try {
-                fetchContentWithReference(top, delayTimer, DocumentWithReference.empty());
+                var result = fetchContentWithReference(top, delayTimer, DocumentWithReference.empty());
+
+                if (result.isOk()) {
+                    crawlerAdditions++;
+                }
            }
            catch (InterruptedException ex) {
                Thread.currentThread().interrupt();
@@ -188,6 +229,17 @@ public class CrawlerRetreiver implements AutoCloseable {
            }
        }

+        Duration crawlTime = Duration.between(crawlStart, Instant.now());
+        domainStateDb.save(new DomainStateDb.CrawlMeta(
+                domain,
+                Instant.now(),
+                recrawlTime,
+                crawlTime,
+                recrawlMetadata.errors(),
+                crawlerAdditions,
+                recrawlMetadata.size() + crawlerAdditions
+        ));
+
        return crawlFrontier.visitedSize();
    }

@@ -216,17 +268,29 @@ public class CrawlerRetreiver implements AutoCloseable {
        return domainProbeResult;
    }

+
+
    private DomainStateDb.SummaryRecord sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) {
        Optional<String> feedLink = Optional.empty();

        try {
            var url = rootUrl.withPathAndParam("/", null);

-            HttpFetchResult result = fetchWithRetry(url, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
+            HttpFetchResult result = fetcher.fetchContent(url, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
            timer.waitFetchDelay(0);

-            if (!(result instanceof HttpFetchResult.ResultOk ok))
+            if (result instanceof HttpFetchResult.ResultRedirect(EdgeUrl location)) {
+                if (Objects.equals(location.domain, url.domain)) {
+                    // TODO: Follow the redirect to the new location and sniff the document
+                    crawlFrontier.addFirst(location);
+                }
+
                return DomainStateDb.SummaryRecord.forSuccess(domain);
+            }
+
+            if (!(result instanceof HttpFetchResult.ResultOk ok)) {
+                return DomainStateDb.SummaryRecord.forSuccess(domain);
+            }

            var optDoc = ok.parseDocument();
            if (optDoc.isEmpty())
@@ -275,7 +339,7 @@ public class CrawlerRetreiver implements AutoCloseable {

            // Grab the favicon if it exists

-            if (fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty()) instanceof HttpFetchResult.ResultOk iconResult) {
+            if (fetcher.fetchContent(faviconUrl, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED) instanceof HttpFetchResult.ResultOk iconResult) {
                String contentType = iconResult.header("Content-Type");
                byte[] iconData = iconResult.getBodyBytes();

@@ -289,6 +353,10 @@ public class CrawlerRetreiver implements AutoCloseable {
        }
        catch (Exception ex) {
            logger.error("Error configuring link filter", ex);
+            if (Thread.interrupted()) {
+                Thread.currentThread().interrupt();
+                return DomainStateDb.SummaryRecord.forError(domain, "Crawler Interrupted", ex.getMessage());
+            }
        }
        finally {
            crawlFrontier.addVisited(rootUrl);
@@ -316,7 +384,7 @@ public class CrawlerRetreiver implements AutoCloseable {
    );

    private Optional<String> guessFeedUrl(CrawlDelayTimer timer) throws InterruptedException {
-        var oldDomainStateRecord = domainStateDb.get(domain);
+        var oldDomainStateRecord = domainStateDb.getSummary(domain);

        // If we are already aware of an old feed URL, then we can just revalidate it
        if (oldDomainStateRecord.isPresent()) {
@@ -341,7 +409,7 @@ public class CrawlerRetreiver implements AutoCloseable {
        if (parsedOpt.isEmpty())
            return false;

-        HttpFetchResult result = fetchWithRetry(parsedOpt.get(), timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
+        HttpFetchResult result = fetcher.fetchContent(parsedOpt.get(), warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
        timer.waitFetchDelay(0);

        if (!(result instanceof HttpFetchResult.ResultOk ok)) {
@@ -367,112 +435,63 @@ public class CrawlerRetreiver implements AutoCloseable {
                                                     CrawlDelayTimer timer,
                                                     DocumentWithReference reference) throws InterruptedException
    {
-        logger.debug("Fetching {}", top);
-
-        long startTime = System.currentTimeMillis();
        var contentTags = reference.getContentTags();

-        HttpFetchResult fetchedDoc = fetchWithRetry(top, timer, HttpFetcher.ProbeType.FULL, contentTags);
+        HttpFetchResult fetchedDoc = fetcher.fetchContent(top, warcRecorder, cookies, timer, contentTags, HttpFetcher.ProbeType.FULL);
+        timer.waitFetchDelay();
+
+        if (Thread.interrupted()) {
+            Thread.currentThread().interrupt();
+            throw new InterruptedException();
+        }

        // Parse the document and enqueue links
        try {
-            if (fetchedDoc instanceof HttpFetchResult.ResultOk ok) {
-                var docOpt = ok.parseDocument();
-                if (docOpt.isPresent()) {
-                    var doc = docOpt.get();
+            switch (fetchedDoc) {
+                case HttpFetchResult.ResultOk ok -> {
+                    var docOpt = ok.parseDocument();
+                    if (docOpt.isPresent()) {
+                        var doc = docOpt.get();

-                    var responseUrl = new EdgeUrl(ok.uri());
+                        var responseUrl = new EdgeUrl(ok.uri());

-                    crawlFrontier.enqueueLinksFromDocument(responseUrl, doc);
-                    crawlFrontier.addVisited(responseUrl);
+                        crawlFrontier.enqueueLinksFromDocument(responseUrl, doc);
+                        crawlFrontier.addVisited(responseUrl);
+                    }
                }
-            }
-            else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) {
-                var doc = reference.doc();
+                case HttpFetchResult.Result304Raw ref when reference.doc() != null ->
+                {
+                    var doc = reference.doc();

-                warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBodyBytes, doc.headers, contentTags);
+                    warcRecorder.writeReferenceCopy(top, cookies, doc.contentType, doc.httpStatus, doc.documentBodyBytes, doc.headers, contentTags);

-                fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
-                        new ContentType(doc.contentType, "UTF-8"),
-                        doc.documentBodyBytes);
+                    fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
+                            new ContentType(doc.contentType, "UTF-8"),
+                            doc.documentBodyBytes);

-                if (doc.documentBodyBytes != null) {
-                    var parsed = doc.parseBody();
+                    if (doc.documentBodyBytes != null) {
+                        var parsed = doc.parseBody();

-                    crawlFrontier.enqueueLinksFromDocument(top, parsed);
-                    crawlFrontier.addVisited(top);
+                        crawlFrontier.enqueueLinksFromDocument(top, parsed);
+                        crawlFrontier.addVisited(top);
+                    }
                }
-            }
-            else if (fetchedDoc instanceof HttpFetchResult.ResultException) {
-                errorCount ++;
+                case HttpFetchResult.ResultRedirect(EdgeUrl location) -> {
+                    if (Objects.equals(location.domain, top.domain)) {
+                        crawlFrontier.addFirst(location);
+                    }
+                }
+                case HttpFetchResult.ResultException ex -> errorCount++;
+                default -> {} // Ignore other types
            }
        }
        catch (Exception ex) {
            logger.error("Error parsing document {}", top, ex);
        }

-        timer.waitFetchDelay(System.currentTimeMillis() - startTime);
-
        return fetchedDoc;
    }

-    /** Fetch a document and retry on 429s */
-    private HttpFetchResult fetchWithRetry(EdgeUrl url,
-                                           CrawlDelayTimer timer,
-                                           HttpFetcher.ProbeType probeType,
-                                           ContentTags contentTags) throws InterruptedException {
-
-        long probeStart = System.currentTimeMillis();
-
-        if (probeType == HttpFetcher.ProbeType.FULL) {
-            retryLoop:
-            for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
-                try {
-                    var probeResult = fetcher.probeContentType(url, warcRecorder, contentTags);
-
-                    switch (probeResult) {
-                        case HttpFetcher.ContentTypeProbeResult.Ok(EdgeUrl resolvedUrl):
-                            url = resolvedUrl; // If we were redirected while probing, use the final URL for fetching
-                            break retryLoop;
-                        case HttpFetcher.ContentTypeProbeResult.BadContentType badContentType:
-                            return new HttpFetchResult.ResultNone();
-                        case HttpFetcher.ContentTypeProbeResult.BadContentType.Timeout timeout:
-                            return new HttpFetchResult.ResultException(timeout.ex());
-                        case HttpFetcher.ContentTypeProbeResult.Exception exception:
-                            return new HttpFetchResult.ResultException(exception.ex());
-                        default:  // should be unreachable
-                            throw new IllegalStateException("Unknown probe result");
-                    }
-                }
-                catch (HttpFetcherImpl.RateLimitException ex) {
-                    timer.waitRetryDelay(ex);
-                }
-                catch (Exception ex) {
-                    logger.warn("Failed to fetch {}", url, ex);
-                    return new HttpFetchResult.ResultException(ex);
-                }
-            }
-
-            timer.waitFetchDelay(System.currentTimeMillis() - probeStart);
-        }
-
-
-        for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
-            try {
-                return fetcher.fetchContent(url, warcRecorder, contentTags, probeType);
-            }
-            catch (HttpFetcherImpl.RateLimitException ex) {
-                timer.waitRetryDelay(ex);
-            }
-            catch (Exception ex) {
-                logger.warn("Failed to fetch {}", url, ex);
-                return new HttpFetchResult.ResultException(ex);
-            }
-        }
-
-        return new HttpFetchResult.ResultNone();
-    }
-
    private boolean isAllowedProtocol(String proto) {
        return proto.equalsIgnoreCase("http")
                || proto.equalsIgnoreCase("https");
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java
@@ -55,6 +55,9 @@ public class DomainCrawlFrontier {
        }
    }

+    public EdgeDomain getDomain() {
+        return thisDomain;
+    }
    /** Increase the depth of the crawl by a factor.  If the current depth is smaller
     * than the number of already visited documents, the base depth will be adjusted
     * to the visited count first.
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
@@ -2,6 +2,7 @@ package nu.marginalia.crawl.retreival.revisit;

 import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.CrawlDataReference;
 import nu.marginalia.crawl.retreival.CrawlDelayTimer;
@@ -10,6 +11,8 @@ import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;
 import nu.marginalia.model.crawldata.CrawledDocument;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;

 import java.io.IOException;

@@ -18,10 +21,13 @@ import java.io.IOException;
 *  E-Tag and Last-Modified headers.
 */
 public class CrawlerRevisitor {
+
    private final DomainCrawlFrontier crawlFrontier;
    private final CrawlerRetreiver crawlerRetreiver;
    private final WarcRecorder warcRecorder;

+    private static final Logger logger = LoggerFactory.getLogger(CrawlerRevisitor.class);
+
    public CrawlerRevisitor(DomainCrawlFrontier crawlFrontier,
                            CrawlerRetreiver crawlerRetreiver,
                            WarcRecorder warcRecorder) {
@@ -31,7 +37,8 @@ public class CrawlerRevisitor {
    }

    /** Performs a re-crawl of old documents, comparing etags and last-modified */
-    public int recrawl(CrawlDataReference oldCrawlData,
+    public RecrawlMetadata recrawl(CrawlDataReference oldCrawlData,
+                       DomainCookies cookies,
                       SimpleRobotRules robotsRules,
                       CrawlDelayTimer delayTimer)
    throws InterruptedException {
@@ -39,6 +46,7 @@ public class CrawlerRevisitor {
        int retained = 0;
        int errors = 0;
        int skipped = 0;
+        int size = 0;

        for (CrawledDocument doc : oldCrawlData) {
            if (errors > 20) {
@@ -46,6 +54,10 @@ public class CrawlerRevisitor {
                break;
            }

+            if (Thread.interrupted()) {
+                throw new InterruptedException();
+            }
+
            var urlMaybe = EdgeUrl.parse(doc.url);
            if (urlMaybe.isEmpty())
                continue;
@@ -62,7 +74,7 @@ public class CrawlerRevisitor {

            // If the reference document is empty or the HTTP status is not 200, we'll skip it since it's
            // unlikely to produce anything meaningful for us.
-            if (doc.httpStatus != 200)
+            if (doc.httpStatus != 200 && doc.httpStatus != 206)
                continue;
            if (!doc.hasBody())
                continue;
@@ -78,6 +90,7 @@ public class CrawlerRevisitor {
                continue;
            }

+            size++;

            double skipProb;

@@ -121,6 +134,7 @@ public class CrawlerRevisitor {
                }
                // Add a WARC record so we don't repeat this
                warcRecorder.writeReferenceCopy(url,
+                        cookies,
                        doc.contentType,
                        doc.httpStatus,
                        doc.documentBodyBytes,
@@ -145,11 +159,15 @@ public class CrawlerRevisitor {
                else if (result instanceof HttpFetchResult.ResultException) {
                    errors++;
                }
-
                recrawled++;
            }
        }

-        return recrawled;
+        logger.info("Recrawl summary {}: {} recrawled, {} retained, {} errors, {} skipped",
+                crawlFrontier.getDomain(), recrawled, retained, errors, skipped);
+
+        return new RecrawlMetadata(size, errors, skipped);
    }
+
+    public record RecrawlMetadata(int size, int errors, int skipped) {}
 }
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java
@@ -6,6 +6,7 @@ import nu.marginalia.model.body.HttpFetchResult;
 import nu.marginalia.model.crawldata.CrawledDocument;

 import javax.annotation.Nullable;
+import java.util.Objects;

 public record DocumentWithReference(
        @Nullable CrawledDocument doc,
@@ -33,8 +34,22 @@ public record DocumentWithReference(
            return false;
        if (doc == null)
            return false;
-        if (doc.documentBodyBytes.length == 0)
-            return false;
+        if (doc.documentBodyBytes.length == 0) {
+            if (doc.httpStatus < 300) {
+                return resultOk.bytesLength() == 0;
+            }
+            else if (doc.httpStatus == 301 || doc.httpStatus == 302 || doc.httpStatus == 307) {
+                @Nullable
+                String docLocation = doc.getHeader("Location");
+                @Nullable
+                String resultLocation = resultOk.header("Location");
+
+                return Objects.equals(docLocation, resultLocation);
+            }
+            else {
+                return doc.httpStatus == resultOk.statusCode();
+            }
+        }

        return CrawlDataReference.isContentBodySame(doc.documentBodyBytes, resultOk.bytesRaw());
    }
@@ -43,7 +58,7 @@ public record DocumentWithReference(
        if (null == doc)
            return ContentTags.empty();

-        if (doc.documentBodyBytes.length == 0 || doc.httpStatus != 200)
+        if (doc.documentBodyBytes.length == 0 || (doc.httpStatus != 200 && doc.httpStatus != 206))
            return ContentTags.empty();

        String lastmod = doc.getLastModified();
--- a/code/processes/crawling-process/model/build.gradle
+++ b/code/processes/crawling-process/model/build.gradle
@@ -41,6 +41,8 @@ dependencies {
    implementation libs.snakeyaml
    implementation libs.zstd

+    implementation libs.bundles.httpcomponents
+
    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
--- a/code/processes/crawling-process/model/java/nu/marginalia/ContentTypes.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/ContentTypes.java
@@ -1,22 +1,49 @@
 package nu.marginalia;

+import org.apache.commons.lang3.StringUtils;
+
+import java.util.Locale;
 import java.util.Set;

+/** Helpers for classifying Content-Type header values. */
 public class ContentTypes {
    public static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
            "application/xhtml",
            "text/html",
+            "text/markdown",
+            "text/x-markdown",
+            "application/pdf",
            "image/x-icon",
            "text/plain");

+    /**
+     * Checks whether the media type of a Content-Type header is in {@link #acceptedContentTypes}.
+     * Parameters such as {@code ;charset=utf-8} are ignored.
+     *
+     * @param contentTypeHeader the Content-Type header value, must not be null
+     * @return true if the media type is one of the accepted types
+     */
    public static boolean isAccepted(String contentTypeHeader) {
-        String lcHeader = contentTypeHeader.toLowerCase();
-        for (var type : acceptedContentTypes) {
-            if (lcHeader.startsWith(type)) {
-                return true;
-            }
-        }
-        return false;
+        return acceptedContentTypes.contains(mediaType(contentTypeHeader));
    }

+    /**
+     * Checks whether a Content-Type header denotes binary content; currently only PDF.
+     *
+     * @param contentTypeHeader the Content-Type header value, must not be null
+     * @return true if the media type starts with {@code application/pdf}
+     */
+    public static boolean isBinary(String contentTypeHeader) {
+        return mediaType(contentTypeHeader).startsWith("application/pdf");
+    }
+
+    /**
+     * Extracts the media type: drops everything from the first ';', strips surrounding
+     * whitespace (optional whitespace is permitted before ';'), and lower-cases with
+     * Locale.ROOT so the result does not depend on the JVM's default locale.
+     */
+    private static String mediaType(String contentTypeHeader) {
+        return StringUtils.substringBefore(contentTypeHeader, ';').strip().toLowerCase(Locale.ROOT);
+    }
+
 }
--- a/code/processes/crawling-process/model/java/nu/marginalia/io/SerializableCrawlDataStream.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/io/SerializableCrawlDataStream.java
@@ -42,18 +42,20 @@ public interface SerializableCrawlDataStream extends AutoCloseable {
    {

        String fileName = fullPath.getFileName().toString();
-        if (fileName.endsWith(".parquet")) {
+
+        if (fileName.endsWith(".slop.zip")) {
            try {
-                return new ParquetSerializableCrawlDataStream(fullPath);
+                return new SlopSerializableCrawlDataStream(fullPath);
            } catch (Exception ex) {
                logger.error("Error reading domain data from " + fullPath, ex);
                return SerializableCrawlDataStream.empty();
            }
        }

-        if (fileName.endsWith(".slop.zip")) {
+        else if (fileName.endsWith(".parquet")) {
+            logger.error("Opening deprecated parquet-style crawl data stream", new Exception());
            try {
-                return new SlopSerializableCrawlDataStream(fullPath);
+                return new ParquetSerializableCrawlDataStream(fullPath);
            } catch (Exception ex) {
                logger.error("Error reading domain data from " + fullPath, ex);
                return SerializableCrawlDataStream.empty();
--- a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/SlopSerializableCrawlDataStream.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/SlopSerializableCrawlDataStream.java
@@ -37,8 +37,12 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
            public boolean filter(String url, int status, String contentType) {
                String ctLc = contentType.toLowerCase();

+                // Permit all plain text content types
                if (ctLc.startsWith("text/"))
                    return true;
+                // PDF
+                else if (ctLc.startsWith("application/pdf"))
+                    return true;
                else if (ctLc.startsWith("x-marginalia/"))
                    return true;

--- a/code/processes/crawling-process/model/java/nu/marginalia/model/body/ContentTypeLogic.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/model/body/ContentTypeLogic.java
@@ -10,7 +10,7 @@ import java.util.regex.Pattern;

 public class ContentTypeLogic {

-    private static final Predicate<String> probableHtmlPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md)$").asMatchPredicate();
+    private static final Predicate<String> probableGoodPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md|pdf)$").asMatchPredicate();
    private static final Predicate<String> probableBinaryPattern = Pattern.compile("^.*\\.[a-z]+$").asMatchPredicate();
    private static final Set<String> blockedContentTypes = Set.of("text/css", "text/javascript");
    private static final List<String> acceptedContentTypePrefixes = List.of(
@@ -22,6 +22,7 @@ public class ContentTypeLogic {
            "application/rss+xml",
            "application/x-rss+xml",
            "application/rdf+xml",
+            "application/pdf",
            "x-rss+xml"
    );
    private boolean allowAllContentTypes = false;
@@ -34,7 +35,7 @@ public class ContentTypeLogic {
    public boolean isUrlLikeBinary(EdgeUrl url) {
        String pathLowerCase = url.path.toLowerCase();

-        if (probableHtmlPattern.test(pathLowerCase))
+        if (probableGoodPattern.test(pathLowerCase))
            return false;

        return probableBinaryPattern.test(pathLowerCase);
--- a/code/processes/crawling-process/model/java/nu/marginalia/model/body/HttpFetchResult.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/model/body/HttpFetchResult.java
@@ -1,6 +1,9 @@
 package nu.marginalia.model.body;

 import nu.marginalia.contenttype.ContentType;
+import nu.marginalia.model.EdgeUrl;
+import org.apache.hc.core5.http.Header;
+import org.apache.hc.core5.http.message.BasicHeader;
 import org.jetbrains.annotations.Nullable;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
@@ -11,8 +14,10 @@ import java.io.ByteArrayInputStream;
 import java.io.InputStream;
 import java.net.InetAddress;
 import java.net.URI;
-import java.net.http.HttpHeaders;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Optional;

 /* FIXME:  This interface has a very unfortunate name that is not very descriptive.
 */
@@ -56,7 +61,7 @@ public sealed interface HttpFetchResult {
     */
    record ResultOk(URI uri,
                    int statusCode,
-                    HttpHeaders headers,
+                    Header[] headers,
                    String ipAddress,
                    byte[] bytesRaw, // raw data for the entire response including headers
                    int bytesStart,
@@ -67,18 +72,19 @@ public sealed interface HttpFetchResult {
            this(uri, status, convertHeaders(headers), ipAddress, bytes, bytesStart, length);
        }

-        private static HttpHeaders convertHeaders(MessageHeaders messageHeaders) {
-            Map<String, List<String>> inputMap = messageHeaders.map();
-            Map<String, List<String>> filteredMap = new HashMap<>(Math.max(4, inputMap.size()));
+        private static Header[] convertHeaders(MessageHeaders messageHeaders) {
+            List<Header> headers = new ArrayList<>(12);

-            inputMap.forEach((k, v) -> {
+            messageHeaders.map().forEach((k, v) -> {
                if (k.isBlank()) return;
                if (!Character.isAlphabetic(k.charAt(0))) return;

-                filteredMap.put(k, v);
+                for (var value : v) {
+                    headers.add(new BasicHeader(k, value));
+                }
            });

-            return HttpHeaders.of(filteredMap, (k,v) -> true);
+            return headers.toArray(new Header[0]);
        }

        public boolean isOk() {
@@ -108,7 +114,13 @@ public sealed interface HttpFetchResult {

        @Nullable
        public String header(String name) {
-            return headers.firstValue(name).orElse(null);
+            for (var header : headers) {
+                if (header.getName().equalsIgnoreCase(name)) {
+                    String headerValue = header.getValue();
+                    return headerValue;
+                }
+            }
+            return null;
        }

    }
@@ -132,6 +144,12 @@ public sealed interface HttpFetchResult {
        }
    }

+    record ResultRedirect(EdgeUrl url) implements HttpFetchResult { // holds an EdgeUrl, presumably the redirect target — TODO confirm
+        public boolean isOk() { // NOTE(review): a redirect result reports itself as OK — confirm callers expect this
+            return true;
+        }
+    }
+
    /** Fetching resulted in a HTTP 304, the remote content is identical to
     * our reference copy.  This will be replaced with a Result304ReplacedWithReference
     * at a later stage.
--- a/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java
@@ -102,7 +102,7 @@ public final class CrawledDocument implements SerializableCrawlData {
    }

    @Nullable
-    private String getHeader(String header) {
+    public String getHeader(String header) {
        if (headers == null) {
            return null;
        }
--- a/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java
@@ -165,12 +165,26 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
            contentType = "";
        }

+        boolean hasCookies = false;
+        String etag = null;
+        String lastModified = null;
+
        StringJoiner headersStrBuilder = new StringJoiner("\n");
-        for (var header : headers.map().entrySet()) {
-            for (var value : header.getValue()) {
-                headersStrBuilder.add(header.getKey() + ": " + value);
+        for (var header : headers) {
+            if (header.getName().equalsIgnoreCase("X-Has-Cookies")) {
+                hasCookies = hasCookies || header.getValue().equals("1");
+            }
+            else if (header.getName().equalsIgnoreCase("ETag")) {
+                etag = header.getValue();
+            }
+            else if (header.getName().equalsIgnoreCase("Last-Modified")) {
+                lastModified = header.getValue();
+            }
+            else {
+                headersStrBuilder.add(header.getName() + ": " + header.getValue());
            }
        }
+
        String headersStr = headersStrBuilder.toString();


@@ -178,14 +192,14 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
                domain,
                response.target(),
                fetchOk.ipAddress(),
-                headers.firstValue("X-Has-Cookies").orElse("0").equals("1"),
+                hasCookies,
                fetchOk.statusCode(),
                response.date(),
                contentType,
                bodyBytes,
                headersStr,
-                headers.firstValue("ETag").orElse(null),
-                headers.firstValue("Last-Modified").orElse(null)
+                etag,
+                lastModified
        ));
    }

--- a/code/processes/crawling-process/model/java/nu/marginalia/slop/SlopCrawlDataRecord.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/slop/SlopCrawlDataRecord.java
@@ -158,11 +158,12 @@ public record SlopCrawlDataRecord(String domain,
                        // and is used to store old responses from previous crawls; in this part of the logic
                        // we treat them the same as a normal response

-                        if (!filterResponse(uaString, response)) {
+                        var filterStatus = filterResponse(uaString, response);
+                        if (filterStatus.isRejected()) {
                            continue;
                        }

-                        slopWriter.write(domain, response);
+                        slopWriter.write(domain, filterStatus, response);
                    } else if (record instanceof WarcXEntityRefused refused) {
                        slopWriter.write(domain, refused);
                    } else if (record instanceof Warcinfo warcinfo) {
@@ -187,25 +188,35 @@ public record SlopCrawlDataRecord(String domain,
        }
    }

-
+    sealed interface ResponseFilterResult { // verdict produced by filterResponse() for a single WarcResponse
+        default boolean isRejected() { return false; } // every variant except Reject keeps this default
+        record Accept() implements ResponseFilterResult {} // keep the response unchanged
+        record AcceptWithContentType(String contentType) implements ResponseFilterResult {} // keep, carrying a replacement content type
+        record AcceptIfPlainText(String contentType) implements ResponseFilterResult {} // keep with this content type only if the body passes the writer's UTF-8 check
+        record Reject() implements ResponseFilterResult { // the response must not be written
+            @Override
+            public boolean isRejected() { return true; }
+        }
+    }

    /** Return true if the WarcResponse should be excluded from conversion */
-    private static boolean filterResponse(String uaString, WarcResponse response) throws IOException {
+    private static ResponseFilterResult filterResponse(String uaString, WarcResponse response) throws IOException {

        // We don't want to store robots.txt files, as they are not
        // interesting for the analysis we want to do.  This is important
        // since txt-files in general are interesting, and we don't want to
        // exclude them as a class.

-        if (response.targetURI().getPath().equals("/robots.txt")) {
-            return false;
+        String uriPath = response.targetURI().getPath();
+        if (uriPath.equals("/robots.txt")) {
+            return new ResponseFilterResult.Reject();
        }

        var headers = response.http().headers();
        var robotsTags = headers.all("X-Robots-Tag");

        if (!isXRobotsTagsPermitted(robotsTags, uaString)) {
-            return false;
+            return new ResponseFilterResult.Reject();
        }

        // Strip out responses with content types we aren't interested in
@@ -213,10 +224,29 @@ public record SlopCrawlDataRecord(String domain,
        String contentType = headers.first("Content-Type").orElse("text/plain").toLowerCase();

        if (!ContentTypes.isAccepted(contentType)) {
-            return false;
+            String contentTypeWithoutParams = StringUtils.substringBefore(contentType, ";");
+
+            // Some servers don't understand what a markdown file is
+            if (contentTypeWithoutParams.equals("application/octet-stream")) {
+                if (uriPath.endsWith(".md")) {
+                    // This is a markdown file, which we want to keep
+                    return new ResponseFilterResult.AcceptIfPlainText("text/markdown");
+                }
+                else if (uriPath.endsWith(".pdf")) {
+                    // This is a PDF file, which we want to keep
+                    return new ResponseFilterResult.AcceptWithContentType("application/pdf");
+                }
+            }
+
+            return new ResponseFilterResult.Reject();
        }

-        return true;
+        // If the format is binary, we don't want to translate it if the response is truncated
+        if (response.truncated() != WarcTruncationReason.NOT_TRUNCATED && ContentTypes.isBinary(contentType)) {
+            return new ResponseFilterResult.Reject();
+        }
+
+        return new ResponseFilterResult.Accept();
    }

    /**  Check X-Robots-Tag header tag to see if we are allowed to index this page.
@@ -272,7 +302,8 @@ public record SlopCrawlDataRecord(String domain,
        try (var table = new SlopTable(path)) {
            ShortColumn.Reader statusReader = statusColumn.open(table);
            while (statusReader.hasRemaining()) {
-                if (statusReader.get() == 200) {
+                int status = statusReader.get();
+                if (status == 200 || status == 206) {
                    cnt++;
                }
            }
@@ -318,7 +349,7 @@ public record SlopCrawlDataRecord(String domain,
            headerColumnWriter.put(record.headers);
        }

-        public void write(String domain, WarcResponse response) throws IOException {
+        public void write(String domain, ResponseFilterResult filterStatus, WarcResponse response) throws IOException {

            HttpFetchResult result = HttpFetchResult.importWarc(response);
            if (!(result instanceof HttpFetchResult.ResultOk fetchOk)) {
@@ -341,12 +372,30 @@ public record SlopCrawlDataRecord(String domain,
                contentType = "";
            }

+            switch (filterStatus) {
+                case ResponseFilterResult.AcceptWithContentType(String ct) -> contentType = ct;
+                case ResponseFilterResult.AcceptIfPlainText(String ct) -> {
+                    try {
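+                        // Parse the body as UTF-8. NOTE(review): new String(byte[], Charset) replaces malformed input with U+FFFD rather than throwing, so the catch below looks unreachable — confirm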
+                        new String(bodyBytes, StandardCharsets.UTF_8);
+                        contentType = ct;
+                    }
+                    catch (RuntimeException ex) { // UTF-8 decoding failed
+                        return;
+                    }
+                }
+                default -> {}
+            }
+
+            boolean hasCookies = false;
+
            String headersStr;
            StringJoiner headersStrBuilder = new StringJoiner("\n");
-            for (var header : headers.map().entrySet()) {
-                for (var value : header.getValue()) {
-                    headersStrBuilder.add(header.getKey() + ": " + value);
+            for (var header : headers) {
+                if (header.getName().equalsIgnoreCase("X-Cookies") && "1".equals(header.getValue())) {
+                    hasCookies = true;
                }
+                headersStrBuilder.add(header.getName() + ": " + header.getValue());
            }
            headersStr = headersStrBuilder.toString();

@@ -355,7 +404,7 @@ public record SlopCrawlDataRecord(String domain,
                    domain,
                    response.target(),
                    fetchOk.ipAddress(),
-                    "1".equals(headers.firstValue("X-Cookies").orElse("0")),
+                    hasCookies,
                    fetchOk.statusCode(),
                    response.date().toEpochMilli(),
                    contentType,
--- a/code/processes/crawling-process/test/nu/marginalia/crawl/DomainStateDbTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/DomainStateDbTest.java
@@ -8,6 +8,7 @@ import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
+import java.time.Duration;
 import java.time.Instant;

 import static org.junit.jupiter.api.Assertions.*;
@@ -47,8 +48,8 @@ class DomainStateDbTest {
            db.save(allFields);
            db.save(minFields);

-            assertEquals(allFields, db.get("all.marginalia.nu").orElseThrow());
-            assertEquals(minFields, db.get("min.marginalia.nu").orElseThrow());
+            assertEquals(allFields, db.getSummary("all.marginalia.nu").orElseThrow());
+            assertEquals(minFields, db.getSummary("min.marginalia.nu").orElseThrow());

            var updatedAllFields = new DomainStateDb.SummaryRecord(
                    "all.marginalia.nu",
@@ -59,7 +60,19 @@ class DomainStateDbTest {
            );

            db.save(updatedAllFields);
-            assertEquals(updatedAllFields, db.get("all.marginalia.nu").orElseThrow());
+            assertEquals(updatedAllFields, db.getSummary("all.marginalia.nu").orElseThrow());
+        }
+    }
+
+    @Test
+    public void testMetadata() throws SQLException { // round-trips a CrawlMeta record through save() and getMeta()
+        try (var db = new DomainStateDb(tempFile)) {
+            var original = new DomainStateDb.CrawlMeta("example.com", Instant.ofEpochMilli(12345), Duration.ofMillis(30), Duration.ofMillis(300), 1, 2, 3);
+            db.save(original);
+
+            var maybeMeta = db.getMeta("example.com");
+            assertTrue(maybeMeta.isPresent());
+            assertEquals(original, maybeMeta.get()); // the stored record must equal the one that was saved
        }
    }

--- a/code/processes/crawling-process/test/nu/marginalia/crawl/fetcher/HttpFetcherImplContentTypeProbeTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/fetcher/HttpFetcherImplContentTypeProbeTest.java
@@ -0,0 +1,146 @@
+package nu.marginalia.crawl.fetcher;
+
+import com.github.tomakehurst.wiremock.WireMockServer;
+import com.github.tomakehurst.wiremock.client.WireMock;
+import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
+import nu.marginalia.UserAgent;
+import nu.marginalia.crawl.retreival.CrawlDelayTimer;
+import nu.marginalia.model.EdgeUrl;
+import org.junit.jupiter.api.*;
+
+import java.io.IOException;
+import java.net.URISyntaxException;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+@Tag("slow")
+class HttpFetcherImplContentTypeProbeTest { // probeContentType() tests against WireMock stubs on localhost:18089
+
+    private HttpFetcherImpl fetcher;
+    private  static WireMockServer wireMockServer;
+
+    private static EdgeUrl timeoutUrl;
+    private static EdgeUrl contentTypeHtmlUrl;
+    private static EdgeUrl contentTypeBinaryUrl;
+    private static EdgeUrl redirectUrl;
+    private static EdgeUrl badHttpStatusUrl;
+    private static EdgeUrl onlyGetAllowedUrl;
+
+    @BeforeAll
+    public static void setupAll() throws URISyntaxException { // registers one stub per scenario, then starts the server
+        wireMockServer =
+                new WireMockServer(WireMockConfiguration.wireMockConfig()
+                        .port(18089));
+
+        timeoutUrl = new EdgeUrl("http://localhost:18089/timeout.bin");
+
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(timeoutUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withFixedDelay(15000))); // 15 seconds delay to simulate timeout
+
+        contentTypeHtmlUrl = new EdgeUrl("http://localhost:18089/test.html.bin"); // .bin path that is served as text/html
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(contentTypeHtmlUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withStatus(200)));
+
+        contentTypeBinaryUrl = new EdgeUrl("http://localhost:18089/test.bad.bin");
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(contentTypeBinaryUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "application/octet-stream")
+                        .withStatus(200)));
+
+        redirectUrl = new EdgeUrl("http://localhost:18089/redirect.bin"); // HEAD answers 301 pointing at contentTypeHtmlUrl
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(redirectUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Location", "http://localhost:18089/test.html.bin")
+                        .withStatus(301)));
+
+        badHttpStatusUrl = new EdgeUrl("http://localhost:18089/badstatus.bin");
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(badHttpStatusUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withStatus(500)));
+
+        onlyGetAllowedUrl = new EdgeUrl("http://localhost:18089/onlyget.bin"); // HEAD is refused with 405, GET answers 200 text/html
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(onlyGetAllowedUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withStatus(405))); // Method Not Allowed
+        wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(onlyGetAllowedUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withStatus(200)));
+
+        wireMockServer.start();
+
+    }
+
+    @AfterAll
+    public static void tearDownAll() {
+        wireMockServer.stop();
+    }
+
+    @BeforeEach
+    public void setUp() {
+        fetcher = new HttpFetcherImpl(new UserAgent("test.marginalia.nu", "test.marginalia.nu"));
+    }
+
+    @AfterEach
+    public void tearDown() throws IOException { // asserts the pool reports no leased or pending connections, then closes the fetcher
+        var stats = fetcher.getPoolStats();
+        assertEquals(0, stats.getLeased());
+        assertEquals(0, stats.getPending());
+
+        fetcher.close();
+    }
+
+    @Test
+    public void testProbeContentTypeHtmlShortcircuitPath() throws URISyntaxException { // a .html path (not served by any stub) must yield NoOp
+        var result = fetcher.probeContentType(new EdgeUrl("https://localhost/test.html"), new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
+        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.NoOp.class, result);
+    }
+
+
+    @Test
+    public void testProbeContentTypeHtmlShortcircuitTags() { // non-empty ContentTags must yield NoOp, even for the octet-stream URL
+        var result = fetcher.probeContentType(contentTypeBinaryUrl, new DomainCookies(), new CrawlDelayTimer(50), new ContentTags("a", "b"));
+        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.NoOp.class, result);
+    }
+
+    @Test
+    public void testProbeContentTypeHtml() {
+        var result = fetcher.probeContentType(contentTypeHtmlUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
+        Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Ok(contentTypeHtmlUrl), result);
+    }
+
+    @Test
+    public void testProbeContentTypeBinary() {
+        var result = fetcher.probeContentType(contentTypeBinaryUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
+        Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.BadContentType("application/octet-stream", 200), result);
+    }
+
+    @Test
+    public void testProbeContentTypeRedirect() { // the Redirect result must carry the URL from the Location header
+        var result = fetcher.probeContentType(redirectUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
+        Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Redirect(contentTypeHtmlUrl), result);
+    }
+
+    @Test
+    public void testProbeContentTypeBadHttpStatus() {
+        var result = fetcher.probeContentType(badHttpStatusUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
+        Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.HttpError(500, "Bad status code"), result);
+    }
+
+    @Test
+    public void testOnlyGetAllowed() { // HEAD is answered with 405 by the stub, yet the probe is expected to report Ok
+        var result = fetcher.probeContentType(onlyGetAllowedUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
+        Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Ok(onlyGetAllowedUrl), result);
+    }
+
+    @Test
+    public void testTimeout() { // the stub delays its HEAD response by 15 s; the probe is expected to report Timeout
+        var result = fetcher.probeContentType(timeoutUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
+        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Timeout.class, result);
+    }
+
+}
--- a/code/processes/crawling-process/test/nu/marginalia/crawl/fetcher/HttpFetcherImplDomainProbeTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/fetcher/HttpFetcherImplDomainProbeTest.java
@@ -0,0 +1,95 @@
+package nu.marginalia.crawl.fetcher;
+
+import com.github.tomakehurst.wiremock.WireMockServer;
+import com.github.tomakehurst.wiremock.client.WireMock;
+import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
+import nu.marginalia.UserAgent;
+import nu.marginalia.model.EdgeDomain;
+import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.crawldata.CrawlerDomainStatus;
+import org.junit.jupiter.api.*;
+
+import java.io.IOException;
+import java.net.URISyntaxException;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+@Tag("slow")
+class HttpFetcherImplDomainProbeTest { // probeDomain() tests; NOTE(review): most cases contact live hosts rather than the stub server, so they need network access
+
+    private HttpFetcherImpl fetcher;
+    private  static WireMockServer wireMockServer;
+
+    private static EdgeUrl timeoutUrl;
+
+    @BeforeAll
+    public static void setupAll() throws URISyntaxException { // only the /timeout endpoint is stubbed locally
+        wireMockServer =
+                new WireMockServer(WireMockConfiguration.wireMockConfig()
+                        .port(18089));
+
+
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo("/timeout"))
+                .willReturn(WireMock.aResponse()
+                        .withFixedDelay(15000))); // 15 seconds delay to simulate timeout
+
+        wireMockServer.start();
+        timeoutUrl = new EdgeUrl("http://localhost:18089/timeout");
+    }
+
+    @AfterAll
+    public static void tearDownAll() {
+        wireMockServer.stop();
+    }
+
+    @BeforeEach
+    public void setUp() {
+        fetcher = new HttpFetcherImpl(new UserAgent("test.marginalia.nu", "test.marginalia.nu"));
+    }
+
+    @AfterEach
+    public void tearDown() throws IOException { // asserts the pool reports no leased or pending connections, then closes the fetcher
+        var stats = fetcher.getPoolStats();
+        assertEquals(0, stats.getLeased());
+        assertEquals(0, stats.getPending());
+
+        fetcher.close();
+    }
+
+    @Test
+    public void testProbeDomain() throws URISyntaxException {
+        var result = fetcher.probeDomain(new EdgeUrl("https://www.marginalia.nu/"));
+        Assertions.assertEquals(new HttpFetcher.DomainProbeResult.Ok(new EdgeUrl("https://www.marginalia.nu/")), result);
+    }
+
+    @Test
+    public void testProbeDomainProtoUpgrade() throws URISyntaxException { // an http:// URL is expected to come back as Ok with the https:// URL
+        var result = fetcher.probeDomain(new EdgeUrl("http://www.marginalia.nu/"));
+        Assertions.assertEquals(new HttpFetcher.DomainProbeResult.Ok(new EdgeUrl("https://www.marginalia.nu/")), result);
+    }
+
+    @Test
+    public void testProbeDomainRedirect() throws URISyntaxException { // a probe that ends up on another domain is expected to yield Redirect
+        var result = fetcher.probeDomain(new EdgeUrl("http://search.marginalia.nu/"));
+        Assertions.assertEquals(new HttpFetcher.DomainProbeResult.Redirect(new EdgeDomain("marginalia-search.com")), result);
+    }
+
+    @Test
+    public void testProbeDomainOnlyGET() throws URISyntaxException {
+        // Checks that the probe still reports Ok for a host that presumably only answers GET requests (live site — TODO confirm)
+        var result = fetcher.probeDomain(new EdgeUrl("https://marginalia-search.com/"));
+        Assertions.assertEquals(new HttpFetcher.DomainProbeResult.Ok(new EdgeUrl("https://marginalia-search.com/")), result);
+    }
+
+    @Test
+    public void testProbeDomainError() throws URISyntaxException {
+        var result = fetcher.probeDomain(new EdgeUrl("https://invalid.example.com/"));
+        Assertions.assertEquals(new HttpFetcher.DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Error during domain probe"), result);
+    }
+
+    @Test
+    public void testProbeDomainTimeout() throws URISyntaxException { // the stub delays its HEAD response by 15 s
+        var result = fetcher.probeDomain(timeoutUrl);
+        Assertions.assertEquals(new HttpFetcher.DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Timeout during domain probe"), result);
+    }
+}
--- a/code/processes/crawling-process/test/nu/marginalia/crawl/fetcher/HttpFetcherImplFetchTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/fetcher/HttpFetcherImplFetchTest.java
@@ -0,0 +1,398 @@
+package nu.marginalia.crawl.fetcher;
+
+import com.github.tomakehurst.wiremock.WireMockServer;
+import com.github.tomakehurst.wiremock.client.WireMock;
+import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
+import nu.marginalia.UserAgent;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.retreival.CrawlDelayTimer;
+import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.body.HttpFetchResult;
+import org.junit.jupiter.api.*;
+import org.netpreserve.jwarc.*;
+
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.time.Instant;
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+@Tag("slow")
+class HttpFetcherImplFetchTest {
+
+    private HttpFetcherImpl fetcher;
+    private static WireMockServer wireMockServer;
+
+    private static String etag = "etag";
+    private static String lastModified = "Wed, 21 Oct 2024 07:28:00 GMT";
+
+    private static EdgeUrl okUrl;
+    private static EdgeUrl okUrlSetsCookie;
+    private static EdgeUrl okRangeResponseUrl;
+    private static EdgeUrl okUrlWith304;
+
+    private static EdgeUrl timeoutUrl;
+    private static EdgeUrl redirectUrl;
+    private static EdgeUrl badHttpStatusUrl;
+    private static EdgeUrl keepAliveUrl;
+
+    private static EdgeUrl pdfUrl;
+
+    @BeforeAll
+    public static void setupAll() throws URISyntaxException {
+        wireMockServer =
+                new WireMockServer(WireMockConfiguration.wireMockConfig()
+                        .port(18089));
+
+        timeoutUrl = new EdgeUrl("http://localhost:18089/timeout.bin");
+
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(timeoutUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withFixedDelay(15000)
+                )); // 15 seconds delay to simulate timeout
+        wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(timeoutUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withFixedDelay(15000)
+                        .withBody("Hello World")
+                )); // 15 seconds delay to simulate timeout
+
+        redirectUrl = new EdgeUrl("http://localhost:18089/redirect.bin");
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(redirectUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Location", "http://localhost:18089/test.html.bin")
+                        .withStatus(301)));
+        wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(redirectUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Location", "http://localhost:18089/test.html.bin")
+                        .withStatus(301)));
+
+        badHttpStatusUrl = new EdgeUrl("http://localhost:18089/badstatus");
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(badHttpStatusUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withStatus(500)));
+        wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(badHttpStatusUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withStatus(500)));
+
+        okUrl = new EdgeUrl("http://localhost:18089/ok.bin");
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(okUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withStatus(200)));
+        wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(okUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withStatus(200)
+                        .withBody("Hello World")));
+
+        okUrlSetsCookie = new EdgeUrl("http://localhost:18089/okSetCookie.bin");
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(okUrlSetsCookie.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withHeader("Set-Cookie", "test=1")
+                        .withStatus(200)));
+        wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(okUrlSetsCookie.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withHeader("Set-Cookie", "test=1")
+                        .withStatus(200)
+                        .withBody("Hello World")));
+
+        okUrlWith304 = new EdgeUrl("http://localhost:18089/ok304.bin");
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(okUrlWith304.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withHeader("ETag", etag)
+                        .withHeader("Last-Modified", lastModified)
+                        .withStatus(304)));
+        wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(okUrlWith304.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withHeader("ETag", etag)
+                        .withHeader("Last-Modified", lastModified)
+                        .withStatus(304)));
+
+        okRangeResponseUrl = new EdgeUrl("http://localhost:18089/okRangeResponse.bin");
+        wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(okRangeResponseUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Range", "bytes 0-100/200")
+                        .withBody("Hello World")
+                        .withStatus(206)));
+
+        keepAliveUrl = new EdgeUrl("http://localhost:18089/keepalive.bin");
+        wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(keepAliveUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withStatus(200)
+                        .withHeader("Keep-Alive", "max=4, timeout=30")
+                        .withBody("Hello")
+                        ));
+
+
+        pdfUrl = new EdgeUrl("http://localhost:18089/test.pdf");
+        wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(pdfUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "application/pdf")
+                        .withStatus(200)
+                        .withBody("Hello World")));
+
+        wireMockServer.start();
+
+    }
+
+    @AfterAll
+    public static void tearDownAll() {
+        wireMockServer.stop();
+    }
+
+
+    WarcRecorder warcRecorder;
+    Path warcFile;
+
+    /**
+     * Prepares a fresh fetcher plus a WARC recorder writing to a new temporary file before each test.
+     *
+     * @throws IOException if the temporary WARC file cannot be created
+     */
+    @BeforeEach
+    public void setUp() throws IOException {
+        final String agentName = "test.marginalia.nu";
+        final UserAgent testAgent = new UserAgent(agentName, agentName);
+        fetcher = new HttpFetcherImpl(testAgent);
+
+        // The temp file is named after the test class so stray files are easy to attribute.
+        final String filePrefix = getClass().getSimpleName();
+        warcFile = Files.createTempFile(filePrefix, ".warc");
+        warcRecorder = new WarcRecorder(warcFile);
+    }
+
+    /**
+     * Checks that the test left no leased or pending connections in the fetcher's pool, then
+     * releases the fetcher, the WARC recorder and the temporary WARC file.
+     * <p>
+     * Cleanup is performed in nested {@code finally} blocks so that a failed pool assertion, or a
+     * {@code close()} call that throws, cannot prevent the remaining resources from being released
+     * or leave the temporary file behind.
+     *
+     * @throws IOException if closing a resource or deleting the temporary file fails
+     */
+    @AfterEach
+    public void tearDown() throws IOException {
+        try {
+            var stats = fetcher.getPoolStats();
+
+            // Print before asserting so the pool state is visible in the output when an assertion fails.
+            System.out.println(stats);
+
+            assertEquals(0, stats.getLeased());
+            assertEquals(0, stats.getPending());
+        } finally {
+            try {
+                fetcher.close();
+            } finally {
+                try {
+                    warcRecorder.close();
+                } finally {
+                    Files.deleteIfExists(warcFile);
+                }
+            }
+        }
+    }
+
+
+    // NOTE(review): unlike the other tests here, this one targets a live external host rather than
+    // localhost and asserts nothing; it passes as long as the call does not throw -- confirm that
+    // it is intended as a manual smoke test.
+    @Test
+    public void testFoo() {
+        final String sitemapUrl = "https://www.marginalia.nu/sitemap.xml";
+        final CrawlDelayTimer delayTimer = new CrawlDelayTimer(100);
+
+        fetcher.fetchSitemapUrls(sitemapUrl, delayTimer);
+    }
+
+    /**
+     * Fetches the OK URL with {@code ProbeType.DISABLED} and checks that exactly one request and one
+     * response record were written, with the response flagged as having no cookies.
+     */
+    @Test
+    public void testOk_NoProbe() throws IOException {
+        final DomainCookies cookies = new DomainCookies();
+        final CrawlDelayTimer delayTimer = new CrawlDelayTimer(1000);
+
+        var fetched = fetcher.fetchContent(
+                okUrl,
+                warcRecorder,
+                cookies,
+                delayTimer,
+                ContentTags.empty(),
+                HttpFetcher.ProbeType.DISABLED);
+
+        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, fetched);
+        Assertions.assertTrue(fetched.isOk());
+
+        List<WarcRecord> recorded = getWarcRecords();
+        Assertions.assertEquals(2, recorded.size());
+        Assertions.assertInstanceOf(WarcRequest.class, recorded.get(0));
+        // assertInstanceOf hands back the value already cast to the expected type.
+        WarcResponse responseRecord = Assertions.assertInstanceOf(WarcResponse.class, recorded.get(1));
+
+        var hasCookies = responseRecord.http().headers().first("X-Has-Cookies").orElse("0");
+        Assertions.assertEquals("0", hasCookies);
+    }
+
+    /**
+     * Fetches the cookie-setting URL with {@code ProbeType.DISABLED} and checks that the recorded
+     * response carries {@code X-Has-Cookies: 1}.
+     */
+    @Test
+    public void testOkSetsCookie() throws IOException {
+        final DomainCookies cookieJar = new DomainCookies();
+        final CrawlDelayTimer delayTimer = new CrawlDelayTimer(1000);
+
+        var fetched = fetcher.fetchContent(
+                okUrlSetsCookie,
+                warcRecorder,
+                cookieJar,
+                delayTimer,
+                ContentTags.empty(),
+                HttpFetcher.ProbeType.DISABLED);
+
+        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, fetched);
+        Assertions.assertTrue(fetched.isOk());
+
+        List<WarcRecord> recorded = getWarcRecords();
+        Assertions.assertEquals(2, recorded.size());
+        Assertions.assertInstanceOf(WarcRequest.class, recorded.get(0));
+        // assertInstanceOf hands back the value already cast to the expected type.
+        WarcResponse responseRecord = Assertions.assertInstanceOf(WarcResponse.class, recorded.get(1));
+
+        var hasCookies = responseRecord.http().headers().first("X-Has-Cookies").orElse("0");
+        Assertions.assertEquals("1", hasCookies);
+    }
+
+    /** Fetches the OK URL with {@code ProbeType.FULL} and expects a successful {@code ResultOk}. */
+    @Test
+    public void testOk_FullProbe() {
+        final DomainCookies cookies = new DomainCookies();
+        final CrawlDelayTimer delayTimer = new CrawlDelayTimer(1000);
+
+        var fetched = fetcher.fetchContent(
+                okUrl,
+                warcRecorder,
+                cookies,
+                delayTimer,
+                ContentTags.empty(),
+                HttpFetcher.ProbeType.FULL);
+
+        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, fetched);
+        Assertions.assertTrue(fetched.isOk());
+    }
+
+    /**
+     * Fetches the 304 URL with matching content tags and {@code ProbeType.DISABLED}, expecting a
+     * {@code Result304Raw}.
+     */
+    @Test
+    public void testOk304_NoProbe() {
+        final DomainCookies cookies = new DomainCookies();
+        final CrawlDelayTimer delayTimer = new CrawlDelayTimer(1000);
+        final ContentTags knownTags = new ContentTags(etag, lastModified);
+
+        var fetched = fetcher.fetchContent(
+                okUrlWith304,
+                warcRecorder,
+                cookies,
+                delayTimer,
+                knownTags,
+                HttpFetcher.ProbeType.DISABLED);
+
+        Assertions.assertInstanceOf(HttpFetchResult.Result304Raw.class, fetched);
+        System.out.println(fetched);
+    }
+
+    /**
+     * Fetches the 304 URL with matching content tags and {@code ProbeType.FULL}, expecting a
+     * {@code Result304Raw}.
+     */
+    @Test
+    public void testOk304_FullProbe() {
+        final DomainCookies cookies = new DomainCookies();
+        final CrawlDelayTimer delayTimer = new CrawlDelayTimer(1000);
+        final ContentTags knownTags = new ContentTags(etag, lastModified);
+
+        var fetched = fetcher.fetchContent(
+                okUrlWith304,
+                warcRecorder,
+                cookies,
+                delayTimer,
+                knownTags,
+                HttpFetcher.ProbeType.FULL);
+
+        Assertions.assertInstanceOf(HttpFetchResult.Result304Raw.class, fetched);
+        System.out.println(fetched);
+    }
+
+    /**
+     * Fetches the bad-status URL with {@code ProbeType.DISABLED}: the result is still a
+     * {@code ResultOk} instance, but {@code isOk()} is false, and a request/response pair is recorded.
+     */
+    @Test
+    public void testBadStatus_NoProbe() throws IOException {
+        final DomainCookies cookies = new DomainCookies();
+        final CrawlDelayTimer delayTimer = new CrawlDelayTimer(1000);
+
+        var fetched = fetcher.fetchContent(
+                badHttpStatusUrl,
+                warcRecorder,
+                cookies,
+                delayTimer,
+                ContentTags.empty(),
+                HttpFetcher.ProbeType.DISABLED);
+
+        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, fetched);
+        Assertions.assertFalse(fetched.isOk());
+
+        List<WarcRecord> recorded = getWarcRecords();
+        Assertions.assertEquals(2, recorded.size());
+        Assertions.assertInstanceOf(WarcRequest.class, recorded.get(0));
+        Assertions.assertInstanceOf(WarcResponse.class, recorded.get(1));
+    }
+
+    /**
+     * Fetches the bad-status URL with {@code ProbeType.FULL}: the result is a {@code ResultOk}
+     * instance whose {@code isOk()} is false.
+     */
+    @Test
+    public void testBadStatus_FullProbe() {
+        final DomainCookies cookies = new DomainCookies();
+        final CrawlDelayTimer delayTimer = new CrawlDelayTimer(1000);
+
+        var fetched = fetcher.fetchContent(
+                badHttpStatusUrl,
+                warcRecorder,
+                cookies,
+                delayTimer,
+                ContentTags.empty(),
+                HttpFetcher.ProbeType.FULL);
+
+        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, fetched);
+        Assertions.assertFalse(fetched.isOk());
+
+        System.out.println(fetched);
+    }
+
+    /**
+     * Fetches the redirecting URL with {@code ProbeType.DISABLED} and checks both the reported
+     * redirect target and the recorded request/response pair.
+     */
+    @Test
+    public void testRedirect_NoProbe() throws URISyntaxException, IOException {
+        final DomainCookies cookies = new DomainCookies();
+        final CrawlDelayTimer delayTimer = new CrawlDelayTimer(1000);
+
+        var fetched = fetcher.fetchContent(
+                redirectUrl,
+                warcRecorder,
+                cookies,
+                delayTimer,
+                ContentTags.empty(),
+                HttpFetcher.ProbeType.DISABLED);
+
+        // assertInstanceOf hands back the value already cast to the expected type.
+        HttpFetchResult.ResultRedirect redirect = Assertions.assertInstanceOf(HttpFetchResult.ResultRedirect.class, fetched);
+        EdgeUrl expectedTarget = new EdgeUrl("http://localhost:18089/test.html.bin");
+        Assertions.assertEquals(expectedTarget, redirect.url());
+
+        List<WarcRecord> recorded = getWarcRecords();
+        Assertions.assertEquals(2, recorded.size());
+        Assertions.assertInstanceOf(WarcRequest.class, recorded.get(0));
+        Assertions.assertInstanceOf(WarcResponse.class, recorded.get(1));
+    }
+
+    /** Fetches the redirecting URL with {@code ProbeType.FULL} and checks the reported redirect target. */
+    @Test
+    public void testRedirect_FullProbe() throws URISyntaxException {
+        final DomainCookies cookies = new DomainCookies();
+        final CrawlDelayTimer delayTimer = new CrawlDelayTimer(1000);
+
+        var fetched = fetcher.fetchContent(
+                redirectUrl,
+                warcRecorder,
+                cookies,
+                delayTimer,
+                ContentTags.empty(),
+                HttpFetcher.ProbeType.FULL);
+
+        // assertInstanceOf hands back the value already cast to the expected type.
+        HttpFetchResult.ResultRedirect redirect = Assertions.assertInstanceOf(HttpFetchResult.ResultRedirect.class, fetched);
+        EdgeUrl expectedTarget = new EdgeUrl("http://localhost:18089/test.html.bin");
+        Assertions.assertEquals(expectedTarget, redirect.url());
+
+        System.out.println(fetched);
+    }
+
+
+    /**
+     * Fetches the slow URL with {@code ProbeType.DISABLED} and checks that the fetcher gives up with
+     * a {@code ResultException} in under 15 seconds, leaving a single refusal record in the WARC.
+     */
+    @Test
+    public void testFetchTimeout_NoProbe() throws IOException, URISyntaxException {
+        final DomainCookies cookies = new DomainCookies();
+        final CrawlDelayTimer delayTimer = new CrawlDelayTimer(1000);
+
+        final Instant requestStart = Instant.now();
+
+        var fetched = fetcher.fetchContent(
+                timeoutUrl,
+                warcRecorder,
+                cookies,
+                delayTimer,
+                ContentTags.empty(),
+                HttpFetcher.ProbeType.DISABLED);
+
+        Assertions.assertInstanceOf(HttpFetchResult.ResultException.class, fetched);
+
+        final Instant requestEnd = Instant.now();
+
+        System.out.println(fetched);
+
+        // Make sure the fetcher really times out instead of blocking until the response arrives
+        // (which would be a bug).  The request needs 15 seconds to complete, and the timeout is
+        // expected to fire at roughly 10 seconds and change; only the 15 second upper bound is
+        // asserted, to keep the test from being fragile.
+        final Instant latestAcceptableEnd = requestStart.plusSeconds(15);
+        Assertions.assertTrue(requestEnd.isBefore(latestAcceptableEnd), "Request should have taken less than 15 seconds");
+
+        var recorded = getWarcRecords();
+        assertEquals(1, recorded.size());
+        // assertInstanceOf hands back the value already cast to the expected type.
+        WarcXEntityRefused refusal = Assertions.assertInstanceOf(WarcXEntityRefused.class, recorded.getFirst());
+        Assertions.assertEquals(WarcXEntityRefused.documentProbeTimeout, refusal.profile());
+        Assertions.assertEquals(timeoutUrl.asURI(), refusal.targetURI());
+    }
+
+    /**
+     * Fetches the URL that answers with a 206 partial response and checks that the recorded WARC
+     * response is marked {@code WARC-Truncated: length}.
+     */
+    @Test
+    public void testRangeResponse() throws IOException {
+        final DomainCookies cookies = new DomainCookies();
+        final CrawlDelayTimer delayTimer = new CrawlDelayTimer(1000);
+
+        var fetched = fetcher.fetchContent(
+                okRangeResponseUrl,
+                warcRecorder,
+                cookies,
+                delayTimer,
+                ContentTags.empty(),
+                HttpFetcher.ProbeType.DISABLED);
+
+        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, fetched);
+        Assertions.assertTrue(fetched.isOk());
+
+        List<WarcRecord> recorded = getWarcRecords();
+        Assertions.assertEquals(2, recorded.size());
+        Assertions.assertInstanceOf(WarcRequest.class, recorded.get(0));
+        // assertInstanceOf hands back the value already cast to the expected type.
+        WarcResponse responseRecord = Assertions.assertInstanceOf(WarcResponse.class, recorded.get(1));
+
+        var truncation = responseRecord.headers().first("WARC-Truncated").orElse("");
+        Assertions.assertEquals("length", truncation);
+    }
+
+    /**
+     * Fetches the slow URL with {@code ProbeType.FULL} and checks that the fetcher gives up with a
+     * {@code ResultException} in under 15 seconds, leaving a single refusal record in the WARC.
+     */
+    @Test
+    public void testFetchTimeout_Probe() throws IOException, URISyntaxException {
+        final DomainCookies cookies = new DomainCookies();
+        final CrawlDelayTimer delayTimer = new CrawlDelayTimer(1000);
+
+        final Instant requestStart = Instant.now();
+        var fetched = fetcher.fetchContent(
+                timeoutUrl,
+                warcRecorder,
+                cookies,
+                delayTimer,
+                ContentTags.empty(),
+                HttpFetcher.ProbeType.FULL);
+        final Instant requestEnd = Instant.now();
+
+        Assertions.assertInstanceOf(HttpFetchResult.ResultException.class, fetched);
+
+        // Make sure the fetcher really times out instead of blocking until the response arrives
+        // (which would be a bug).  The request needs 15 seconds to complete, and the timeout is
+        // expected to fire at roughly 10 seconds and change; only the 15 second upper bound is
+        // asserted, to keep the test from being fragile.
+        final Instant latestAcceptableEnd = requestStart.plusSeconds(15);
+        Assertions.assertTrue(requestEnd.isBefore(latestAcceptableEnd), "Request should have taken less than 15 seconds");
+
+        var recorded = getWarcRecords();
+        assertEquals(1, recorded.size());
+        // assertInstanceOf hands back the value already cast to the expected type.
+        WarcXEntityRefused refusal = Assertions.assertInstanceOf(WarcXEntityRefused.class, recorded.getFirst());
+        Assertions.assertEquals(WarcXEntityRefused.documentProbeTimeout, refusal.profile());
+        Assertions.assertEquals(timeoutUrl.asURI(), refusal.targetURI());
+    }
+
+    /**
+     * Fetches the URL whose response carries a Keep-Alive header; primarily a smoke test and a
+     * convenient entry point when stepping through with a debugger.
+     */
+    @Test
+    public void testKeepaliveUrl() {
+        final DomainCookies cookies = new DomainCookies();
+        final CrawlDelayTimer delayTimer = new CrawlDelayTimer(1000);
+
+        var fetched = fetcher.fetchContent(
+                keepAliveUrl,
+                warcRecorder,
+                cookies,
+                delayTimer,
+                ContentTags.empty(),
+                HttpFetcher.ProbeType.DISABLED);
+
+        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, fetched);
+        Assertions.assertTrue(fetched.isOk());
+    }
+
+    /** Fetches the URL served as {@code application/pdf} with {@code ProbeType.FULL} and expects a successful {@code ResultOk}. */
+    @Test
+    public void testPdf() {
+        final DomainCookies cookies = new DomainCookies();
+        final CrawlDelayTimer delayTimer = new CrawlDelayTimer(1000);
+
+        var fetched = fetcher.fetchContent(
+                pdfUrl,
+                warcRecorder,
+                cookies,
+                delayTimer,
+                ContentTags.empty(),
+                HttpFetcher.ProbeType.FULL);
+
+        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, fetched);
+        Assertions.assertTrue(fetched.isOk());
+    }
+
+
+    private List<WarcRecord> getWarcRecords() throws IOException {
+        List<WarcRecord> records = new ArrayList<>();
+
+        System.out.println(Files.readString(warcFile));
+
+        try (var reader = new WarcReader(warcFile)) {
+            WarcXResponseReference.register(reader);
+            WarcXEntityRefused.register(reader);
+
+            for (var record : reader) {
+                // Load the body, we need to do this before we close the reader to have access to the content.
+                if (record instanceof WarcRequest req) {
+                    req.http();
+                } else if (record instanceof WarcResponse rsp) {
+                    rsp.http();
+                }
+
+                records.add(record);
+            }
+        }
+
+        return records;
+    }
+
+
+}
--- a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizerTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizerTest.java
@@ -1,9 +1,12 @@
 package nu.marginalia.crawl.retreival;

-import nu.marginalia.crawl.fetcher.Cookies;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
+import org.apache.hc.client5.http.classic.HttpClient;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
+import org.apache.hc.client5.http.impl.classic.HttpClients;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -13,8 +16,6 @@ import org.netpreserve.jwarc.WarcResponse;

 import java.io.IOException;
 import java.net.URISyntaxException;
-import java.net.http.HttpClient;
-import java.net.http.HttpRequest;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.security.NoSuchAlgorithmException;
@@ -30,8 +31,7 @@ class CrawlerWarcResynchronizerTest {
    HttpClient httpClient;
    @BeforeEach
    public void setUp() throws Exception {
-        httpClient = HttpClient.newBuilder()
-                .build();
+        httpClient = HttpClients.createDefault();

        fileName = Files.createTempFile("test", ".warc.gz");
        outputFile = Files.createTempFile("test", ".warc.gz");
@@ -45,7 +45,7 @@ class CrawlerWarcResynchronizerTest {

    @Test
    void run() throws IOException, URISyntaxException {
-        try (var oldRecorder = new WarcRecorder(fileName, new Cookies())) {
+        try (var oldRecorder = new WarcRecorder(fileName)) {
            fetchUrl(oldRecorder, "https://www.marginalia.nu/");
            fetchUrl(oldRecorder, "https://www.marginalia.nu/log/");
            fetchUrl(oldRecorder, "https://www.marginalia.nu/feed/");
@@ -55,7 +55,7 @@ class CrawlerWarcResynchronizerTest {

        var crawlFrontier = new DomainCrawlFrontier(new EdgeDomain("www.marginalia.nu"), List.of(), 100);

-        try (var newRecorder = new WarcRecorder(outputFile, new Cookies())) {
+        try (var newRecorder = new WarcRecorder(outputFile)) {
            new CrawlerWarcResynchronizer(crawlFrontier, newRecorder).run(fileName);
        }

@@ -78,11 +78,10 @@ class CrawlerWarcResynchronizerTest {
    }

    void fetchUrl(WarcRecorder recorder, String url) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
-        var req = HttpRequest.newBuilder()
-                .uri(new java.net.URI(url))
-                .header("User-agent", "test.marginalia.nu")
-                .header("Accept-Encoding", "gzip")
-                .GET().build();
-        recorder.fetch(httpClient, req);
+        HttpGet request = new HttpGet(url);
+        request.addHeader("User-agent", "test.marginalia.nu");
+        request.addHeader("Accept-Encoding", "gzip");
+
+        recorder.fetch(httpClient, new DomainCookies(), request);
    }
 }
--- a/Show More
+++ b/Show More