(sample-actor) Make content type matching lenient with regard to ct parameters such as charset

(sample-actor) Clean up debris from previous runs to avoid errors on re-runs
(sample-actor) Add progress tracking to sample export actor
2025-10-06 07:32:38 +02:00 · 2025-05-06 12:48:09 +02:00 · 2025-05-05 13:16:37 +02:00 · 2025-05-05 13:04:14 +02:00 · 2025-05-05 12:56:12 +02:00 · 2025-05-05 12:50:21 +02:00
8 changed files with 295 additions and 70 deletions
--- a/code/common/model/java/nu/marginalia/model/EdgeUrl.java
+++ b/code/common/model/java/nu/marginalia/model/EdgeUrl.java
@@ -31,7 +31,7 @@ public class EdgeUrl implements Serializable {

    private static URI parseURI(String url) throws URISyntaxException {
        try {
-            return EdgeUriFactory.uriFromString(url);
+            return EdgeUriFactory.parseURILenient(url);
        } catch (URISyntaxException ex) {
            throw new URISyntaxException("Failed to parse URI '" + url + "'", ex.getMessage());
        }
@@ -112,11 +112,32 @@ public class EdgeUrl implements Serializable {
            sb.append(port);
        }

+        EdgeUriFactory.urlencodePath(sb, path);
+
+        if (param != null) {
+            EdgeUriFactory.urlencodeQuery(sb, param);
+        }
+
+        return sb.toString();
+    }
+
+
+    public String toDisplayString() {
+        StringBuilder sb = new StringBuilder(256);
+
+        sb.append(proto);
+        sb.append("://");
+        sb.append(domain);
+
+        if (port != null) {
+            sb.append(':');
+            sb.append(port);
+        }
+
        sb.append(path);

        if (param != null) {
-            sb.append('?');
-            sb.append(param);
+            sb.append('?').append(param);
        }

        return sb.toString();
@@ -194,16 +215,19 @@ public class EdgeUrl implements Serializable {

 }

-/* Java's URI parser is a bit too strict in throwing exceptions when there's an error.
-
-   Here on the Internet, standards are like the picture on the box of the frozen pizza,
-   and what you get is more like what's on the inside, we try to patch things instead,
-   just give it a best-effort attempt att cleaning out broken or unnecessary constructions
-   like bad or missing URLEncoding
- */
 class EdgeUriFactory {
-    public static URI uriFromString(String url) throws URISyntaxException {
-        var s = new StringBuilder();
+    public static URI parseURILenient(String url) throws URISyntaxException {
+
+        if (shouldOmitUrlencodeRepair(url)) {
+            try {
+                return new URI(url);
+            }
+            catch (URISyntaxException ex) {
+                // ignore and run the lenient parser
+            }
+        }
+
+        var s = new StringBuilder(url.length()+8);

        int pathIdx = findPathIdx(url);
        if (pathIdx < 0) { // url looks like http://marginalia.nu
@@ -218,14 +242,18 @@ class EdgeUriFactory {
        int queryIdx = url.indexOf('?');
        if (queryIdx < 0) queryIdx = end;

-        recombinePaths(s, url.substring(pathIdx, queryIdx));
+        urlencodePath(s, url.substring(pathIdx, queryIdx));
        if (queryIdx < end) {
-            recombineQueryString(s, url.substring(queryIdx + 1, end));
+            urlencodeQuery(s, url.substring(queryIdx + 1, end));
        }
        return new URI(s.toString());
    }

-    private static void recombinePaths(StringBuilder sb, String path) {
+    /** Break apart the path element of a URI into its components and recombine
+     * them into a single path element again, dropping empty components.  If any
+     * component needs urlencoding, every component is urlencoded; otherwise they
+     * are all passed through unchanged.
+     */
+    public static void urlencodePath(StringBuilder sb, String path) {
        if (path == null || path.isEmpty()) {
            return;
        }
@@ -236,62 +264,96 @@ class EdgeUriFactory {
            return;
        }

+        boolean shouldUrlEncode = false;
        for (String pathPart : pathParts) {
            if (pathPart.isEmpty()) continue;

            if (needsUrlEncode(pathPart)) {
+                shouldUrlEncode = true;
+                break;
+            }
+        }
+
+        for (String pathPart : pathParts) {
+            if (pathPart.isEmpty()) continue;
+
+            if (shouldUrlEncode) {
                sb.append('/');
-                sb.append(URLEncoder.encode(pathPart, StandardCharsets.UTF_8));
+                sb.append(URLEncoder.encode(pathPart, StandardCharsets.UTF_8).replace("+", "%20"));
            } else {
                sb.append('/');
                sb.append(pathPart);
            }
        }

+        if (path.endsWith("/")) {
+            sb.append('/');
+        }
+
    }

-    private static void recombineQueryString(StringBuilder sb, String param) {
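+    /** Break apart the query element of a URI into its parameters and recombine
+     * them into a single query element again, dropping empty parameters.  If any
+     * parameter needs urlencoding, the key and value of every parameter are
+     * urlencoded; otherwise they are all passed through unchanged.
+     */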
+    public static void urlencodeQuery(StringBuilder sb, String param) {
        if (param == null || param.isEmpty()) {
            return;
        }

-        sb.append('?');
-        String[] pathParts = StringUtils.split(param, '&');
+        String[] queryParts = StringUtils.split(param, '&');
+
+        boolean shouldUrlEncode = false;
+        for (String queryPart : queryParts) {
+            if (queryPart.isEmpty()) continue;
+
+            if (needsUrlEncode(queryPart)) {
+                shouldUrlEncode = true;
+                break;
+            }
+        }
+
        boolean first = true;
-        for (String pathPart : pathParts) {
-            if (pathPart.isEmpty()) continue;
+        for (String queryPart : queryParts) {
+            if (queryPart.isEmpty()) continue;

            if (first) {
+                sb.append('?');
                first = false;
            } else {
                sb.append('&');
            }
-            if (needsUrlEncode(pathPart)) {
-                sb.append(URLEncoder.encode(pathPart, StandardCharsets.UTF_8));
+
+            if (shouldUrlEncode) {
+                int idx = queryPart.indexOf('=');
+                if (idx < 0) {
+                    sb.append(URLEncoder.encode(queryPart, StandardCharsets.UTF_8));
+                } else {
+                    sb.append(URLEncoder.encode(queryPart.substring(0, idx), StandardCharsets.UTF_8));
+                    sb.append('=');
+                    sb.append(URLEncoder.encode(queryPart.substring(idx + 1), StandardCharsets.UTF_8));
+                }
            } else {
-                sb.append(pathPart);
+                sb.append(queryPart);
            }
        }
    }

-
    /** Test if the url element needs URL encoding.
     * <p></p>
     * Note we may have been given an already encoded path element,
     * so we include % and + in the list of good characters
     */
-    private static boolean needsUrlEncode(String urlElement) {
+    static boolean needsUrlEncode(String urlElement) {
        for (int i = 0; i < urlElement.length(); i++) {
            char c = urlElement.charAt(i);

-            if (c >= 'a' && c <= 'z') continue;
-            if (c >= 'A' && c <= 'Z') continue;
-            if (c >= '0' && c <= '9') continue;
-            if ("-_.~+?=&".indexOf(c) >= 0) continue;
+            if (isUrlSafe(c)) continue;
+            if ("+".indexOf(c) >= 0) continue;
            if (c == '%' && i + 2 < urlElement.length()) {
                char c1 = urlElement.charAt(i + 1);
                char c2 = urlElement.charAt(i + 2);
-                if (c1 >= '0' && c1 <= '9' && c2 >= '0' && c2 <= '9') {
+                if (isHexDigit(c1) && isHexDigit(c2)) {
                    i += 2;
                    continue;
                }
@@ -303,10 +365,90 @@ class EdgeUriFactory {
        return false;
    }

+
+    /** Test if the character may appear in a URL without being urlencoded,
+     * that is, whether it is an ASCII letter, an ASCII digit, or one of
+     * '-', '_', '.' and '~'.
+     */
+    static boolean isUrlSafe(int c) {
+        boolean isLetter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+        boolean isDigit = c >= '0' && c <= '9';
+
+        return isLetter || isDigit || c == '-' || c == '_' || c == '.' || c == '~';
+    }
+
+    /** Test if the URL is a valid URL that does not need to be
+     * urlencoded.
+     * <p></p>
+     * This is a very simple heuristic test that does not guarantee
+     * that the URL is valid, but it will identify cases where we
+     * are fairly certain that the URL does not need encoding,
+     * so we can skip a bunch of allocations and string operations
+     * that would otherwise be needed to fix the URL.
+     * <p></p>
+     * The test errs on the side of returning false.  In particular,
+     * URLs with a fragment, with empty path segments, or with empty
+     * query parameters are reported as needing repair, since the
+     * repairing parser removes those and the caller would otherwise
+     * get a different result depending on which code path was taken.
+     *
+     * @param url the raw URL string, of any length
+     * @return true if the URL consists of an alphabetic scheme followed by "://"
+     *         and only contains characters that are safe in their position
+     */
+    static boolean shouldOmitUrlencodeRepair(String url) {
+        final int len = url.length();
+        int idx = 0;
+
+        // Validate the scheme, which must be terminated by a colon
+        boolean schemeTerminated = false;
+        while (idx < len) {
+            char c = url.charAt(idx++);
+            if (c == ':') {
+                schemeTerminated = true;
+                break;
+            }
+            if (!isAsciiAlphabetic(c)) return false;
+        }
+
+        // Bail out rather than read past the end of short or scheme-less input
+        if (!schemeTerminated || len - idx < 2) return false;
+
+        if (url.charAt(idx++) != '/') return false;
+        if (url.charAt(idx++) != '/') return false;
+
+        // Validate the authority
+        while (idx < len) {
+            char c = url.charAt(idx++);
+            if (c == '/') break;
+            if (c == ':') continue;
+            if (c == '@') continue;
+            if (!isUrlSafe(c)) return false;
+        }
+
+        if (idx >= len) return true;
+
+        // Validate the path; the leading slash has already been consumed,
+        // so another slash right away means an empty path segment
+        boolean hasQuery = false;
+        boolean emptySegment = true;
+        while (idx < len) {
+            char c = url.charAt(idx++);
+            if (c == '?') {
+                hasQuery = true;
+                break;
+            }
+            if (c == '/') {
+                if (emptySegment) return false;
+                emptySegment = true;
+                continue;
+            }
+            // This also rejects '#', leaving fragment removal to the repairing parser
+            if (!isUrlSafe(c)) return false;
+            emptySegment = false;
+        }
+
+        if (!hasQuery) return true;
+
+        // Validate the query
+        boolean emptyParam = true;
+        while (idx < len) {
+            char c = url.charAt(idx++);
+            if (c == '&') {
+                if (emptyParam) return false;
+                emptyParam = true;
+                continue;
+            }
+            if (c != '=' && !isUrlSafe(c)) return false;
+            emptyParam = false;
+        }
+
+        // An empty query or a trailing '&' is left for the repairing parser to clean up
+        return !emptyParam;
+    }
+
+
+    /** Test if the character is an ASCII letter, a-z or A-Z. */
+    private static boolean isAsciiAlphabetic(int c) {
+        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+    }
+
+    /** Test if the character is an ASCII hexadecimal digit, 0-9, a-f or A-F. */
+    private static boolean isHexDigit(int c) {
+        if (c >= '0' && c <= '9') return true;
+        if (c >= 'a' && c <= 'f') return true;
+
+        return c >= 'A' && c <= 'F';
+    }
+
+    /** Find the index of the path element in a URL.
+     * <p></p>
+     * The search starts three characters past the first colon, which for a
+     * URL on the form scheme://authority/path skips the "://" separator, and
+     * the path is taken to begin at the first slash found from there on.
+     *
+     * @param url the URL string to examine
+     * @return the index of the slash that starts the path, or -1 if there is
+     *         no slash after the scheme separator (e.g. http://marginalia.nu)
+     * @throws URISyntaxException if the URL has no colon, or if the colon is
+     *         followed by three or fewer characters
+     */
    private static int findPathIdx(String url) throws URISyntaxException {
        int colonIdx = url.indexOf(':');
        if (colonIdx < 0 || colonIdx + 3 >= url.length()) {
-            throw new URISyntaxException(url, "Lacking protocol");
+            throw new URISyntaxException(url, "Lacking scheme");
        }
        return url.indexOf('/', colonIdx + 3);
    }
--- a/code/common/model/test/nu/marginalia/model/EdgeUrlTest.java
+++ b/code/common/model/test/nu/marginalia/model/EdgeUrlTest.java
@@ -24,21 +24,66 @@ class EdgeUrlTest {

    @Test
    void testUriFromString() throws URISyntaxException {
-        Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.uriFromString("https://www.example.com/#heredoc").toString());
-        Assertions.assertEquals("https://www.example.com/%25-sign", EdgeUriFactory.uriFromString("https://www.example.com/%-sign").toString());
-        Assertions.assertEquals("https://www.example.com/%22-sign", EdgeUriFactory.uriFromString("https://www.example.com/%22-sign").toString());
-        Assertions.assertEquals("https://www.example.com/%0A+%22huh%22", EdgeUriFactory.uriFromString("https://www.example.com/\n \"huh\"").toString());
-        Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", EdgeUriFactory.uriFromString("https://en.wikipedia.org/wiki/Sámi").toString());
+        // We test these URLs several times, as we perform URLEncode-fixing both when parsing the URL and when
+        // converting it back to a string, and we want to ensure there are no changes along the way.
+
+        Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/").getPath());
+        Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/").toString());
+        Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/").toString());
+
+        Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").getPath());
+        Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").toString());
+        Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/#heredoc").toString());
+
+        Assertions.assertEquals("/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").getPath());
+        Assertions.assertEquals("https://www.example.com/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").toString());
+        Assertions.assertEquals("https://www.example.com/trailingslash/", new EdgeUrl("https://www.example.com/trailingslash/").toString());
+
+        Assertions.assertEquals("/%-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").getPath());
+        Assertions.assertEquals("https://www.example.com/%25-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").toString());
+        Assertions.assertEquals("https://www.example.com/%25-sign", new EdgeUrl("https://www.example.com/%-sign").toString());
+
+        Assertions.assertEquals("/%-sign/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com//%-sign/\"-sign").getPath());
+        Assertions.assertEquals("https://www.example.com/%25-sign/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com//%-sign/\"-sign").toString());
+        Assertions.assertEquals("https://www.example.com/%25-sign/%22-sign", new EdgeUrl("https://www.example.com//%-sign/\"-sign").toString());
+
+        Assertions.assertEquals("/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").getPath());
+        Assertions.assertEquals("https://www.example.com/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").toString());
+        Assertions.assertEquals("https://www.example.com/%22-sign", new EdgeUrl("https://www.example.com/%22-sign").toString());
+
+        Assertions.assertEquals("/\n \"huh\"", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").getPath());
+        Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").toString());
+        Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", new EdgeUrl("https://www.example.com/\n \"huh\"").toString());
+
+        Assertions.assertEquals("/wiki/Sámi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").getPath());
+        Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").toString());
+        Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", new EdgeUrl("https://en.wikipedia.org/wiki/Sámi").toString());
+
+        Assertions.assertEquals("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k", new EdgeUrl("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k").toString());
    }

    @Test
    void testParms() throws URISyntaxException {
        Assertions.assertEquals("id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").param);
+        Assertions.assertEquals("https://search.marginalia.nu/?id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").toString());
+
        Assertions.assertEquals("t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").param);
+        Assertions.assertEquals("https://search.marginalia.nu/?t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").toString());
+
        Assertions.assertEquals("v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").param);
+        Assertions.assertEquals("https://search.marginalia.nu/?v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").toString());
+
        Assertions.assertEquals("id=1", new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").param);
+        Assertions.assertEquals("https://memex.marginalia.nu/showthread.php?id=1",
+                new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
+
+
        Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").param);
+        Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").toString());
+
        Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").param);
+        Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").toString());
+
        Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?m=123").param);
        Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?follow=123").param);
    }
--- a/code/common/service/java/nu/marginalia/service/server/JoobyService.java
+++ b/code/common/service/java/nu/marginalia/service/server/JoobyService.java
@@ -122,6 +122,11 @@ public class JoobyService {
        // single digit percentage difference since HTML already compresses very well with level = 1.
        options.setCompressionLevel(1);

+        // Set a cap on the number of worker threads, as Jooby's default value does not seem to consider
+        // multi-tenant servers with high thread counts, and spins up an exorbitant number of threads in that
+        // scenario
+        options.setWorkerThreads(Math.min(128, options.getWorkerThreads()));
+

        jooby.setServerOptions(options);

--- a/code/processes/export-task-process/java/nu/marginalia/extractor/SampleDataExporter.java
+++ b/code/processes/export-task-process/java/nu/marginalia/extractor/SampleDataExporter.java
@@ -1,6 +1,7 @@
 package nu.marginalia.extractor;

 import com.google.inject.Inject;
+import nu.marginalia.process.control.ProcessHeartbeat;
 import nu.marginalia.process.log.WorkLog;
 import nu.marginalia.process.log.WorkLogEntry;
 import nu.marginalia.slop.SlopCrawlDataRecord;
@@ -23,14 +24,18 @@ import java.sql.SQLException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
+import java.util.NoSuchElementException;

 public class SampleDataExporter {
    private final FileStorageService storageService;
+    private final ProcessHeartbeat processHeartbeat;

    @Inject
-    public SampleDataExporter(FileStorageService storageService) {
+    public SampleDataExporter(FileStorageService storageService, ProcessHeartbeat processHeartbeat) {
        this.storageService = storageService;
+        this.processHeartbeat = processHeartbeat;
    }
+
    public void export(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name) throws SQLException, IOException {
        FileStorage destStorage = storageService.getStorage(destId);
        Path inputDir = storageService.getStorage(crawlId).asPath();
@@ -59,12 +64,6 @@ public class SampleDataExporter {
        Path newCrawlerLogFile = Files.createTempFile(destStorage.asPath(), "crawler", ".log",
                PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));

-        try (var bw = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
-            for (var item : entriesAll) {
-                bw.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
-            }
-        }
-
        Path newManifestJsonFile = Files.createTempFile(destStorage.asPath(), "manifest", ".json",
                PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
        Files.writeString(newManifestJsonFile, " { \"description\": \"" + name.replace("[\"\\]", "_") + "\",\n      \"type\": \"CRAWL_DATA\" }\n");
@@ -72,24 +71,31 @@ public class SampleDataExporter {
        var tmpTarFile = Files.createTempFile(destStorage.asPath(), "data", ".tar",
                PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));

-        try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))) {
-            for (var item : entriesAll) {
+        try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING));
+             var logWriter = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
+             var hb = processHeartbeat.createAdHocTaskHeartbeat("Generating Sample")
+        ) {
+            for (var item : hb.wrap("Scanning", entriesAll)) {
                Path crawlDataPath = inputDir.resolve(item.relPath());
                if (!Files.exists(crawlDataPath)) continue;

                if (StringUtils.isBlank(ctFilter)) {
                    addFileToTar(stream, crawlDataPath, item.relPath());
+                    logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
                }
                else /* filter != null */ {
-                    boolean didFilterData = false;
+                    Path filteredData = null;
                    try {
-                        crawlDataPath = filterEntries(crawlDataPath, ctFilter);
-                        didFilterData = true;
-                        addFileToTar(stream, crawlDataPath, item.relPath());
+                        filteredData = filterEntries(crawlDataPath, ctFilter);
+                        addFileToTar(stream, filteredData, item.relPath());
+                        logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
+                    }
+                    catch (NoSuchElementException ex) {
+                        // Ignore
                    }
                    finally {
-                        if (didFilterData) {
-                            Files.deleteIfExists(crawlDataPath);
+                        if (filteredData != null) {
+                            Files.deleteIfExists(filteredData);
                        }
                    }
                }
@@ -106,34 +112,36 @@ public class SampleDataExporter {
        Files.move(tmpTarFile, destStorage.asPath().resolve("crawl-data.tar"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
    }

-    /** Filters the entries in the crawl data file based on the content type.
-     * @param crawlDataPath The path to the crawl data file.
-     * @param contentTypeFilter The content type to filter by.
-     * @return The path to the filtered crawl data file, or null if an error occurred.
-     */
-    private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException {
+    /** Filters the entries in the crawl data file based on the content type. */
+    private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException, NoSuchElementException {
        Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered");
        Path tempFile = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered.slop.zip");

+        // We may have debris from a previous run, so let's clean it up
+        if (Files.isDirectory(tempDir)) {
+            FileUtils.deleteDirectory(tempDir.toFile());
+        }
        Files.createDirectory(tempDir);

        try (var writer = new SlopCrawlDataRecord.Writer(tempDir);
             var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) {
                 @Override
                 public boolean filter(String url, int status, String contentType) {
-                     if (contentTypeFilter.equals(contentType))
-                         return true;
-                     else if (contentType.startsWith("x-marginalia/"))
-                         // This is a metadata entry, typically domain or redirect information
-                         // let's keep those to not confuse the consumer of the data, which might
-                         // expect at least the domain summary
-                         return true;
-                     return false;
+                     return matchContentTypeHeaderWithMime(contentType, contentTypeFilter)
+                                || contentType.startsWith("x-marginalia/"); // metadata records
                 }
             }
        ) {
+            boolean wroteEntry = false;
            while (reader.hasRemaining()) {
-                writer.write(reader.get());
+                var entry = reader.get();
+                writer.write(entry);
+
+                // Count relevant entries with the same lenient comparison that the reader's filter
+                // uses; an exact comparison would miss records whose content type carries parameters
+                // such as a charset, and the file would be rejected despite containing matches.
+                // Metadata records (x-marginalia/...) deliberately do not count as relevant.
+                wroteEntry = wroteEntry || matchContentTypeHeaderWithMime(entry.contentType(), contentTypeFilter);
+            }
+
+            if (!wroteEntry) {
+                throw new NoSuchElementException("No relevant entries");
            }

            SlopTablePacker.packToSlopZip(tempDir, tempFile);
@@ -146,6 +154,21 @@ public class SampleDataExporter {
        return tempFile;
    }

+    /** Test if a content type header value denotes the given mime type,
+     * disregarding any parameters such as charset.
+     *
+     * @param contentType the content type header value, may be null
+     * @param mime the bare mime type to compare against, e.g. text/html
+     * @return true if the media type part of the header equals the mime type,
+     *         ignoring case and surrounding whitespace; false if contentType is null
+     */
+    private boolean matchContentTypeHeaderWithMime(String contentType, String mime) {
+        if (null == contentType) {
+            return false;
+        }
+
+        /* The content type header may have a charset or other parameters after a
+         * semicolon, so only the part before the first semicolon is compared. */
+
+        int semicolonIndex = contentType.indexOf(';');
+        String mediaType = semicolonIndex >= 0
+                ? contentType.substring(0, semicolonIndex)
+                : contentType;
+
+        // Media types are case-insensitive, and whitespace is permitted before the semicolon
+        return mediaType.strip().equalsIgnoreCase(mime);
+    }
+
    private void addFileToTar(TarArchiveOutputStream outputStream, Path file, String fileName) throws IOException {
        var entry = outputStream.createArchiveEntry(file.toFile(), fileName);
        entry.setSize(Files.size(file));
--- a/code/services-application/search-service/java/nu/marginalia/search/model/UrlDetails.java
+++ b/code/services-application/search-service/java/nu/marginalia/search/model/UrlDetails.java
@@ -180,7 +180,7 @@ public class UrlDetails implements Comparable<UrlDetails> {
     * semantically meaningful codepoints into entity codes */
    public String displayUrl() {
        StringBuilder sb = new StringBuilder();
-        String urlStr = url.toString();
+        String urlStr = url.toDisplayString();
        for (int i = 0; i < urlStr.length(); i++) {
            char c = urlStr.charAt(i);

--- a/code/services-application/status-service/java/nu/marginalia/status/StatusModule.java
+++ b/code/services-application/status-service/java/nu/marginalia/status/StatusModule.java
@@ -20,6 +20,6 @@ public class StatusModule extends AbstractModule {
        bind(String.class)
                .annotatedWith(Names.named("searchEngineTestQuery"))
                .toInstance(System.getProperty("status-service.public-query",
-                        "https://search.marginalia.nu/search?query=plato&ref=marginalia-automatic-metrics"));
+                        "https://marginalia-search.com/search?query=plato&ref=marginalia-automatic-metrics"));
    }
 }
--- a/deploy.txt
+++ b/deploy.txt
@@ -4,3 +4,6 @@
 2025-01-07:  Deploy executor.
 2025-04-24:  Deploy executor.
 2025-04-24:  Deploy assistant.
+2025-05-04:  Deploy qs, search and api-services.
+2025-05-05:  Deploy executor partition 4.
+2025-05-05:  Deploy control.
--- a/tools/deployment/deployment.py
+++ b/tools/deployment/deployment.py
@@ -314,6 +314,13 @@ if __name__ == '__main__':
            deploy_tier=0,
            groups={"all", "core"}
        ),
+        'status': ServiceConfig(
+            gradle_target=':code:services-application:status-service:docker',
+            docker_name='status-service',
+            instances=None,
+            deploy_tier=4,
+            groups={"all"}
+        ),
        'query': ServiceConfig(
            gradle_target=':code:services-core:query-service:docker',
            docker_name='query-service',
Author	SHA1	Message	Date
Viktor Lofgren	fa32dddc24	(sample-actor) Make content type matching lenient with regard to ct parameters such as charset	2025-05-06 12:48:09 +02:00
Viktor Lofgren	a266fcbf30	(sample-actor) Clean up debris from previous runs to avoid errors on re-runs	2025-05-05 13:16:37 +02:00
Viktor Lofgren	6e47e58e0e	(sample-actor) Add progress tracking to sample export actor	2025-05-05 13:04:14 +02:00
Viktor Lofgren	9dc43d8b4a	(sample-actor) Update the actor export sample actor to not generate empty files when the filter is not applicable.	2025-05-05 12:56:12 +02:00
Viktor Lofgren	83967e3305	(sample-actor) Update the actor export sample actor to not generate empty files when the filter is not applicable.	2025-05-05 12:50:21 +02:00
Viktor Lofgren	4db980a291	(jooby-service) Set an upper limit on the number of worker threads	2025-05-05 12:40:31 +02:00
Viktor Lofgren	089b177868	(deploy) Executor partition 4.	2025-05-05 12:21:27 +02:00
Viktor Lofgren	9c8e9a68d5	(deploy) Executor partition 4.	2025-05-05 12:00:05 +02:00
Viktor Lofgren	413d5cc788	(url, minor) Fix typo in test	2025-05-04 16:28:30 +02:00
Viktor Lofgren	58539b92ac	(search) Don't show addresses with URLencoding in the UI	2025-05-04 16:26:39 +02:00
Viktor Lofgren	fe72f16df1	(url) Add additional tests for parameter handling	2025-05-04 16:23:39 +02:00
Viktor Lofgren	b49a244a2e	(url) Fix encoding handling of query parameters	2025-05-04 16:18:47 +02:00
Viktor Lofgren	3f0b4c010f	(deploy) Fix deploy script to be aware of the status service	2025-05-04 16:14:07 +02:00
Viktor Lofgren	c6e0cd93f7	(status) Fix status service to poll the new domain	2025-05-04 16:11:08 +02:00
Viktor Lofgren	80a7ccb080	Trigger redeploy of qs, search and api	2025-05-04 16:07:28 +02:00
Viktor Lofgren	54dec347c4	(url) Fix urlencoding issues with certain symbols Optimize the code by adding a simple heuristic for guessing whether we need to repair the URI before we pass it to Java's parser.	2025-05-04 13:39:39 +02:00
Viktor Lofgren	d6ee3f0785	(url) Fix urlencoding issues with certain symbols The urlencoding logic would consider the need to urlencode on an element basis, which is incorrect. Even if we urlencode on an element basis, we should either urlencode or not urlencode, never a mix of the two.	2025-05-04 13:08:49 +02:00
Viktor Lofgren	8be88afcf3	(url) Fix urlencoding issues with certain symbols We also need to apply the fix when performing toString() on the EdgeUrl, the URI class will URLDecode the input. The change also alters the parseURI method to only run the URLEncode-fixer during parsing if URI doesn't throw an exception. This bad path is obviously going to be slower, but realistically, most URLs are valid, so it's probably a significant optimization to do it like this.	2025-05-04 12:58:13 +02:00
Viktor Lofgren	0e3c00d3e1	(url) Fix urlencoding issues with certain symbols Minor fix of issue where url sanitizer would strip some trailing slashes.	2025-05-03 23:58:28 +02:00
Viktor Lofgren	4279a7f1aa	(url) Fix urlencoding issues with certain symbols Minor fix with previously urlencoded codepoints, we need to account for the fact that they are encoded in hexadecimal.	2025-05-03 23:51:39 +02:00