(sample) Fix bug where slop files would not be saved despite containing data

(crawler) Fix outdated assumptions about content types, and about successful HTTP responses always having status code 200.
We now sometimes get 206 (Partial Content) for successful fetches.
2025-10-06 07:32:38 +02:00 · 2025-05-06 13:38:21 +02:00 · 2025-05-06 13:18:30 +02:00 · 2025-05-06 13:11:52 +02:00 · 2025-05-06 12:56:30 +02:00 · 2025-05-06 12:48:09 +02:00
6 changed files with 48 additions and 46 deletions
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
@@ -448,13 +448,7 @@ public class CrawlerMain extends ProcessMainClass {
            // We don't have a lock, so we can't run this task
            // we return to avoid blocking the pool for too long
            if (lock.isEmpty()) {
-                if (retryQueue.remainingCapacity() > 0) {
-                    // Sleep a moment to avoid busy looping via the retry queue
-                    // in the case when few tasks remain and almost all are ineligible for
-                    // immediate restart
-                    Thread.sleep(5);
-                }
-
+                pendingCrawlTasks.remove(domain);
                retryQueue.put(this);
                return;
            }
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
@@ -74,7 +74,7 @@ public class CrawlerRevisitor {

            // If the reference document is empty or the HTTP status is not 200, we'll skip it since it's
            // unlikely to produce anything meaningful for us.
-            if (doc.httpStatus != 200)
+            if (doc.httpStatus != 200 && doc.httpStatus != 206)
                continue;
            if (!doc.hasBody())
                continue;
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java
@@ -58,7 +58,7 @@ public record DocumentWithReference(
        if (null == doc)
            return ContentTags.empty();

-        if (doc.documentBodyBytes.length == 0 || doc.httpStatus != 200)
+        if (doc.documentBodyBytes.length == 0 || (doc.httpStatus != 200 && doc.httpStatus != 206))
            return ContentTags.empty();

        String lastmod = doc.getLastModified();
--- a/code/processes/crawling-process/model/java/nu/marginalia/ContentTypes.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/ContentTypes.java
@@ -1,5 +1,7 @@
 package nu.marginalia;

+import org.apache.commons.lang3.StringUtils;
+
 import java.util.Set;

 public class ContentTypes {
@@ -11,9 +13,9 @@ public class ContentTypes {
            "text/plain");

    public static boolean isAccepted(String contentTypeHeader) {
-        String lcHeader = contentTypeHeader.toLowerCase();
+        String lcHeader = StringUtils.substringBefore(contentTypeHeader.toLowerCase(), ';');
        for (var type : acceptedContentTypes) {
-            if (lcHeader.startsWith(type)) {
+            if (lcHeader.equals(type)) {
                return true;
            }
        }
@@ -21,7 +23,7 @@ public class ContentTypes {
    }

    public static boolean isBinary(String contentTypeHeader) {
-        String lcHeader = contentTypeHeader.toLowerCase();
+        String lcHeader = StringUtils.substringBefore(contentTypeHeader.toLowerCase(), ';');
        return lcHeader.startsWith("application/pdf");
    }

--- a/code/processes/crawling-process/model/java/nu/marginalia/slop/SlopCrawlDataRecord.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/slop/SlopCrawlDataRecord.java
@@ -277,7 +277,8 @@ public record SlopCrawlDataRecord(String domain,
        try (var table = new SlopTable(path)) {
            ShortColumn.Reader statusReader = statusColumn.open(table);
            while (statusReader.hasRemaining()) {
-                if (statusReader.get() == 200) {
+                int status = statusReader.get();
+                if (status == 200 || status == 206) {
                    cnt++;
                }
            }
--- a/code/processes/export-task-process/java/nu/marginalia/extractor/SampleDataExporter.java
+++ b/code/processes/export-task-process/java/nu/marginalia/extractor/SampleDataExporter.java
@@ -1,6 +1,7 @@
 package nu.marginalia.extractor;

 import com.google.inject.Inject;
+import nu.marginalia.process.control.ProcessHeartbeat;
 import nu.marginalia.process.log.WorkLog;
 import nu.marginalia.process.log.WorkLogEntry;
 import nu.marginalia.slop.SlopCrawlDataRecord;
@@ -20,17 +21,18 @@ import java.nio.file.StandardCopyOption;
 import java.nio.file.StandardOpenOption;
 import java.nio.file.attribute.PosixFilePermissions;
 import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
+import java.util.*;

 public class SampleDataExporter {
    private final FileStorageService storageService;
+    private final ProcessHeartbeat processHeartbeat;

    @Inject
-    public SampleDataExporter(FileStorageService storageService) {
+    public SampleDataExporter(FileStorageService storageService, ProcessHeartbeat processHeartbeat) {
        this.storageService = storageService;
+        this.processHeartbeat = processHeartbeat;
    }
+
    public void export(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name) throws SQLException, IOException {
        FileStorage destStorage = storageService.getStorage(destId);
        Path inputDir = storageService.getStorage(crawlId).asPath();
@@ -59,12 +61,6 @@ public class SampleDataExporter {
        Path newCrawlerLogFile = Files.createTempFile(destStorage.asPath(), "crawler", ".log",
                PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));

-        try (var bw = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
-            for (var item : entriesAll) {
-                bw.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
-            }
-        }
-
        Path newManifestJsonFile = Files.createTempFile(destStorage.asPath(), "manifest", ".json",
                PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
        Files.writeString(newManifestJsonFile, " { \"description\": \"" + name.replace("[\"\\]", "_") + "\",\n      \"type\": \"CRAWL_DATA\" }\n");
@@ -72,24 +68,31 @@ public class SampleDataExporter {
        var tmpTarFile = Files.createTempFile(destStorage.asPath(), "data", ".tar",
                PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));

-        try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))) {
-            for (var item : entriesAll) {
+        try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING));
+             var logWriter = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
+             var hb = processHeartbeat.createAdHocTaskHeartbeat("Generating Sample")
+        ) {
+            for (var item : hb.wrap("Scanning", entriesAll)) {
                Path crawlDataPath = inputDir.resolve(item.relPath());
                if (!Files.exists(crawlDataPath)) continue;

                if (StringUtils.isBlank(ctFilter)) {
                    addFileToTar(stream, crawlDataPath, item.relPath());
+                    logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
                }
                else /* filter != null */ {
-                    boolean didFilterData = false;
+                    Path filteredData = null;
                    try {
-                        crawlDataPath = filterEntries(crawlDataPath, ctFilter);
-                        didFilterData = true;
-                        addFileToTar(stream, crawlDataPath, item.relPath());
+                        filteredData = filterEntries(crawlDataPath, ctFilter);
+                        addFileToTar(stream, filteredData, item.relPath());
+                        logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
+                    }
+                    catch (NoSuchElementException ex) {
+                        // Ignore
                    }
                    finally {
-                        if (didFilterData) {
-                            Files.deleteIfExists(crawlDataPath);
+                        if (filteredData != null) {
+                            Files.deleteIfExists(filteredData);
                        }
                    }
                }
@@ -106,34 +109,36 @@ public class SampleDataExporter {
        Files.move(tmpTarFile, destStorage.asPath().resolve("crawl-data.tar"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
    }

-    /** Filters the entries in the crawl data file based on the content type.
-     * @param crawlDataPath The path to the crawl data file.
-     * @param contentTypeFilter The content type to filter by.
-     * @return The path to the filtered crawl data file, or null if an error occurred.
-     */
-    private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException {
+    /** Filters the entries in the crawl data file based on the content type. */
+    private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException, NoSuchElementException {
        Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered");
        Path tempFile = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered.slop.zip");

+        // We may have debris from a previous run, so let's clean it up
+        if (Files.isDirectory(tempDir)) {
+            FileUtils.deleteDirectory(tempDir.toFile());
+        }
        Files.createDirectory(tempDir);

        try (var writer = new SlopCrawlDataRecord.Writer(tempDir);
             var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) {
                 @Override
                 public boolean filter(String url, int status, String contentType) {
-                     if (contentTypeFilter.equals(contentType))
-                         return true;
-                     else if (contentType.startsWith("x-marginalia/"))
-                         // This is a metadata entry, typically domain or redirect information
-                         // let's keep those to not confuse the consumer of the data, which might
-                         // expect at least the domain summary
-                         return true;
-                     return false;
+                     return Objects.equals(StringUtils.substringBefore(contentType, ';'), contentTypeFilter)
+                                || contentType.startsWith("x-marginalia/"); // metadata records
                 }
             }
        ) {
+            boolean wroteEntry = false;
            while (reader.hasRemaining()) {
-                writer.write(reader.get());
+                var entry = reader.get();
+                writer.write(entry);
+
+                wroteEntry = wroteEntry || Objects.equals(StringUtils.substringBefore(entry.contentType(), ';'), contentTypeFilter);
+            }
+
+            if (!wroteEntry) {
+                throw new NoSuchElementException("No relevant entries");
            }

            SlopTablePacker.packToSlopZip(tempDir, tempFile);
Author	SHA1	Message	Date
Viktor Lofgren	18700e1919	(sample) Fix bug where slop files would not be saved despite containing data	2025-05-06 13:38:21 +02:00
Viktor Lofgren	120b431998	(crawler) Fix outdated assumptions about content types and http status codes always being 200 when good. We now sometimes get 206 when good.	2025-05-06 13:18:30 +02:00
Viktor Lofgren	71dad99326	(crawler) Revisitor should not demand a 200, but support a 206 as well	2025-05-06 13:11:52 +02:00
Viktor Lofgren	c1e8afdf86	(crawler) Remove domains from pending crawl tasks queue when retrying	2025-05-06 12:56:30 +02:00
Viktor Lofgren	fa32dddc24	(sample-actor) Make content type matching lenient with regard to ct parameters such as charset	2025-05-06 12:48:09 +02:00
Viktor Lofgren	a266fcbf30	(sample-actor) Clean up debris from previous runs to avoid errors on re-runs	2025-05-05 13:16:37 +02:00
Viktor Lofgren	6e47e58e0e	(sample-actor) Add progress tracking to sample export actor	2025-05-05 13:04:14 +02:00
Viktor Lofgren	9dc43d8b4a	(sample-actor) Update the sample export actor to not generate empty files when the filter is not applicable.	2025-05-05 12:56:12 +02:00
Viktor Lofgren	83967e3305	(sample-actor) Update the sample export actor to not generate empty files when the filter is not applicable.	2025-05-05 12:50:21 +02:00