(sample) Ensure we finalize the slop.zip file creation when filtering

(sample) Ensure we flush the log before adding it to the tar file
(crawler) Test case for fetching PDFs
2025-10-06 17:32:39 +02:00 · 2025-05-06 14:52:48 +02:00 · 2025-05-06 14:43:47 +02:00 · 2025-05-06 13:45:16 +02:00 · 2025-05-06 13:38:21 +02:00 · 2025-05-06 13:18:30 +02:00
8 changed files with 122 additions and 33 deletions
--- a/code/processes/crawling-process/build.gradle
+++ b/code/processes/crawling-process/build.gradle
@@ -67,8 +67,6 @@ dependencies {
    testImplementation libs.mockito
    testImplementation libs.wiremock

-
-
    testImplementation project(':code:processes:test-data')
 }

--- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
@@ -448,13 +448,7 @@ public class CrawlerMain extends ProcessMainClass {
            // We don't have a lock, so we can't run this task
            // we return to avoid blocking the pool for too long
            if (lock.isEmpty()) {
-                if (retryQueue.remainingCapacity() > 0) {
-                    // Sleep a moment to avoid busy looping via the retry queue
-                    // in the case when few tasks remain and almost all are ineligible for
-                    // immediate restart
-                    Thread.sleep(5);
-                }
-
+                pendingCrawlTasks.remove(domain);
                retryQueue.put(this);
                return;
            }
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
@@ -74,7 +74,7 @@ public class CrawlerRevisitor {

            // If the reference document is empty or the HTTP status is not 200, we'll skip it since it's
            // unlikely to produce anything meaningful for us.
-            if (doc.httpStatus != 200)
+            if (doc.httpStatus != 200 && doc.httpStatus != 206)
                continue;
            if (!doc.hasBody())
                continue;
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java
@@ -58,7 +58,7 @@ public record DocumentWithReference(
        if (null == doc)
            return ContentTags.empty();

-        if (doc.documentBodyBytes.length == 0 || doc.httpStatus != 200)
+        if (doc.documentBodyBytes.length == 0 || (doc.httpStatus != 200 && doc.httpStatus != 206))
            return ContentTags.empty();

        String lastmod = doc.getLastModified();
--- a/code/processes/crawling-process/model/java/nu/marginalia/ContentTypes.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/ContentTypes.java
@@ -1,5 +1,7 @@
 package nu.marginalia;

+import org.apache.commons.lang3.StringUtils;
+
 import java.util.Set;

 public class ContentTypes {
@@ -11,9 +13,9 @@ public class ContentTypes {
            "text/plain");

    public static boolean isAccepted(String contentTypeHeader) {
-        String lcHeader = contentTypeHeader.toLowerCase();
+        String lcHeader = StringUtils.substringBefore(contentTypeHeader.toLowerCase(), ';');
        for (var type : acceptedContentTypes) {
-            if (lcHeader.startsWith(type)) {
+            if (lcHeader.equals(type)) {
                return true;
            }
        }
@@ -21,7 +23,7 @@ public class ContentTypes {
    }

    public static boolean isBinary(String contentTypeHeader) {
-        String lcHeader = contentTypeHeader.toLowerCase();
+        String lcHeader = StringUtils.substringBefore(contentTypeHeader.toLowerCase(), ';');
        return lcHeader.startsWith("application/pdf");
    }

--- a/code/processes/crawling-process/model/java/nu/marginalia/slop/SlopCrawlDataRecord.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/slop/SlopCrawlDataRecord.java
@@ -277,7 +277,8 @@ public record SlopCrawlDataRecord(String domain,
        try (var table = new SlopTable(path)) {
            ShortColumn.Reader statusReader = statusColumn.open(table);
            while (statusReader.hasRemaining()) {
-                if (statusReader.get() == 200) {
+                int status = statusReader.get();
+                if (status == 200 || status == 206) {
                    cnt++;
                }
            }
--- a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
@@ -117,6 +117,86 @@ class CrawlerRetreiverTest {
        }
    }

+
+    @Test
+    public void testWarcOutputPDF() throws IOException {
+        var specs = CrawlerMain.CrawlSpecRecord
+                .builder()
+                .crawlDepth(5)
+                .domain("www.marginalia.nu")
+                .urls(List.of("https://www.marginalia.nu/junk/test.pdf"))
+                .build();
+        Path tempFile = null;
+        Path slopFile = null;
+        try {
+            tempFile = Files.createTempFile("crawling-process", "warc");
+            slopFile = Files.createTempFile("crawling-process", ".slop.zip");
+
+            doCrawl(tempFile, specs);
+
+            Set<String> requests = new HashSet<>();
+            Set<String> responses = new HashSet<>();
+
+            // Inspect the WARC file
+            try (var reader = new WarcReader(tempFile)) {
+                reader.forEach(record -> {
+                    if (record instanceof WarcRequest req) {
+                        requests.add(req.target());
+                        System.out.println(req.type() + ":" + req.target());
+                    }
+                    else if (record instanceof WarcResponse rsp) {
+                        responses.add(rsp.target());
+                        System.out.println(rsp.type() + ":" + rsp.target());
+                    }
+                    else {
+                        System.out.println(record.type());
+                    }
+                });
+            }
+
+            assertTrue(requests.contains("https://www.marginalia.nu/junk/test.pdf"));
+            assertEquals(requests, responses);
+
+            // Convert the WARC file to a Slop file
+            SlopCrawlDataRecord
+                    .convertWarc("www.marginalia.nu", new UserAgent("test.marginalia.nu", "test.marginalia.nu"), tempFile, slopFile);
+
+            CrawledDomain domain = null;
+            Map<String, CrawledDocument> documents = new HashMap<>();
+
+            // Extract the contents of the Slop file
+            try (var stream = SerializableCrawlDataStream.openDataStream(slopFile)) {
+                while (stream.hasNext()) {
+                    var doc = stream.next();
+                    if (doc instanceof CrawledDomain dr) {
+                        assertNull(domain);
+                        domain = dr;
+                    }
+                    else if (doc instanceof CrawledDocument dc) {
+                        System.out.println(dc.url + "\t" + dc.crawlerStatus + "\t" + dc.httpStatus);
+                        documents.put(dc.url, dc);
+                    }
+                }
+            } catch (Exception e) {
+                throw new RuntimeException(e);
+            }
+
+            // Verify we have a PDF in the Slop file
+            assertNotNull(domain);
+            var pdfDoc = documents.get("https://www.marginalia.nu/junk/test.pdf");
+            assertNotNull(pdfDoc);
+            assertEquals("https://www.marginalia.nu/junk/test.pdf", pdfDoc.url);
+            assertEquals(206, pdfDoc.httpStatus);
+            assertTrue(pdfDoc.documentBodyBytes.length > 100);
+        }
+        finally {
+            if (tempFile != null)
+                Files.deleteIfExists(tempFile);
+            if (slopFile != null)
+                Files.deleteIfExists(slopFile);
+        }
+    }
+
    @Test
    public void testWarcOutputNoKnownUrls() throws IOException {
        var specs = CrawlerMain.CrawlSpecRecord
--- a/code/processes/export-task-process/java/nu/marginalia/extractor/SampleDataExporter.java
+++ b/code/processes/export-task-process/java/nu/marginalia/extractor/SampleDataExporter.java
@@ -1,6 +1,7 @@
 package nu.marginalia.extractor;

 import com.google.inject.Inject;
+import nu.marginalia.process.control.ProcessHeartbeat;
 import nu.marginalia.process.log.WorkLog;
 import nu.marginalia.process.log.WorkLogEntry;
 import nu.marginalia.slop.SlopCrawlDataRecord;
@@ -20,17 +21,18 @@ import java.nio.file.StandardCopyOption;
 import java.nio.file.StandardOpenOption;
 import java.nio.file.attribute.PosixFilePermissions;
 import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
+import java.util.*;

 public class SampleDataExporter {
    private final FileStorageService storageService;
+    private final ProcessHeartbeat processHeartbeat;

    @Inject
-    public SampleDataExporter(FileStorageService storageService) {
+    public SampleDataExporter(FileStorageService storageService, ProcessHeartbeat processHeartbeat) {
        this.storageService = storageService;
+        this.processHeartbeat = processHeartbeat;
    }
+
    public void export(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name) throws SQLException, IOException {
        FileStorage destStorage = storageService.getStorage(destId);
        Path inputDir = storageService.getStorage(crawlId).asPath();
@@ -67,9 +69,10 @@ public class SampleDataExporter {
                PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));

        try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING));
-             var logWriter = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)
+             var logWriter = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
+             var hb = processHeartbeat.createAdHocTaskHeartbeat("Generating Sample")
        ) {
-            for (var item : entriesAll) {
+            for (var item : hb.wrap("Scanning", entriesAll)) {
                Path crawlDataPath = inputDir.resolve(item.relPath());
                if (!Files.exists(crawlDataPath)) continue;

@@ -84,6 +87,9 @@ public class SampleDataExporter {
                        addFileToTar(stream, filteredData, item.relPath());
                        logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
                    }
+                    catch (NoSuchElementException ex) {
+                        // filterEntries found no entry matching the content type filter; skip this item
+                    }
                    finally {
                        if (filteredData != null) {
                            Files.deleteIfExists(filteredData);
@@ -92,6 +98,8 @@ public class SampleDataExporter {
                }
            }

+            logWriter.flush();
+
            addFileToTar(stream, newCrawlerLogFile, "crawler.log");
            addFileToTar(stream, newManifestJsonFile, "marginalia-manifest.json");
        }
@@ -104,37 +112,43 @@ public class SampleDataExporter {
    }

    /** Filters the entries in the crawl data file based on the content type. */
-    private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException {
+    private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException, NoSuchElementException {
        Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered");
        Path tempFile = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered.slop.zip");

+        // We may have debris from a previous run, so let's clean it up
+        if (Files.isDirectory(tempDir)) {
+            FileUtils.deleteDirectory(tempDir.toFile());
+        }
        Files.createDirectory(tempDir);

+        boolean wroteEntry = false;
+
        try (var writer = new SlopCrawlDataRecord.Writer(tempDir);
             var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) {
                 @Override
                 public boolean filter(String url, int status, String contentType) {
-                     if (contentTypeFilter.equals(contentType))
-                         return true;
-                     else if (contentType.startsWith("x-marginalia/"))
-                         // This is a metadata entry, typically domain or redirect information
-                         // let's keep those to not confuse the consumer of the data, which might
-                         // expect at least the domain summary
-                         return true;
-                     return false;
+                     return Objects.equals(StringUtils.substringBefore(contentType, ';'), contentTypeFilter)
+                                || contentType.startsWith("x-marginalia/"); // metadata records
                 }
             }
        ) {
-            boolean wroteEntry = false;
+
            while (reader.hasRemaining()) {
                var entry = reader.get();
                writer.write(entry);

-                wroteEntry = wroteEntry || contentTypeFilter.equals(entry.contentType());
+                wroteEntry = wroteEntry || Objects.equals(StringUtils.substringBefore(entry.contentType(), ';'), contentTypeFilter);
            }
+        }
+        catch (Exception ex) {
+            FileUtils.deleteDirectory(tempDir.toFile());
+            throw ex;
+        }

+        try {
            if (!wroteEntry) {
-                throw new IOException("No relevant entries found");
+                throw new NoSuchElementException("No relevant entries");
            }

            SlopTablePacker.packToSlopZip(tempDir, tempFile);
Author	SHA1	Message	Date
Viktor Lofgren	c309030184	(sample) Ensure we finalize the slop.zip file creation when filtering	2025-05-06 14:52:48 +02:00
Viktor Lofgren	fd5af01629	(sample) Ensure we flush the log before adding it to the tar file	2025-05-06 14:43:47 +02:00
Viktor Lofgren	d4c43c7a79	(crawler) Test case for fetching PDFs	2025-05-06 13:45:16 +02:00
Viktor Lofgren	18700e1919	(sample) Fix bug where slop files would not be saved despite containing data	2025-05-06 13:38:21 +02:00
Viktor Lofgren	120b431998	(crawler) Fix outdated assumptions about content types and http status codes always being 200 when good. We now sometimes get 206 when good.	2025-05-06 13:18:30 +02:00
Viktor Lofgren	71dad99326	(crawler) Revisitor should not demand a 200, but support a 206 as well	2025-05-06 13:11:52 +02:00
Viktor Lofgren	c1e8afdf86	(crawler) Remove domains from pending crawl tasks queue when retrying	2025-05-06 12:56:30 +02:00
Viktor Lofgren	fa32dddc24	(sample-actor) Make content type matching lenient with regard to ct parameters such as charset	2025-05-06 12:48:09 +02:00
Viktor Lofgren	a266fcbf30	(sample-actor) Clean up debris from previous runs to avoid errors on re-runs	2025-05-05 13:16:37 +02:00
Viktor Lofgren	6e47e58e0e	(sample-actor) Add progress tracking to sample export actor	2025-05-05 13:04:14 +02:00
Viktor Lofgren	9dc43d8b4a	(sample-actor) Update the sample export actor to not generate empty files when the filter is not applicable.	2025-05-05 12:56:12 +02:00