1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

Compare commits

...

2 Commits

Author SHA1 Message Date
Viktor Lofgren
fd5af01629 (sample) Ensure we flush the log before adding it to the tar file 2025-05-06 14:43:47 +02:00
Viktor Lofgren
d4c43c7a79 (crawler) Test case for fetching PDFs 2025-05-06 13:45:16 +02:00
3 changed files with 82 additions and 2 deletions

View File

@@ -67,8 +67,6 @@ dependencies {
testImplementation libs.mockito
testImplementation libs.wiremock
testImplementation project(':code:processes:test-data')
}

View File

@@ -117,6 +117,86 @@ class CrawlerRetreiverTest {
}
}
@Test
public void testWarcOutputPDF() throws IOException {
var specs = CrawlerMain.CrawlSpecRecord
.builder()
.crawlDepth(5)
.domain("www.marginalia.nu")
.urls(List.of("https://www.marginalia.nu/junk/test.pdf"))
.build();
Path tempFile = null;
Path slopFile = null;
try {
tempFile = Files.createTempFile("crawling-process", "warc");
slopFile = Files.createTempFile("crawling-process", ".slop.zip");
doCrawl(tempFile, specs);
Set<String> requests = new HashSet<>();
Set<String> responses = new HashSet<>();
// Inspect the WARC file
try (var reader = new WarcReader(tempFile)) {
reader.forEach(record -> {
if (record instanceof WarcRequest req) {
requests.add(req.target());
System.out.println(req.type() + ":" + req.target());
}
else if (record instanceof WarcResponse rsp) {
responses.add(rsp.target());
System.out.println(rsp.type() + ":" + rsp.target());
}
else {
System.out.println(record.type());
}
});
}
assertTrue(requests.contains("https://www.marginalia.nu/junk/test.pdf"));
assertEquals(requests, responses);
// Convert the WARC file to a Slop file
SlopCrawlDataRecord
.convertWarc("www.marginalia.nu", new UserAgent("test.marginalia.nu", "test.marginalia.nu"), tempFile, slopFile);
CrawledDomain domain = null;
Map<String, CrawledDocument> documents = new HashMap<>();
// Extract the contents of the Slop file
try (var stream = SerializableCrawlDataStream.openDataStream(slopFile)) {
while (stream.hasNext()) {
var doc = stream.next();
if (doc instanceof CrawledDomain dr) {
assertNull(domain);
domain = dr;
}
else if (doc instanceof CrawledDocument dc) {
System.out.println(dc.url + "\t" + dc.crawlerStatus + "\t" + dc.httpStatus);
documents.put(dc.url, dc);
}
}
} catch (Exception e) {
throw new RuntimeException(e);
}
// Verify we have a PDF in the Slop file
assertNotNull(domain);
var pdfDoc = documents.get("https://www.marginalia.nu/junk/test.pdf");
assertNotNull(pdfDoc);
assertEquals("https://www.marginalia.nu/junk/test.pdf", pdfDoc.url);
assertEquals(206, pdfDoc.httpStatus);
assertTrue(pdfDoc.documentBodyBytes.length > 100);
}
finally {
if (tempFile != null)
Files.deleteIfExists(tempFile);
if (slopFile != null)
Files.deleteIfExists(slopFile);
}
}
@Test
public void testWarcOutputNoKnownUrls() throws IOException {
var specs = CrawlerMain.CrawlSpecRecord

View File

@@ -98,6 +98,8 @@ public class SampleDataExporter {
}
}
logWriter.flush();
addFileToTar(stream, newCrawlerLogFile, "crawler.log");
addFileToTar(stream, newManifestJsonFile, "marginalia-manifest.json");
}