1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 17:32:39 +02:00

Compare commits

...

11 Commits

8 changed files with 122 additions and 33 deletions

View File

@@ -67,8 +67,6 @@ dependencies {
testImplementation libs.mockito
testImplementation libs.wiremock
testImplementation project(':code:processes:test-data')
}

View File

@@ -448,13 +448,7 @@ public class CrawlerMain extends ProcessMainClass {
// We don't have a lock, so we can't run this task
// we return to avoid blocking the pool for too long
if (lock.isEmpty()) {
if (retryQueue.remainingCapacity() > 0) {
// Sleep a moment to avoid busy looping via the retry queue
// in the case when few tasks remain and almost all are ineligible for
// immediate restart
Thread.sleep(5);
}
pendingCrawlTasks.remove(domain);
retryQueue.put(this);
return;
}

View File

@@ -74,7 +74,7 @@ public class CrawlerRevisitor {
// If the reference document is empty or the HTTP status is neither 200 nor 206, we'll skip it since it's
// unlikely to produce anything meaningful for us.
if (doc.httpStatus != 200)
if (doc.httpStatus != 200 && doc.httpStatus != 206)
continue;
if (!doc.hasBody())
continue;

View File

@@ -58,7 +58,7 @@ public record DocumentWithReference(
if (null == doc)
return ContentTags.empty();
if (doc.documentBodyBytes.length == 0 || doc.httpStatus != 200)
if (doc.documentBodyBytes.length == 0 || (doc.httpStatus != 200 && doc.httpStatus != 206))
return ContentTags.empty();
String lastmod = doc.getLastModified();

View File

@@ -1,5 +1,7 @@
package nu.marginalia;
import org.apache.commons.lang3.StringUtils;
import java.util.Set;
public class ContentTypes {
@@ -11,9 +13,9 @@ public class ContentTypes {
"text/plain");
public static boolean isAccepted(String contentTypeHeader) {
String lcHeader = contentTypeHeader.toLowerCase();
String lcHeader = StringUtils.substringBefore(contentTypeHeader.toLowerCase(), ';');
for (var type : acceptedContentTypes) {
if (lcHeader.startsWith(type)) {
if (lcHeader.equals(type)) {
return true;
}
}
@@ -21,7 +23,7 @@ public class ContentTypes {
}
public static boolean isBinary(String contentTypeHeader) {
String lcHeader = contentTypeHeader.toLowerCase();
String lcHeader = StringUtils.substringBefore(contentTypeHeader.toLowerCase(), ';');
return lcHeader.startsWith("application/pdf");
}

View File

@@ -277,7 +277,8 @@ public record SlopCrawlDataRecord(String domain,
try (var table = new SlopTable(path)) {
ShortColumn.Reader statusReader = statusColumn.open(table);
while (statusReader.hasRemaining()) {
if (statusReader.get() == 200) {
int status = statusReader.get();
if (status == 200 || status == 206) {
cnt++;
}
}

View File

@@ -117,6 +117,86 @@ class CrawlerRetreiverTest {
}
}
/**
 * Crawls a single PDF URL, checks that the WARC output has the same set of request and
 * response targets, converts the WARC to a Slop file, and verifies that the PDF is present
 * there with HTTP status 206 and a body longer than 100 bytes.
 */
@Test
public void testWarcOutputPDF() throws IOException {
    final String pdfUrl = "https://www.marginalia.nu/junk/test.pdf";

    var specs = CrawlerMain.CrawlSpecRecord
            .builder()
            .crawlDepth(5)
            .domain("www.marginalia.nu")
            .urls(List.of(pdfUrl))
            .build();

    Path warcPath = null;
    Path slopPath = null;

    try {
        warcPath = Files.createTempFile("crawling-process", "warc");
        slopPath = Files.createTempFile("crawling-process", ".slop.zip");

        doCrawl(warcPath, specs);

        var requestTargets = new HashSet<String>();
        var responseTargets = new HashSet<String>();

        // Walk the WARC records, collecting request and response targets separately
        try (var warcReader = new WarcReader(warcPath)) {
            warcReader.forEach(rec -> {
                if (rec instanceof WarcRequest request) {
                    requestTargets.add(request.target());
                    System.out.println(request.type() + ":" + request.target());
                    return;
                }
                if (rec instanceof WarcResponse response) {
                    responseTargets.add(response.target());
                    System.out.println(response.type() + ":" + response.target());
                    return;
                }
                System.out.println(rec.type());
            });
        }

        assertTrue(requestTargets.contains(pdfUrl));
        assertEquals(requestTargets, responseTargets);

        // Turn the WARC output into a Slop file
        SlopCrawlDataRecord.convertWarc(
                "www.marginalia.nu",
                new UserAgent("test.marginalia.nu", "test.marginalia.nu"),
                warcPath,
                slopPath);

        CrawledDomain domainRecord = null;
        var docsByUrl = new HashMap<String, CrawledDocument>();

        // Read the Slop file back; exactly one domain record is tolerated
        try (var stream = SerializableCrawlDataStream.openDataStream(slopPath)) {
            while (stream.hasNext()) {
                var item = stream.next();
                if (item instanceof CrawledDomain crawledDomain) {
                    assertNull(domainRecord);
                    domainRecord = crawledDomain;
                }
                else if (item instanceof CrawledDocument crawledDoc) {
                    System.out.println(crawledDoc.url + "\t" + crawledDoc.crawlerStatus + "\t" + crawledDoc.httpStatus);
                    docsByUrl.put(crawledDoc.url, crawledDoc);
                }
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        }

        // The PDF must have made it into the Slop file
        assertNotNull(domainRecord);

        var pdfDoc = docsByUrl.get(pdfUrl);
        assertNotNull(pdfDoc);
        assertEquals(pdfUrl, pdfDoc.url);
        assertEquals(206, pdfDoc.httpStatus);
        assertTrue(pdfDoc.documentBodyBytes.length > 100);
    }
    finally {
        // Clean up whichever temp files were actually created
        for (Path leftover : new Path[] { warcPath, slopPath }) {
            if (leftover != null) {
                Files.deleteIfExists(leftover);
            }
        }
    }
}
@Test
public void testWarcOutputNoKnownUrls() throws IOException {
var specs = CrawlerMain.CrawlSpecRecord

View File

@@ -1,6 +1,7 @@
package nu.marginalia.extractor;
import com.google.inject.Inject;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.process.log.WorkLogEntry;
import nu.marginalia.slop.SlopCrawlDataRecord;
@@ -20,17 +21,18 @@ import java.nio.file.StandardCopyOption;
import java.nio.file.StandardOpenOption;
import java.nio.file.attribute.PosixFilePermissions;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.*;
public class SampleDataExporter {
private final FileStorageService storageService;
private final ProcessHeartbeat processHeartbeat;
@Inject
public SampleDataExporter(FileStorageService storageService) {
public SampleDataExporter(FileStorageService storageService, ProcessHeartbeat processHeartbeat) {
this.storageService = storageService;
this.processHeartbeat = processHeartbeat;
}
public void export(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name) throws SQLException, IOException {
FileStorage destStorage = storageService.getStorage(destId);
Path inputDir = storageService.getStorage(crawlId).asPath();
@@ -67,9 +69,10 @@ public class SampleDataExporter {
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING));
var logWriter = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)
var logWriter = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
var hb = processHeartbeat.createAdHocTaskHeartbeat("Generating Sample")
) {
for (var item : entriesAll) {
for (var item : hb.wrap("Scanning", entriesAll)) {
Path crawlDataPath = inputDir.resolve(item.relPath());
if (!Files.exists(crawlDataPath)) continue;
@@ -84,6 +87,9 @@ public class SampleDataExporter {
addFileToTar(stream, filteredData, item.relPath());
logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
}
catch (NoSuchElementException ex) {
// Ignore: filterEntries found no relevant entries in this file, so there is nothing to add for it
}
finally {
if (filteredData != null) {
Files.deleteIfExists(filteredData);
@@ -92,6 +98,8 @@ public class SampleDataExporter {
}
}
logWriter.flush();
addFileToTar(stream, newCrawlerLogFile, "crawler.log");
addFileToTar(stream, newManifestJsonFile, "marginalia-manifest.json");
}
@@ -104,37 +112,43 @@ public class SampleDataExporter {
}
/** Filters the entries in the crawl data file based on the content type. */
private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException {
private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException, NoSuchElementException {
Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered");
Path tempFile = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered.slop.zip");
// We may have debris from a previous run, so let's clean it up
if (Files.isDirectory(tempDir)) {
FileUtils.deleteDirectory(tempDir.toFile());
}
Files.createDirectory(tempDir);
boolean wroteEntry = false;
try (var writer = new SlopCrawlDataRecord.Writer(tempDir);
var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) {
@Override
public boolean filter(String url, int status, String contentType) {
if (contentTypeFilter.equals(contentType))
return true;
else if (contentType.startsWith("x-marginalia/"))
// This is a metadata entry, typically domain or redirect information
// let's keep those to not confuse the consumer of the data, which might
// expect at least the domain summary
return true;
return false;
return Objects.equals(StringUtils.substringBefore(contentType, ';'), contentTypeFilter)
|| contentType.startsWith("x-marginalia/"); // metadata records
}
}
) {
boolean wroteEntry = false;
while (reader.hasRemaining()) {
var entry = reader.get();
writer.write(entry);
wroteEntry = wroteEntry || contentTypeFilter.equals(entry.contentType());
wroteEntry = wroteEntry || Objects.equals(StringUtils.substringBefore(entry.contentType(), ';'), contentTypeFilter);
}
}
catch (Exception ex) {
FileUtils.deleteDirectory(tempDir.toFile());
throw ex;
}
try {
if (!wroteEntry) {
throw new IOException("No relevant entries found");
throw new NoSuchElementException("No relevant entries");
}
SlopTablePacker.packToSlopZip(tempDir, tempFile);