1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

8 Commits

6 changed files with 31 additions and 30 deletions

View File

@@ -448,13 +448,7 @@ public class CrawlerMain extends ProcessMainClass {
// We don't have a lock, so we can't run this task
// we return to avoid blocking the pool for too long
if (lock.isEmpty()) {
if (retryQueue.remainingCapacity() > 0) {
// Sleep a moment to avoid busy looping via the retry queue
// in the case when few tasks remain and almost all are ineligible for
// immediate restart
Thread.sleep(5);
}
pendingCrawlTasks.remove(domain);
retryQueue.put(this);
return;
}

View File

@@ -74,7 +74,7 @@ public class CrawlerRevisitor {
// If the reference document is empty or the HTTP status is not 200, we'll skip it since it's
// unlikely to produce anything meaningful for us.
if (doc.httpStatus != 200)
if (doc.httpStatus != 200 && doc.httpStatus != 206)
continue;
if (!doc.hasBody())
continue;

View File

@@ -58,7 +58,7 @@ public record DocumentWithReference(
if (null == doc)
return ContentTags.empty();
if (doc.documentBodyBytes.length == 0 || doc.httpStatus != 200)
if (doc.documentBodyBytes.length == 0 || (doc.httpStatus != 200 && doc.httpStatus != 206))
return ContentTags.empty();
String lastmod = doc.getLastModified();

View File

@@ -1,5 +1,7 @@
package nu.marginalia;
import org.apache.commons.lang3.StringUtils;
import java.util.Set;
public class ContentTypes {
@@ -11,9 +13,9 @@ public class ContentTypes {
"text/plain");
public static boolean isAccepted(String contentTypeHeader) {
String lcHeader = contentTypeHeader.toLowerCase();
String lcHeader = StringUtils.substringBefore(contentTypeHeader.toLowerCase(), ';');
for (var type : acceptedContentTypes) {
if (lcHeader.startsWith(type)) {
if (lcHeader.equals(type)) {
return true;
}
}
@@ -21,7 +23,7 @@ public class ContentTypes {
}
public static boolean isBinary(String contentTypeHeader) {
String lcHeader = contentTypeHeader.toLowerCase();
String lcHeader = StringUtils.substringBefore(contentTypeHeader.toLowerCase(), ';');
return lcHeader.startsWith("application/pdf");
}

View File

@@ -277,7 +277,8 @@ public record SlopCrawlDataRecord(String domain,
try (var table = new SlopTable(path)) {
ShortColumn.Reader statusReader = statusColumn.open(table);
while (statusReader.hasRemaining()) {
if (statusReader.get() == 200) {
int status = statusReader.get();
if (status == 200 || status == 206) {
cnt++;
}
}

View File

@@ -1,6 +1,7 @@
package nu.marginalia.extractor;
import com.google.inject.Inject;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.process.log.WorkLogEntry;
import nu.marginalia.slop.SlopCrawlDataRecord;
@@ -20,17 +21,18 @@ import java.nio.file.StandardCopyOption;
import java.nio.file.StandardOpenOption;
import java.nio.file.attribute.PosixFilePermissions;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.*;
public class SampleDataExporter {
private final FileStorageService storageService;
private final ProcessHeartbeat processHeartbeat;
@Inject
public SampleDataExporter(FileStorageService storageService) {
public SampleDataExporter(FileStorageService storageService, ProcessHeartbeat processHeartbeat) {
this.storageService = storageService;
this.processHeartbeat = processHeartbeat;
}
public void export(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name) throws SQLException, IOException {
FileStorage destStorage = storageService.getStorage(destId);
Path inputDir = storageService.getStorage(crawlId).asPath();
@@ -67,9 +69,10 @@ public class SampleDataExporter {
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING));
var logWriter = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)
var logWriter = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
var hb = processHeartbeat.createAdHocTaskHeartbeat("Generating Sample")
) {
for (var item : entriesAll) {
for (var item : hb.wrap("Scanning", entriesAll)) {
Path crawlDataPath = inputDir.resolve(item.relPath());
if (!Files.exists(crawlDataPath)) continue;
@@ -84,6 +87,9 @@ public class SampleDataExporter {
addFileToTar(stream, filteredData, item.relPath());
logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
}
catch (NoSuchElementException ex) {
// Ignore
}
finally {
if (filteredData != null) {
Files.deleteIfExists(filteredData);
@@ -104,24 +110,22 @@ public class SampleDataExporter {
}
/** Filters the entries in the crawl data file based on the content type. */
private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException {
private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException, NoSuchElementException {
Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered");
Path tempFile = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered.slop.zip");
// We may have debris from a previous run, so let's clean it up
if (Files.isDirectory(tempDir)) {
FileUtils.deleteDirectory(tempDir.toFile());
}
Files.createDirectory(tempDir);
try (var writer = new SlopCrawlDataRecord.Writer(tempDir);
var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) {
@Override
public boolean filter(String url, int status, String contentType) {
if (contentTypeFilter.equals(contentType))
return true;
else if (contentType.startsWith("x-marginalia/"))
// This is a metadata entry, typically domain or redirect information
// let's keep those to not confuse the consumer of the data, which might
// expect at least the domain summary
return true;
return false;
return Objects.equals(StringUtils.substringBefore(contentType, ';'), contentTypeFilter)
|| contentType.startsWith("x-marginalia/"); // metadata records
}
}
) {
@@ -130,11 +134,11 @@ public class SampleDataExporter {
var entry = reader.get();
writer.write(entry);
wroteEntry = wroteEntry || contentTypeFilter.equals(entry.contentType());
wroteEntry = wroteEntry || Objects.equals(StringUtils.substringBefore(entry.contentType(), ';'), contentTypeFilter);
}
if (!wroteEntry) {
throw new IOException("No relevant entries found");
throw new NoSuchElementException("No relevant entries");
}
SlopTablePacker.packToSlopZip(tempDir, tempFile);