1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

3 Commits

2 changed files with 22 additions and 15 deletions

View File

@@ -448,13 +448,7 @@ public class CrawlerMain extends ProcessMainClass {
// We don't have a lock, so we can't run this task
// we return to avoid blocking the pool for too long
if (lock.isEmpty()) {
if (retryQueue.remainingCapacity() > 0) {
// Sleep a moment to avoid busy looping via the retry queue
// in the case when few tasks remain and almost all are ineligible for
// immediate restart
Thread.sleep(5);
}
pendingCrawlTasks.remove(domain);
retryQueue.put(this);
return;
}

View File

@@ -117,20 +117,18 @@ public class SampleDataExporter {
Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered");
Path tempFile = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered.slop.zip");
// We may have debris from a previous run, so let's clean it up
if (Files.isDirectory(tempDir)) {
FileUtils.deleteDirectory(tempDir.toFile());
}
Files.createDirectory(tempDir);
try (var writer = new SlopCrawlDataRecord.Writer(tempDir);
var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) {
/**
 * Decides whether a record read from the crawl data is kept.
 *
 * @param url         URL of the record (not consulted by this filter)
 * @param status      status code of the record (not consulted by this filter)
 * @param contentType content type reported for the record; may carry
 *                    parameters such as a charset
 * @return {@code true} if the record's media type matches
 *         {@code contentTypeFilter} or the record is an
 *         {@code x-marginalia/} metadata entry, {@code false} otherwise
 */
@Override
public boolean filter(String url, int status, String contentType) {
    // Without a content type the record can match neither the requested
    // media type nor the metadata prefix; bail out before dereferencing it
    if (contentType == null) {
        return false;
    }

    // Compare only the media type part of the header, so values like
    // "text/html; charset=UTF-8" still match a "text/html" filter
    if (matchContentTypeHeaderWithMime(contentType, contentTypeFilter)) {
        return true;
    }

    // This is a metadata entry, typically domain or redirect information;
    // let's keep those to not confuse the consumer of the data, which might
    // expect at least the domain summary
    return contentType.startsWith("x-marginalia/");
}
}
) {
@@ -156,6 +154,21 @@ public class SampleDataExporter {
return tempFile;
}
/**
 * Checks whether a Content-Type header value denotes the given MIME type.
 *
 * <p>Any parameters following the first {@code ';'} (for example
 * {@code "; charset=UTF-8"}) are ignored. The remaining media type is
 * compared case-insensitively and with surrounding whitespace removed.
 *
 * @param contentType the raw Content-Type header value, may be {@code null}
 * @param mime        the bare MIME type to match, e.g. {@code "text/html"}
 * @return {@code true} if the media type part of {@code contentType} equals
 *         {@code mime}; {@code false} otherwise, including when either
 *         argument is {@code null}
 */
private boolean matchContentTypeHeaderWithMime(String contentType, String mime) {
    if (contentType == null || mime == null) {
        return false;
    }

    // The header may carry a charset or other parameters after a ';',
    // so only the part in front of it identifies the media type
    int semicolonIndex = contentType.indexOf(';');
    String mediaType = semicolonIndex >= 0
            ? contentType.substring(0, semicolonIndex)
            : contentType;

    // Type and subtype are case-insensitive, and optional whitespace is
    // permitted before the ';', so normalize both before comparing
    return mediaType.trim().equalsIgnoreCase(mime.trim());
}
private void addFileToTar(TarArchiveOutputStream outputStream, Path file, String fileName) throws IOException {
var entry = outputStream.createArchiveEntry(file.toFile(), fileName);
entry.setSize(Files.size(file));