1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-07 03:42:39 +02:00

Compare commits

...

3 Commits

2 changed files with 22 additions and 15 deletions

View File

@@ -448,13 +448,7 @@ public class CrawlerMain extends ProcessMainClass {
// We don't have a lock, so we can't run this task // We don't have a lock, so we can't run this task
// we return to avoid blocking the pool for too long // we return to avoid blocking the pool for too long
if (lock.isEmpty()) { if (lock.isEmpty()) {
if (retryQueue.remainingCapacity() > 0) { pendingCrawlTasks.remove(domain);
// Sleep a moment to avoid busy looping via the retry queue
// in the case when few tasks remain and almost all are ineligible for
// immediate restart
Thread.sleep(5);
}
retryQueue.put(this); retryQueue.put(this);
return; return;
} }

View File

@@ -117,20 +117,18 @@ public class SampleDataExporter {
Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered"); Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered");
Path tempFile = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered.slop.zip"); Path tempFile = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered.slop.zip");
// We may have debris from a previous run, so let's clean it up
if (Files.isDirectory(tempDir)) {
FileUtils.deleteDirectory(tempDir.toFile());
}
Files.createDirectory(tempDir); Files.createDirectory(tempDir);
try (var writer = new SlopCrawlDataRecord.Writer(tempDir); try (var writer = new SlopCrawlDataRecord.Writer(tempDir);
var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) { var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) {
@Override @Override
public boolean filter(String url, int status, String contentType) { public boolean filter(String url, int status, String contentType) {
if (contentTypeFilter.equals(contentType)) return matchContentTypeHeaderWithMime(contentType, contentTypeFilter)
return true; || contentType.startsWith("x-marginalia/"); // metadata records
else if (contentType.startsWith("x-marginalia/"))
// This is a metadata entry, typically domain or redirect information
// let's keep those to not confuse the consumer of the data, which might
// expect at least the domain summary
return true;
return false;
} }
} }
) { ) {
@@ -156,6 +154,21 @@ public class SampleDataExporter {
return tempFile; return tempFile;
} }
/**
 * Tests whether the media type of a Content-Type header value equals the given mime type.
 *
 * <p>Only the part of the header before the first {@code ';'} is compared, so parameters
 * such as {@code charset} are ignored. This is an exact match on the media type, not a
 * prefix match: {@code "text/html-sandboxed"} does not match {@code "text/html"}.
 * The comparison ignores case and surrounding whitespace, since media types are
 * case-insensitive and optional whitespace may precede the parameter separator.
 *
 * @param contentType the raw Content-Type header value, may be {@code null}
 * @param mime        the mime type to compare against, may be {@code null}
 * @return {@code true} if the header's media type equals {@code mime};
 *         {@code false} otherwise, including when either argument is {@code null}
 */
private boolean matchContentTypeHeaderWithMime(String contentType, String mime) {
    if (contentType == null || mime == null) {
        return false;
    }

    // Everything from the first semicolon onwards is parameters (e.g. "; charset=UTF-8").
    int semicolonIndex = contentType.indexOf(';');
    String mediaType = semicolonIndex >= 0
            ? contentType.substring(0, semicolonIndex)
            : contentType;

    // strip() handles "text/html ; charset=..." and stray leading/trailing whitespace.
    return mediaType.strip().equalsIgnoreCase(mime.strip());
}
private void addFileToTar(TarArchiveOutputStream outputStream, Path file, String fileName) throws IOException { private void addFileToTar(TarArchiveOutputStream outputStream, Path file, String fileName) throws IOException {
var entry = outputStream.createArchiveEntry(file.toFile(), fileName); var entry = outputStream.createArchiveEntry(file.toFile(), fileName);
entry.setSize(Files.size(file)); entry.setSize(Files.size(file));