Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-10-07 03:42:39 +02:00

Compare commits: deploy-017 ... deploy-017 (3 commits)

| Author | SHA1 | Date |
|---|---|---|
| | c1e8afdf86 | |
| | fa32dddc24 | |
| | a266fcbf30 | |
@@ -448,13 +448,7 @@ public class CrawlerMain extends ProcessMainClass {
             // We don't have a lock, so we can't run this task
             // we return to avoid blocking the pool for too long
             if (lock.isEmpty()) {
-                if (retryQueue.remainingCapacity() > 0) {
-                    // Sleep a moment to avoid busy looping via the retry queue
-                    // in the case when few tasks remain and almost all are ineligible for
-                    // immediate restart
-                    Thread.sleep(5);
-                }
-
+                pendingCrawlTasks.remove(domain);
                 retryQueue.put(this);
                 return;
             }
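
For context, the hunk above drops the sleep-and-retry branch: when the per-domain lock is unavailable, the task now removes its domain from pendingCrawlTasks and hands itself straight back to the retry queue instead of napping on the pool thread. Below is a minimal sketch of that non-blocking re-queue pattern; DomainLock, CrawlTask and the field names are illustrative stand-ins, not the project's actual types.

import java.util.Optional;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Semaphore;

// Illustrative stand-ins only; not the real CrawlerMain classes.
class RetryQueueSketch {

    // One permit per domain, so at most one task per domain runs at a time.
    static class DomainLock {
        private final Semaphore semaphore = new Semaphore(1);

        // Try to take the per-domain lock without blocking the caller.
        Optional<Semaphore> tryLock() {
            return semaphore.tryAcquire() ? Optional.of(semaphore) : Optional.empty();
        }
    }

    static class CrawlTask implements Runnable {
        private final String domain;
        private final DomainLock domainLock;
        private final BlockingQueue<CrawlTask> retryQueue;

        CrawlTask(String domain, DomainLock domainLock, BlockingQueue<CrawlTask> retryQueue) {
            this.domain = domain;
            this.domainLock = domainLock;
            this.retryQueue = retryQueue;
        }

        @Override
        public void run() {
            Optional<Semaphore> lock = domainLock.tryLock();

            // We don't have a lock, so we can't run this task;
            // we return to avoid blocking the pool for too long.
            if (lock.isEmpty()) {
                try {
                    retryQueue.put(this); // hand the task back for a later attempt
                }
                catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
                return;
            }

            try {
                System.out.println("crawling " + domain);
            }
            finally {
                lock.get().release();
            }
        }
    }
}
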
@@ -117,20 +117,18 @@ public class SampleDataExporter {
         Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered");
         Path tempFile = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered.slop.zip");
 
+        // We may have debris from a previous run, so let's clean it up
+        if (Files.isDirectory(tempDir)) {
+            FileUtils.deleteDirectory(tempDir.toFile());
+        }
         Files.createDirectory(tempDir);
 
         try (var writer = new SlopCrawlDataRecord.Writer(tempDir);
              var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) {
                  @Override
                  public boolean filter(String url, int status, String contentType) {
-                     if (contentTypeFilter.equals(contentType))
-                         return true;
-                     else if (contentType.startsWith("x-marginalia/"))
-                         // This is a metadata entry, typically domain or redirect information
-                         // let's keep those to not confuse the consumer of the data, which might
-                         // expect at least the domain summary
-                         return true;
-                     return false;
+                     return matchContentTypeHeaderWithMime(contentType, contentTypeFilter)
+                         || contentType.startsWith("x-marginalia/"); // metadata records
                  }
              }
         ) {
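
The first half of this hunk makes the temp-directory setup idempotent: any leftovers from an earlier, possibly interrupted run are deleted before the directory is recreated. A minimal sketch of that step, assuming Apache Commons IO's FileUtils is on the classpath (the prepareTempDir name is made up for illustration):

import org.apache.commons.io.FileUtils;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

class TempDirSetupSketch {

    // Delete debris from a previous run, then create a fresh ".filtered"
    // directory next to the crawl data file, mirroring the hunk above.
    static Path prepareTempDir(Path crawlDataPath) throws IOException {
        Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered");

        if (Files.isDirectory(tempDir)) {
            FileUtils.deleteDirectory(tempDir.toFile());
        }
        Files.createDirectory(tempDir);

        return tempDir;
    }
}

The second half of the hunk replaces the exact string comparison against contentTypeFilter with matchContentTypeHeaderWithMime, defined in the next hunk, so that Content-Type headers carrying parameters such as a charset still match.
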
@@ -156,6 +154,21 @@ public class SampleDataExporter {
         return tempFile;
     }
 
+    private boolean matchContentTypeHeaderWithMime(String contentType, String mime) {
+        if (null == contentType) {
+            return false;
+        }
+
+        /* The content type header may have a charset or other parameters, so we need to
+         * check if the mime type is a prefix of the content type. */
+
+        int semicolonIndex = contentType.indexOf(';');
+        if (semicolonIndex >= 0) {
+            return contentType.substring(0, semicolonIndex).equals(mime);
+        }
+        return contentType.equals(mime);
+    }
+
     private void addFileToTar(TarArchiveOutputStream outputStream, Path file, String fileName) throws IOException {
         var entry = outputStream.createArchiveEntry(file.toFile(), fileName);
         entry.setSize(Files.size(file));
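
The helper exists because the old filter compared the Content-Type header to the requested MIME type with plain equals, so a header like "text/html; charset=UTF-8" would never match "text/html". A small standalone check of the new behaviour follows; the helper body is copied from the hunk above, while the class name and the sample header values are made up for illustration.

class ContentTypeMatchDemo {

    // Copied from the hunk above: compare only the MIME part of the header,
    // ignoring any parameters after the semicolon.
    static boolean matchContentTypeHeaderWithMime(String contentType, String mime) {
        if (null == contentType) {
            return false;
        }

        int semicolonIndex = contentType.indexOf(';');
        if (semicolonIndex >= 0) {
            return contentType.substring(0, semicolonIndex).equals(mime);
        }
        return contentType.equals(mime);
    }

    public static void main(String[] args) {
        System.out.println(matchContentTypeHeaderWithMime("text/html", "text/html"));                // true
        System.out.println(matchContentTypeHeaderWithMime("text/html; charset=UTF-8", "text/html")); // true
        System.out.println(matchContentTypeHeaderWithMime("application/xhtml+xml", "text/html"));    // false
        System.out.println(matchContentTypeHeaderWithMime(null, "text/html"));                       // false
    }
}
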