mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
4 Commits
deploy-017
...
deploy-017
Author | SHA1 | Date | |
---|---|---|---|
|
c1e8afdf86 | ||
|
fa32dddc24 | ||
|
a266fcbf30 | ||
|
6e47e58e0e |
@@ -448,13 +448,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
// We don't have a lock, so we can't run this task
|
||||
// we return to avoid blocking the pool for too long
|
||||
if (lock.isEmpty()) {
|
||||
if (retryQueue.remainingCapacity() > 0) {
|
||||
// Sleep a moment to avoid busy looping via the retry queue
|
||||
// in the case when few tasks remain and almost all are ineligible for
|
||||
// immediate restart
|
||||
Thread.sleep(5);
|
||||
}
|
||||
|
||||
pendingCrawlTasks.remove(domain);
|
||||
retryQueue.put(this);
|
||||
return;
|
||||
}
|
||||
|
@@ -1,6 +1,7 @@
|
||||
package nu.marginalia.extractor;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
||||
import nu.marginalia.process.log.WorkLog;
|
||||
import nu.marginalia.process.log.WorkLogEntry;
|
||||
import nu.marginalia.slop.SlopCrawlDataRecord;
|
||||
@@ -27,11 +28,14 @@ import java.util.NoSuchElementException;
|
||||
|
||||
public class SampleDataExporter {
|
||||
private final FileStorageService storageService;
|
||||
private final ProcessHeartbeat processHeartbeat;
|
||||
|
||||
@Inject
|
||||
public SampleDataExporter(FileStorageService storageService) {
|
||||
public SampleDataExporter(FileStorageService storageService, ProcessHeartbeat processHeartbeat) {
|
||||
this.storageService = storageService;
|
||||
this.processHeartbeat = processHeartbeat;
|
||||
}
|
||||
|
||||
public void export(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name) throws SQLException, IOException {
|
||||
FileStorage destStorage = storageService.getStorage(destId);
|
||||
Path inputDir = storageService.getStorage(crawlId).asPath();
|
||||
@@ -68,9 +72,10 @@ public class SampleDataExporter {
|
||||
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
|
||||
|
||||
try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING));
|
||||
var logWriter = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)
|
||||
var logWriter = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
|
||||
var hb = processHeartbeat.createAdHocTaskHeartbeat("Generating Sample")
|
||||
) {
|
||||
for (var item : entriesAll) {
|
||||
for (var item : hb.wrap("Scanning", entriesAll)) {
|
||||
Path crawlDataPath = inputDir.resolve(item.relPath());
|
||||
if (!Files.exists(crawlDataPath)) continue;
|
||||
|
||||
@@ -112,20 +117,18 @@ public class SampleDataExporter {
|
||||
Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered");
|
||||
Path tempFile = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered.slop.zip");
|
||||
|
||||
// We may have debris from a previous run, so let's clean it up
|
||||
if (Files.isDirectory(tempDir)) {
|
||||
FileUtils.deleteDirectory(tempDir.toFile());
|
||||
}
|
||||
Files.createDirectory(tempDir);
|
||||
|
||||
try (var writer = new SlopCrawlDataRecord.Writer(tempDir);
|
||||
var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) {
|
||||
@Override
|
||||
public boolean filter(String url, int status, String contentType) {
|
||||
if (contentTypeFilter.equals(contentType))
|
||||
return true;
|
||||
else if (contentType.startsWith("x-marginalia/"))
|
||||
// This is a metadata entry, typically domain or redirect information
|
||||
// let's keep those to not confuse the consumer of the data, which might
|
||||
// expect at least the domain summary
|
||||
return true;
|
||||
return false;
|
||||
return matchContentTypeHeaderWithMime(contentType, contentTypeFilter)
|
||||
|| contentType.startsWith("x-marginalia/"); // metadata records
|
||||
}
|
||||
}
|
||||
) {
|
||||
@@ -151,6 +154,21 @@ public class SampleDataExporter {
|
||||
return tempFile;
|
||||
}
|
||||
|
||||
private boolean matchContentTypeHeaderWithMime(String contentType, String mime) {
|
||||
if (null == contentType) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/* The content type header may have a charset or other parameters, so we need to
|
||||
* check if the mime type is a prefix of the content type. */
|
||||
|
||||
int semicolonIndex = contentType.indexOf(';');
|
||||
if (semicolonIndex >= 0) {
|
||||
return contentType.substring(0, semicolonIndex).equals(mime);
|
||||
}
|
||||
return contentType.equals(mime);
|
||||
}
|
||||
|
||||
private void addFileToTar(TarArchiveOutputStream outputStream, Path file, String fileName) throws IOException {
|
||||
var entry = outputStream.createArchiveEntry(file.toFile(), fileName);
|
||||
entry.setSize(Files.size(file));
|
||||
|
Reference in New Issue
Block a user