1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 17:32:39 +02:00

Compare commits

...

3 Commits

View File

@@ -1,6 +1,7 @@
package nu.marginalia.extractor;
import com.google.inject.Inject;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.process.log.WorkLogEntry;
import nu.marginalia.slop.SlopCrawlDataRecord;
@@ -27,11 +28,14 @@ import java.util.NoSuchElementException;
public class SampleDataExporter {
private final FileStorageService storageService;
private final ProcessHeartbeat processHeartbeat;
@Inject
public SampleDataExporter(FileStorageService storageService) {
public SampleDataExporter(FileStorageService storageService, ProcessHeartbeat processHeartbeat) {
this.storageService = storageService;
this.processHeartbeat = processHeartbeat;
}
public void export(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name) throws SQLException, IOException {
FileStorage destStorage = storageService.getStorage(destId);
Path inputDir = storageService.getStorage(crawlId).asPath();
@@ -68,9 +72,10 @@ public class SampleDataExporter {
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING));
var logWriter = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)
var logWriter = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
var hb = processHeartbeat.createAdHocTaskHeartbeat("Generating Sample")
) {
for (var item : entriesAll) {
for (var item : hb.wrap("Scanning", entriesAll)) {
Path crawlDataPath = inputDir.resolve(item.relPath());
if (!Files.exists(crawlDataPath)) continue;
@@ -112,20 +117,18 @@ public class SampleDataExporter {
Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered");
Path tempFile = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered.slop.zip");
// We may have debris from a previous run, so let's clean it up
if (Files.isDirectory(tempDir)) {
FileUtils.deleteDirectory(tempDir.toFile());
}
Files.createDirectory(tempDir);
try (var writer = new SlopCrawlDataRecord.Writer(tempDir);
var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) {
@Override
public boolean filter(String url, int status, String contentType) {
if (contentTypeFilter.equals(contentType))
return true;
else if (contentType.startsWith("x-marginalia/"))
// This is a metadata entry, typically domain or redirect information
// let's keep those to not confuse the consumer of the data, which might
// expect at least the domain summary
return true;
return false;
return matchContentTypeHeaderWithMime(contentType, contentTypeFilter)
|| contentType.startsWith("x-marginalia/"); // metadata records
}
}
) {
@@ -151,6 +154,21 @@ public class SampleDataExporter {
return tempFile;
}
private boolean matchContentTypeHeaderWithMime(String contentType, String mime) {
if (null == contentType) {
return false;
}
/* The content type header may have a charset or other parameters, so we need to
* check if the mime type is a prefix of the content type. */
int semicolonIndex = contentType.indexOf(';');
if (semicolonIndex >= 0) {
return contentType.substring(0, semicolonIndex).equals(mime);
}
return contentType.equals(mime);
}
private void addFileToTar(TarArchiveOutputStream outputStream, Path file, String fileName) throws IOException {
var entry = outputStream.createArchiveEntry(file.toFile(), fileName);
entry.setSize(Files.size(file));