1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

5 Commits

View File

@@ -1,6 +1,7 @@
package nu.marginalia.extractor;
import com.google.inject.Inject;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.process.log.WorkLogEntry;
import nu.marginalia.slop.SlopCrawlDataRecord;
@@ -23,14 +24,18 @@ import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.NoSuchElementException;
public class SampleDataExporter {
private final FileStorageService storageService;
private final ProcessHeartbeat processHeartbeat;
@Inject
public SampleDataExporter(FileStorageService storageService) {
public SampleDataExporter(FileStorageService storageService, ProcessHeartbeat processHeartbeat) {
this.storageService = storageService;
this.processHeartbeat = processHeartbeat;
}
public void export(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name) throws SQLException, IOException {
FileStorage destStorage = storageService.getStorage(destId);
Path inputDir = storageService.getStorage(crawlId).asPath();
@@ -59,12 +64,6 @@ public class SampleDataExporter {
Path newCrawlerLogFile = Files.createTempFile(destStorage.asPath(), "crawler", ".log",
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
try (var bw = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
for (var item : entriesAll) {
bw.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
}
}
Path newManifestJsonFile = Files.createTempFile(destStorage.asPath(), "manifest", ".json",
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
Files.writeString(newManifestJsonFile, " { \"description\": \"" + name.replace("[\"\\]", "_") + "\",\n \"type\": \"CRAWL_DATA\" }\n");
@@ -72,24 +71,31 @@ public class SampleDataExporter {
var tmpTarFile = Files.createTempFile(destStorage.asPath(), "data", ".tar",
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))) {
for (var item : entriesAll) {
try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING));
var logWriter = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
var hb = processHeartbeat.createAdHocTaskHeartbeat("Generating Sample")
) {
for (var item : hb.wrap("Scanning", entriesAll)) {
Path crawlDataPath = inputDir.resolve(item.relPath());
if (!Files.exists(crawlDataPath)) continue;
if (StringUtils.isBlank(ctFilter)) {
addFileToTar(stream, crawlDataPath, item.relPath());
logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
}
else /* filter != null */ {
boolean didFilterData = false;
Path filteredData = null;
try {
crawlDataPath = filterEntries(crawlDataPath, ctFilter);
didFilterData = true;
addFileToTar(stream, crawlDataPath, item.relPath());
filteredData = filterEntries(crawlDataPath, ctFilter);
addFileToTar(stream, filteredData, item.relPath());
logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
}
catch (NoSuchElementException ex) {
// Ignore
}
finally {
if (didFilterData) {
Files.deleteIfExists(crawlDataPath);
if (filteredData != null) {
Files.deleteIfExists(filteredData);
}
}
}
@@ -106,34 +112,36 @@ public class SampleDataExporter {
Files.move(tmpTarFile, destStorage.asPath().resolve("crawl-data.tar"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
}
/** Filters the entries in the crawl data file based on the content type.
* @param crawlDataPath The path to the crawl data file.
* @param contentTypeFilter The content type to filter by.
* @return The path to the filtered crawl data file, or null if an error occurred.
*/
private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException {
/** Filters the entries in the crawl data file based on the content type. */
private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException, NoSuchElementException {
Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered");
Path tempFile = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered.slop.zip");
// We may have debris from a previous run, so let's clean it up
if (Files.isDirectory(tempDir)) {
FileUtils.deleteDirectory(tempDir.toFile());
}
Files.createDirectory(tempDir);
try (var writer = new SlopCrawlDataRecord.Writer(tempDir);
var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) {
@Override
public boolean filter(String url, int status, String contentType) {
if (contentTypeFilter.equals(contentType))
return true;
else if (contentType.startsWith("x-marginalia/"))
// This is a metadata entry, typically domain or redirect information
// let's keep those to not confuse the consumer of the data, which might
// expect at least the domain summary
return true;
return false;
return matchContentTypeHeaderWithMime(contentType, contentTypeFilter)
|| contentType.startsWith("x-marginalia/"); // metadata records
}
}
) {
boolean wroteEntry = false;
while (reader.hasRemaining()) {
writer.write(reader.get());
var entry = reader.get();
writer.write(entry);
wroteEntry = wroteEntry || contentTypeFilter.equals(entry.contentType());
}
if (!wroteEntry) {
throw new NoSuchElementException("No relevant entries");
}
SlopTablePacker.packToSlopZip(tempDir, tempFile);
@@ -146,6 +154,21 @@ public class SampleDataExporter {
return tempFile;
}
private boolean matchContentTypeHeaderWithMime(String contentType, String mime) {
if (null == contentType) {
return false;
}
/* The content type header may have a charset or other parameters, so we need to
* check if the mime type is a prefix of the content type. */
int semicolonIndex = contentType.indexOf(';');
if (semicolonIndex >= 0) {
return contentType.substring(0, semicolonIndex).equals(mime);
}
return contentType.equals(mime);
}
private void addFileToTar(TarArchiveOutputStream outputStream, Path file, String fileName) throws IOException {
var entry = outputStream.createArchiveEntry(file.toFile(), fileName);
entry.setSize(Files.size(file));