mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
5 Commits
deploy-016
...
deploy-017
Author | SHA1 | Date | |
---|---|---|---|
|
fa32dddc24 | ||
|
a266fcbf30 | ||
|
6e47e58e0e | ||
|
9dc43d8b4a | ||
|
83967e3305 |
@@ -1,6 +1,7 @@
|
||||
package nu.marginalia.extractor;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
||||
import nu.marginalia.process.log.WorkLog;
|
||||
import nu.marginalia.process.log.WorkLogEntry;
|
||||
import nu.marginalia.slop.SlopCrawlDataRecord;
|
||||
@@ -23,14 +24,18 @@ import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
public class SampleDataExporter {
|
||||
private final FileStorageService storageService;
|
||||
private final ProcessHeartbeat processHeartbeat;
|
||||
|
||||
@Inject
|
||||
public SampleDataExporter(FileStorageService storageService) {
|
||||
public SampleDataExporter(FileStorageService storageService, ProcessHeartbeat processHeartbeat) {
|
||||
this.storageService = storageService;
|
||||
this.processHeartbeat = processHeartbeat;
|
||||
}
|
||||
|
||||
public void export(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name) throws SQLException, IOException {
|
||||
FileStorage destStorage = storageService.getStorage(destId);
|
||||
Path inputDir = storageService.getStorage(crawlId).asPath();
|
||||
@@ -59,12 +64,6 @@ public class SampleDataExporter {
|
||||
Path newCrawlerLogFile = Files.createTempFile(destStorage.asPath(), "crawler", ".log",
|
||||
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
|
||||
|
||||
try (var bw = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
|
||||
for (var item : entriesAll) {
|
||||
bw.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
|
||||
}
|
||||
}
|
||||
|
||||
Path newManifestJsonFile = Files.createTempFile(destStorage.asPath(), "manifest", ".json",
|
||||
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
|
||||
Files.writeString(newManifestJsonFile, " { \"description\": \"" + name.replace("[\"\\]", "_") + "\",\n \"type\": \"CRAWL_DATA\" }\n");
|
||||
@@ -72,24 +71,31 @@ public class SampleDataExporter {
|
||||
var tmpTarFile = Files.createTempFile(destStorage.asPath(), "data", ".tar",
|
||||
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
|
||||
|
||||
try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))) {
|
||||
for (var item : entriesAll) {
|
||||
try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING));
|
||||
var logWriter = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
|
||||
var hb = processHeartbeat.createAdHocTaskHeartbeat("Generating Sample")
|
||||
) {
|
||||
for (var item : hb.wrap("Scanning", entriesAll)) {
|
||||
Path crawlDataPath = inputDir.resolve(item.relPath());
|
||||
if (!Files.exists(crawlDataPath)) continue;
|
||||
|
||||
if (StringUtils.isBlank(ctFilter)) {
|
||||
addFileToTar(stream, crawlDataPath, item.relPath());
|
||||
logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
|
||||
}
|
||||
else /* filter != null */ {
|
||||
boolean didFilterData = false;
|
||||
Path filteredData = null;
|
||||
try {
|
||||
crawlDataPath = filterEntries(crawlDataPath, ctFilter);
|
||||
didFilterData = true;
|
||||
addFileToTar(stream, crawlDataPath, item.relPath());
|
||||
filteredData = filterEntries(crawlDataPath, ctFilter);
|
||||
addFileToTar(stream, filteredData, item.relPath());
|
||||
logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
|
||||
}
|
||||
catch (NoSuchElementException ex) {
|
||||
// Ignore
|
||||
}
|
||||
finally {
|
||||
if (didFilterData) {
|
||||
Files.deleteIfExists(crawlDataPath);
|
||||
if (filteredData != null) {
|
||||
Files.deleteIfExists(filteredData);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -106,34 +112,36 @@ public class SampleDataExporter {
|
||||
Files.move(tmpTarFile, destStorage.asPath().resolve("crawl-data.tar"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
|
||||
}
|
||||
|
||||
/** Filters the entries in the crawl data file based on the content type.
|
||||
* @param crawlDataPath The path to the crawl data file.
|
||||
* @param contentTypeFilter The content type to filter by.
|
||||
* @return The path to the filtered crawl data file, or null if an error occurred.
|
||||
*/
|
||||
private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException {
|
||||
/** Filters the entries in the crawl data file based on the content type. */
|
||||
private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException, NoSuchElementException {
|
||||
Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered");
|
||||
Path tempFile = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered.slop.zip");
|
||||
|
||||
// We may have debris from a previous run, so let's clean it up
|
||||
if (Files.isDirectory(tempDir)) {
|
||||
FileUtils.deleteDirectory(tempDir.toFile());
|
||||
}
|
||||
Files.createDirectory(tempDir);
|
||||
|
||||
try (var writer = new SlopCrawlDataRecord.Writer(tempDir);
|
||||
var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) {
|
||||
@Override
|
||||
public boolean filter(String url, int status, String contentType) {
|
||||
if (contentTypeFilter.equals(contentType))
|
||||
return true;
|
||||
else if (contentType.startsWith("x-marginalia/"))
|
||||
// This is a metadata entry, typically domain or redirect information
|
||||
// let's keep those to not confuse the consumer of the data, which might
|
||||
// expect at least the domain summary
|
||||
return true;
|
||||
return false;
|
||||
return matchContentTypeHeaderWithMime(contentType, contentTypeFilter)
|
||||
|| contentType.startsWith("x-marginalia/"); // metadata records
|
||||
}
|
||||
}
|
||||
) {
|
||||
boolean wroteEntry = false;
|
||||
while (reader.hasRemaining()) {
|
||||
writer.write(reader.get());
|
||||
var entry = reader.get();
|
||||
writer.write(entry);
|
||||
|
||||
wroteEntry = wroteEntry || contentTypeFilter.equals(entry.contentType());
|
||||
}
|
||||
|
||||
if (!wroteEntry) {
|
||||
throw new NoSuchElementException("No relevant entries");
|
||||
}
|
||||
|
||||
SlopTablePacker.packToSlopZip(tempDir, tempFile);
|
||||
@@ -146,6 +154,21 @@ public class SampleDataExporter {
|
||||
return tempFile;
|
||||
}
|
||||
|
||||
private boolean matchContentTypeHeaderWithMime(String contentType, String mime) {
|
||||
if (null == contentType) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/* The content type header may have a charset or other parameters, so we need to
|
||||
* check if the mime type is a prefix of the content type. */
|
||||
|
||||
int semicolonIndex = contentType.indexOf(';');
|
||||
if (semicolonIndex >= 0) {
|
||||
return contentType.substring(0, semicolonIndex).equals(mime);
|
||||
}
|
||||
return contentType.equals(mime);
|
||||
}
|
||||
|
||||
private void addFileToTar(TarArchiveOutputStream outputStream, Path file, String fileName) throws IOException {
|
||||
var entry = outputStream.createArchiveEntry(file.toFile(), fileName);
|
||||
entry.setSize(Files.size(file));
|
||||
|
Reference in New Issue
Block a user