|
|
|
@@ -23,6 +23,7 @@ import java.sql.SQLException;
|
|
|
|
|
import java.util.ArrayList;
|
|
|
|
|
import java.util.Collections;
|
|
|
|
|
import java.util.List;
|
|
|
|
|
import java.util.NoSuchElementException;
|
|
|
|
|
|
|
|
|
|
public class SampleDataExporter {
|
|
|
|
|
private final FileStorageService storageService;
|
|
|
|
@@ -59,12 +60,6 @@ public class SampleDataExporter {
|
|
|
|
|
Path newCrawlerLogFile = Files.createTempFile(destStorage.asPath(), "crawler", ".log",
|
|
|
|
|
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
|
|
|
|
|
|
|
|
|
|
try (var bw = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
|
|
|
|
|
for (var item : entriesAll) {
|
|
|
|
|
bw.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Path newManifestJsonFile = Files.createTempFile(destStorage.asPath(), "manifest", ".json",
|
|
|
|
|
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
|
|
|
|
|
// String.replace takes a literal, not a regex, so quotes and backslashes are replaced
// one character at a time; this keeps the description from breaking out of the JSON string.
Files.writeString(newManifestJsonFile, " { \"description\": \"" + name.replace('"', '_').replace('\\', '_') + "\",\n \"type\": \"CRAWL_DATA\" }\n");
|
|
|
|
@@ -72,24 +67,30 @@ public class SampleDataExporter {
|
|
|
|
|
var tmpTarFile = Files.createTempFile(destStorage.asPath(), "data", ".tar",
|
|
|
|
|
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
|
|
|
|
|
|
|
|
|
|
try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))) {
|
|
|
|
|
try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING));
|
|
|
|
|
var logWriter = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)
|
|
|
|
|
) {
|
|
|
|
|
for (var item : entriesAll) {
|
|
|
|
|
Path crawlDataPath = inputDir.resolve(item.relPath());
|
|
|
|
|
if (!Files.exists(crawlDataPath)) continue;
|
|
|
|
|
|
|
|
|
|
if (StringUtils.isBlank(ctFilter)) {
|
|
|
|
|
addFileToTar(stream, crawlDataPath, item.relPath());
|
|
|
|
|
logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
|
|
|
|
|
}
|
|
|
|
|
else /* filter != null */ {
|
|
|
|
|
boolean didFilterData = false;
|
|
|
|
|
Path filteredData = null;
|
|
|
|
|
try {
|
|
|
|
|
crawlDataPath = filterEntries(crawlDataPath, ctFilter);
|
|
|
|
|
didFilterData = true;
|
|
|
|
|
addFileToTar(stream, crawlDataPath, item.relPath());
|
|
|
|
|
filteredData = filterEntries(crawlDataPath, ctFilter);
|
|
|
|
|
addFileToTar(stream, filteredData, item.relPath());
|
|
|
|
|
logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
|
|
|
|
|
}
|
|
|
|
|
catch (NoSuchElementException ex) {
|
|
|
|
|
// filterEntries found no entry with the requested content type; skip this item
|
|
|
|
|
}
|
|
|
|
|
finally {
|
|
|
|
|
if (didFilterData) {
|
|
|
|
|
Files.deleteIfExists(crawlDataPath);
|
|
|
|
|
if (filteredData != null) {
|
|
|
|
|
Files.deleteIfExists(filteredData);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
@@ -106,12 +107,8 @@ public class SampleDataExporter {
|
|
|
|
|
Files.move(tmpTarFile, destStorage.asPath().resolve("crawl-data.tar"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/** Filters the entries in the crawl data file based on the content type.
|
|
|
|
|
* @param crawlDataPath The path to the crawl data file.
|
|
|
|
|
* @param contentTypeFilter The content type to filter by.
|
|
|
|
|
* @return The path to the filtered crawl data file, or null if an error occurred.
|
|
|
|
|
*/
|
|
|
|
|
private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException {
|
|
|
|
|
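/** Filters the entries in the crawl data file based on the content type.
 *
 * @throws NoSuchElementException if no entry has a content type equal to {@code contentTypeFilter}
 */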
|
|
|
|
|
private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException, NoSuchElementException {
|
|
|
|
|
Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered");
|
|
|
|
|
Path tempFile = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered.slop.zip");
|
|
|
|
|
|
|
|
|
@@ -132,8 +129,16 @@ public class SampleDataExporter {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
) {
|
|
|
|
|
boolean wroteEntry = false;
|
|
|
|
|
while (reader.hasRemaining()) {
|
|
|
|
|
writer.write(reader.get());
|
|
|
|
|
var entry = reader.get();
|
|
|
|
|
writer.write(entry);
|
|
|
|
|
|
|
|
|
|
wroteEntry = wroteEntry || contentTypeFilter.equals(entry.contentType());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!wroteEntry) {
|
|
|
|
|
throw new NoSuchElementException("No relevant entries");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
SlopTablePacker.packToSlopZip(tempDir, tempFile);
|
|
|
|
|