1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

Compare commits

...

7 Commits

Author SHA1 Message Date
Viktor Lofgren
8be88afcf3 (url) Fix urlencoding issues with certain symbols
We also need to apply the fix when performing toString() on the EdgeUrl, since the URI class will URLDecode the input.

The change also alters the parseURI method to only run the URLEncode-fixer during parsing if URI throws an exception.  This bad path is obviously going to be slower, but realistically, most URLs are valid, so it's probably a significant optimization to do it like this.
2025-05-04 12:58:13 +02:00
Viktor Lofgren
0e3c00d3e1 (url) Fix urlencoding issues with certain symbols
Minor fix of issue where url sanitizer would strip some trailing slashes.
2025-05-03 23:58:28 +02:00
Viktor Lofgren
4279a7f1aa (url) Fix urlencoding issues with certain symbols
Minor fix with previously urlencoded codepoints, we need to account for the fact that they are encoded in hexadecimal.
2025-05-03 23:51:39 +02:00
Viktor Lofgren
251006d4f9 (url) Fix urlencoding issues with certain symbols
Problems primarily cropped up with sideloaded Wikipedia articles, but the search engine has been returning inconsistently URLEncoded search results for a while; browsers and servers have seemingly magically fixed the issues in many scenarios.

This addresses Issue #195 and Issue #131.
2025-05-03 23:48:45 +02:00
Viktor Lofgren
c3e99dc12a (service) Limit logging from ad hoc task heartbeats
Certain usage patterns of the ad hoc task heartbeats would lead to an incredible amount of log noise, as it would log each update.

Limit log updates to increments of 10% to avoid this problem.
2025-05-03 12:39:58 +02:00
Viktor
aaaa2de022 Merge pull request #196 from MarginaliaSearch/filter-export-sample-data
Add the ability to filter sample data based on content type
2025-05-02 13:23:49 +02:00
Viktor Lofgren
fc1388422a (actor) Add the ability to filter sample data based on content type
This will help in extracting relevant test sets for PDF processing.
2025-05-02 13:09:22 +02:00
14 changed files with 294 additions and 110 deletions

View File

@@ -1,16 +1,14 @@
package nu.marginalia.model;
import nu.marginalia.util.QueryParams;
import org.apache.commons.lang3.StringUtils;
import javax.annotation.Nullable;
import java.io.Serializable;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.*;
import java.nio.charset.StandardCharsets;
import java.util.Objects;
import java.util.Optional;
import java.util.regex.Pattern;
public class EdgeUrl implements Serializable {
public final String proto;
@@ -33,9 +31,21 @@ public class EdgeUrl implements Serializable {
/** Parses {@code url} into a {@link URI}, trying the strict JDK parser first and
 * falling back on {@link EdgeUriFactory#parseURILenient(String)} only when that fails.
 * <p></p>
 * Most URLs are valid, so the lenient (and slower) repair path is only paid for
 * input the strict parser rejects.
 *
 * @param url the raw URL string
 * @return the parsed URI
 * @throws URISyntaxException if neither the strict nor the lenient parser accepts the URL;
 *                            the lenient parser's exception is attached as the cause
 */
private static URI parseURI(String url) throws URISyntaxException {
    try {
        return new URI(url);
    } catch (URISyntaxException _) {
        try {
            /* Java's URI parser is a bit too strict in throwing exceptions when there's an error.

               Here on the Internet, standards are like the picture on the box of the frozen pizza,
               and what you get is more like what's on the inside, we try to patch things instead,
               just give it a best-effort attempt at cleaning out broken or unnecessary constructions
               like bad or missing URLEncoding
             */
            return EdgeUriFactory.parseURILenient(url);
        }
        catch (URISyntaxException ex2) {
            var failure = new URISyntaxException("Failed to parse URI '" + url + "'", ex2.getMessage());
            // Keep the underlying failure reachable for whoever ends up debugging this
            failure.initCause(ex2);
            throw failure;
        }
    }
}
@@ -51,58 +61,6 @@ public class EdgeUrl implements Serializable {
}
}
private static Pattern badCharPattern = Pattern.compile("[ \t\n\"<>\\[\\]()',|]");
/* Java's URI parser is a bit too strict in throwing exceptions when there's an error.
Here on the Internet, standards are like the picture on the box of the frozen pizza,
and what you get is more like what's on the inside, we try to patch things instead,
just give it a best-effort attempt at cleaning out broken or unnecessary constructions
like bad or missing URLEncoding
*/
public static String urlencodeFixer(String url) throws URISyntaxException {
var s = new StringBuilder();
String goodChars = "&.?:/-;+$#";
String hexChars = "0123456789abcdefABCDEF";
int pathIdx = findPathIdx(url);
if (pathIdx < 0) { // url looks like http://marginalia.nu
return url + "/";
}
s.append(url, 0, pathIdx);
// We don't want the fragment, and multiple fragments breaks the Java URIParser for some reason
int end = url.indexOf("#");
if (end < 0) end = url.length();
for (int i = pathIdx; i < end; i++) {
int c = url.charAt(i);
if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
s.appendCodePoint(c);
} else if (c == '%' && i + 2 < end) {
int cn = url.charAt(i + 1);
int cnn = url.charAt(i + 2);
if (hexChars.indexOf(cn) >= 0 && hexChars.indexOf(cnn) >= 0) {
s.appendCodePoint(c);
} else {
s.append("%25");
}
} else {
s.append(String.format("%%%02X", c));
}
}
return s.toString();
}
private static int findPathIdx(String url) throws URISyntaxException {
int colonIdx = url.indexOf(':');
if (colonIdx < 0 || colonIdx + 2 >= url.length()) {
throw new URISyntaxException(url, "Lacking protocol");
}
return url.indexOf('/', colonIdx + 2);
}
public EdgeUrl(URI URI) {
try {
@@ -166,11 +124,10 @@ public class EdgeUrl implements Serializable {
sb.append(port);
}
sb.append(path);
EdgeUriFactory.urlencodePath(sb, path);
if (param != null) {
sb.append('?');
sb.append(param);
EdgeUriFactory.urlencodeQuery(sb, param);
}
return sb.toString();
@@ -247,3 +204,138 @@ public class EdgeUrl implements Serializable {
}
}
/** Best-effort parsing and re-encoding of URLs that are not strictly valid.
 * <p></p>
 * The same encoding routines are used both when parsing a URL leniently and when
 * rendering an EdgeUrl back to a string, so they are written to leave input that
 * is already correctly encoded untouched.
 */
class EdgeUriFactory {

    /** Characters besides ASCII letters and digits that are passed through unencoded. */
    private static final String PASSTHROUGH_CHARS = "-_.~+?=&";

    /** Parse a URL, repairing bad or missing URL encoding in the path and query.
     * <p></p>
     * The fragment is discarded.  A URL with no path component is given the root path.
     *
     * @param url the raw URL, expected to look like {@code scheme://authority[/path][?query][#fragment]}
     * @return the repaired URL as a URI
     * @throws URISyntaxException if the URL lacks a scheme, or is still rejected by
     *                            {@link URI} after repairs
     */
    public static URI parseURILenient(String url) throws URISyntaxException {
        int authorityIdx = findAuthorityIdx(url);

        // We don't want the fragment, and multiple fragments breaks the Java URIParser for some reason
        int end = url.indexOf('#', authorityIdx);
        if (end < 0) end = url.length();

        // A '?' only starts the query if it comes before the fragment
        int queryIdx = url.indexOf('?', authorityIdx);
        if (queryIdx < 0 || queryIdx > end) queryIdx = end;

        // A '/' only starts the path if it comes before the query and fragment
        int pathIdx = url.indexOf('/', authorityIdx);
        if (pathIdx >= queryIdx) pathIdx = -1;

        var s = new StringBuilder(url.length() + 16);

        if (pathIdx < 0) { // url looks like http://marginalia.nu or http://marginalia.nu?q=1
            s.append(url, 0, queryIdx).append('/');
        } else {
            s.append(url, 0, pathIdx);
            urlencodePath(s, url.substring(pathIdx, queryIdx));
        }

        if (queryIdx < end) {
            urlencodeQuery(s, url.substring(queryIdx + 1, end));
        }

        return new URI(s.toString());
    }

    /** Break apart the path element of an URI into its components, and then
     * urlencode any component that needs it, and recombine it into a single
     * path element again.
     * <p></p>
     * Empty segments (repeated slashes) are collapsed, a trailing slash is preserved,
     * and valid {@code %XX} escapes already present in a segment are kept verbatim.
     * Spaces are encoded as {@code %20}.
     *
     * @param sb   the builder the encoded path is appended to
     * @param path the path to encode; nothing is appended if it is null or empty
     */
    public static void urlencodePath(StringBuilder sb, String path) {
        if (path == null || path.isEmpty()) {
            return;
        }

        // String.split drops trailing empty strings; leading and inner ones are skipped below
        for (String pathPart : path.split("/")) {
            if (pathPart.isEmpty()) continue;

            sb.append('/');
            appendEncoded(sb, pathPart, true);
        }

        // Also covers a path consisting solely of slashes, which yields the root path
        if (path.endsWith("/")) {
            sb.append('/');
        }
    }

    /** Break apart the query element of a URI into its components, and then
     * urlencode any component that needs it, and recombine it into a single
     * query element again.
     * <p></p>
     * Keys and values are encoded separately so that the {@code =} between them survives.
     * The leading {@code ?} is appended by this method, and only if at least one
     * non-empty parameter is present.
     *
     * @param sb    the builder the encoded query is appended to
     * @param param the query string without the leading '?'; nothing is appended if it is null or empty
     */
    public static void urlencodeQuery(StringBuilder sb, String param) {
        if (param == null || param.isEmpty()) {
            return;
        }

        char separator = '?';
        for (String queryPart : param.split("&")) {
            if (queryPart.isEmpty()) continue;

            sb.append(separator);
            separator = '&';

            int eqIdx = queryPart.indexOf('=');
            if (eqIdx < 0) {
                appendEncoded(sb, queryPart, false);
            } else {
                appendEncoded(sb, queryPart.substring(0, eqIdx), false);
                sb.append('=');
                appendEncoded(sb, queryPart.substring(eqIdx + 1), false);
            }
        }
    }

    /** Test if the url element needs URL encoding.
     * <p></p>
     * Note we may have been given an already encoded path element,
     * so we include % and + in the list of good characters
     */
    static boolean needsUrlEncode(String urlElement) {
        for (int i = 0; i < urlElement.length(); i++) {
            char c = urlElement.charAt(i);

            if (isPassthrough(c)) continue;

            if (isPercentEscape(urlElement, i)) {
                i += 2;
                continue;
            }

            return true;
        }

        return false;
    }

    /** Append the element, url-encoding it if needed.  Valid %XX escapes already
     * in the element are copied as they are, rather than having their '%' encoded again.
     */
    private static void appendEncoded(StringBuilder sb, String element, boolean spaceAsPercent20) {
        if (!needsUrlEncode(element)) {
            sb.append(element);
            return;
        }

        int runStart = 0;
        for (int i = 0; i < element.length(); i++) {
            if (!isPercentEscape(element, i)) continue;

            appendEncodedRun(sb, element.substring(runStart, i), spaceAsPercent20);
            sb.append(element, i, i + 3);

            i += 2;
            runStart = i + 1;
        }
        appendEncodedRun(sb, element.substring(runStart), spaceAsPercent20);
    }

    /** Url-encode a run of text containing no pre-existing escapes and append it. */
    private static void appendEncodedRun(StringBuilder sb, String run, boolean spaceAsPercent20) {
        if (run.isEmpty()) return;

        String encoded = URLEncoder.encode(run, StandardCharsets.UTF_8);

        // URLEncoder emits '+' for space; a literal '+' has already become %2B at this point
        sb.append(spaceAsPercent20 ? encoded.replace("+", "%20") : encoded);
    }

    /** True if the character is an ASCII letter, digit, or one of the passthrough symbols. */
    private static boolean isPassthrough(char c) {
        if (c >= 'a' && c <= 'z') return true;
        if (c >= 'A' && c <= 'Z') return true;
        if (c >= '0' && c <= '9') return true;
        return PASSTHROUGH_CHARS.indexOf(c) >= 0;
    }

    /** True if a '%' followed by two hexadecimal digits starts at index i. */
    private static boolean isPercentEscape(String s, int i) {
        return s.charAt(i) == '%'
            && i + 2 < s.length()
            && isHexDigit(s.charAt(i + 1))
            && isHexDigit(s.charAt(i + 2));
    }

    private static boolean isHexDigit(char c) {
        return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
    }

    /** Find the index where the authority part of a URL begins.
     * <p></p>
     * This is the position three characters past the first colon, i.e. just after
     * the "://" of a URL such as http://marginalia.nu/
     *
     * @throws URISyntaxException if there is no colon, or nothing follows the scheme separator
     */
    private static int findAuthorityIdx(String url) throws URISyntaxException {
        int colonIdx = url.indexOf(':');
        if (colonIdx < 0 || colonIdx + 3 >= url.length()) {
            throw new URISyntaxException(url, "Lacking scheme");
        }
        return colonIdx + 3;
    }
}

View File

@@ -1,6 +1,6 @@
package nu.marginalia.model;
import nu.marginalia.model.EdgeUrl;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.net.URISyntaxException;
@@ -21,25 +21,50 @@ class EdgeUrlTest {
new EdgeUrl("https://memex.marginalia.nu/#here")
);
}
@Test
public void testParam() throws URISyntaxException {
System.out.println(new EdgeUrl("https://memex.marginalia.nu/index.php?id=1").toString());
System.out.println(new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
}
@Test
void urlencodeFixer() throws URISyntaxException {
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/#heredoc"));
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign"));
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%22-sign"));
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/\n \"huh\""));
void testUriFromString() throws URISyntaxException {
// We test these URLs several times as we perform URLEncode-fixing both when parsing the URL and when
// converting it back to a string, we want to ensure there is no changes along the way.
Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/").getPath());
Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/").toString());
Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/").toString());
Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").getPath());
Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").toString());
Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/#heredoc").toString());
Assertions.assertEquals("/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").getPath());
Assertions.assertEquals("https://www.example.com/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").toString());
Assertions.assertEquals("https://www.example.com/trailingslash/", new EdgeUrl("https://www.example.com/trailingslash/").toString());
Assertions.assertEquals("/%-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").getPath());
Assertions.assertEquals("https://www.example.com/%25-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").toString());
Assertions.assertEquals("https://www.example.com/%25-sign", new EdgeUrl("https://www.example.com/%-sign").toString());
Assertions.assertEquals("/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").getPath());
Assertions.assertEquals("https://www.example.com/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").toString());
Assertions.assertEquals("https://www.example.com/%22-sign", new EdgeUrl("https://www.example.com/%22-sign").toString());
Assertions.assertEquals("/\n \"huh\"", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").getPath());
Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").toString());
Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", new EdgeUrl("https://www.example.com/\n \"huh\"").toString());
Assertions.assertEquals("/wiki/Sámi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").getPath());
Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").toString());
Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", new EdgeUrl("https://en.wikipedia.org/wiki/Sámi").toString());
}
@Test
void testParms() throws URISyntaxException {
System.out.println(new EdgeUrl("https://search.marginalia.nu/?id=123"));
System.out.println(new EdgeUrl("https://search.marginalia.nu/?t=123"));
System.out.println(new EdgeUrl("https://search.marginalia.nu/?v=123"));
System.out.println(new EdgeUrl("https://search.marginalia.nu/?m=123"));
System.out.println(new EdgeUrl("https://search.marginalia.nu/?follow=123"));
Assertions.assertEquals("id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").param);
Assertions.assertEquals("t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").param);
Assertions.assertEquals("v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").param);
Assertions.assertEquals("id=1", new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").param);
Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").param);
Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").param);
Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?m=123").param);
Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?follow=123").param);
}
}

View File

@@ -59,16 +59,13 @@ public class ProcessAdHocTaskHeartbeatImpl implements AutoCloseable, ProcessAdHo
*/
@Override
public void progress(String step, int stepProgress, int stepCount) {
    // Remember the previous percentage so we can tell when a 10% boundary is crossed
    final int lastProgress = this.progress;

    this.step = step;
    this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);

    // Callers may report progress very frequently; only log when the percentage
    // moves into a different 10% bucket to keep the log noise down
    if (this.progress / 10 != lastProgress / 10) {
        logger.info("ProcessTask {} progress: {}%", taskBase, progress);
    }
}
/** Wrap a collection to provide heartbeat progress updates as it's iterated through */

View File

@@ -57,16 +57,13 @@ public class ServiceAdHocTaskHeartbeatImpl implements AutoCloseable, ServiceAdHo
*/
@Override
public void progress(String step, int stepProgress, int stepCount) {
    // Remember the previous percentage so we can tell when a 10% boundary is crossed
    final int lastProgress = this.progress;

    this.step = step;
    this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);

    // Callers may report progress very frequently; only log when the percentage
    // moves into a different 10% bucket to keep the log noise down
    if (this.progress / 10 != lastProgress / 10) {
        logger.info("ServiceTask {} progress: {}%", taskBase, progress);
    }
}
public void shutDown() {

View File

@@ -48,12 +48,13 @@ public class ExecutorExportClient {
return msgId;
}
public void exportSampleData(int node, FileStorageId fid, int size, String name) {
public void exportSampleData(int node, FileStorageId fid, int size, String ctFilter, String name) {
channelPool.call(ExecutorExportApiBlockingStub::exportSampleData)
.forNode(node)
.run(RpcExportSampleData.newBuilder()
.setFileStorageId(fid.id())
.setSize(size)
.setCtFilter(ctFilter)
.setName(name)
.build());
}

View File

@@ -100,6 +100,7 @@ message RpcExportSampleData {
int64 fileStorageId = 1;
int32 size = 2;
string name = 3;
string ctFilter = 4;
}
message RpcDownloadSampleData {
string sampleSet = 1;

View File

@@ -26,32 +26,32 @@ public class ExportSampleDataActor extends RecordActorPrototype {
private final MqOutbox exportTasksOutbox;
private final Logger logger = LoggerFactory.getLogger(getClass());
public record Export(FileStorageId crawlId, int size, String name) implements ActorStep {}
public record Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) implements ActorStep {
public Run(FileStorageId crawlId, FileStorageId destId, int size, String name) {
this(crawlId, destId, size, name, -1);
public record Export(FileStorageId crawlId, int size, String ctFilter, String name) implements ActorStep {}
public record Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name, long msgId) implements ActorStep {
public Run(FileStorageId crawlId, FileStorageId destId, int size, String name, String ctFilter) {
this(crawlId, destId, size, name, ctFilter,-1);
}
}
@Override
public ActorStep transition(ActorStep self) throws Exception {
return switch(self) {
case Export(FileStorageId crawlId, int size, String name) -> {
case Export(FileStorageId crawlId, int size, String ctFilter, String name) -> {
var storage = storageService.allocateStorage(FileStorageType.EXPORT,
"crawl-sample-export",
"Crawl Data Sample " + name + "/" + size + " " + LocalDateTime.now()
);
if (storage == null) yield new Error("Bad storage id");
yield new Run(crawlId, storage.id(), size, name);
yield new Run(crawlId, storage.id(), size, ctFilter, name);
}
case Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) when msgId < 0 -> {
case Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name, long msgId) when msgId < 0 -> {
storageService.setFileStorageState(destId, FileStorageState.NEW);
long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, size, name));
yield new Run(crawlId, destId, size, name, newMsgId);
long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, ctFilter, size, name));
yield new Run(crawlId, destId, size, ctFilter, name, newMsgId);
}
case Run(_, FileStorageId destId, _, _, long msgId) -> {
case Run(_, FileStorageId destId, _, _, _, long msgId) -> {
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
if (rsp.state() != MqMessageState.OK) {
@@ -70,7 +70,7 @@ public class ExportSampleDataActor extends RecordActorPrototype {
@Override
public String describe() {
return "Export RSS/Atom feeds from crawl data";
return "Export sample crawl data";
}
@Inject

View File

@@ -49,6 +49,7 @@ public class ExecutorExportGrpcService
new ExportSampleDataActor.Export(
FileStorageId.of(request.getFileStorageId()),
request.getSize(),
request.getCtFilter(),
request.getName()
)
);

View File

@@ -53,6 +53,8 @@ dependencies {
implementation libs.commons.compress
implementation libs.commons.codec
implementation libs.jsoup
implementation libs.slop
implementation libs.jwarc

View File

@@ -3,11 +3,15 @@ package nu.marginalia.extractor;
import com.google.inject.Inject;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.process.log.WorkLogEntry;
import nu.marginalia.slop.SlopCrawlDataRecord;
import nu.marginalia.slop.SlopTablePacker;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;
import nu.marginalia.storage.model.FileStorageId;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.compress.utils.IOUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.nio.file.Files;
@@ -27,7 +31,7 @@ public class SampleDataExporter {
public SampleDataExporter(FileStorageService storageService) {
this.storageService = storageService;
}
public void export(FileStorageId crawlId, FileStorageId destId, int size, String name) throws SQLException, IOException {
public void export(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name) throws SQLException, IOException {
FileStorage destStorage = storageService.getStorage(destId);
Path inputDir = storageService.getStorage(crawlId).asPath();
@@ -54,6 +58,7 @@ public class SampleDataExporter {
Path newCrawlerLogFile = Files.createTempFile(destStorage.asPath(), "crawler", ".log",
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
try (var bw = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
for (var item : entriesAll) {
bw.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
@@ -72,7 +77,22 @@ public class SampleDataExporter {
Path crawlDataPath = inputDir.resolve(item.relPath());
if (!Files.exists(crawlDataPath)) continue;
addFileToTar(stream, crawlDataPath, item.relPath());
if (StringUtils.isBlank(ctFilter)) {
addFileToTar(stream, crawlDataPath, item.relPath());
}
else /* filter != null */ {
boolean didFilterData = false;
try {
crawlDataPath = filterEntries(crawlDataPath, ctFilter);
didFilterData = true;
addFileToTar(stream, crawlDataPath, item.relPath());
}
finally {
if (didFilterData) {
Files.deleteIfExists(crawlDataPath);
}
}
}
}
addFileToTar(stream, newCrawlerLogFile, "crawler.log");
@@ -86,6 +106,46 @@ public class SampleDataExporter {
Files.move(tmpTarFile, destStorage.asPath().resolve("crawl-data.tar"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
}
/** Filters the entries in the crawl data file based on the content type.
 * <p></p>
 * The filtered records are written to a temporary sibling directory, which is packed
 * into a sibling ".filtered.slop.zip" file and then removed.  Entries with a content type
 * starting with "x-marginalia/" are always retained.
 *
 * @param crawlDataPath The path to the crawl data file.
 * @param contentTypeFilter The content type to filter by.
 * @return The path to the filtered crawl data file; the caller is responsible for deleting it.
 * @throws IOException if the data can not be read, written or packed.  Any partially
 *                     written output file is removed before the exception propagates.
 */
private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException {
    Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered");
    Path tempFile = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered.slop.zip");

    Files.createDirectory(tempDir);

    try {
        try (var writer = new SlopCrawlDataRecord.Writer(tempDir);
             var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) {
                 @Override
                 public boolean filter(String url, int status, String contentType) {
                     // Guard against records lacking a content type, startsWith() below would NPE
                     if (contentType == null)
                         return false;

                     if (contentTypeFilter.equals(contentType))
                         return true;

                     // This is a metadata entry, typically domain or redirect information
                     // let's keep those to not confuse the consumer of the data, which might
                     // expect at least the domain summary
                     return contentType.startsWith("x-marginalia/");
                 }
             }
        ) {
            while (reader.hasRemaining()) {
                writer.write(reader.get());
            }

            // NOTE(review): packing happens while the writer is still open, as in the original;
            // confirm the writer does not hold back buffered data until close()
            SlopTablePacker.packToSlopZip(tempDir, tempFile);
        }

        return tempFile;
    }
    catch (IOException | RuntimeException ex) {
        // The caller only cleans up the file we return, so don't leave a partial one behind
        try {
            Files.deleteIfExists(tempFile);
        }
        catch (IOException cleanupEx) {
            ex.addSuppressed(cleanupEx);
        }
        throw ex;
    }
    finally {
        FileUtils.deleteDirectory(tempDir.toFile());
    }
}
private void addFileToTar(TarArchiveOutputStream outputStream, Path file, String fileName) throws IOException {
var entry = outputStream.createArchiveEntry(file.toFile(), fileName);
entry.setSize(Files.size(file));

View File

@@ -92,7 +92,7 @@ public class ExportTasksMain extends ProcessMainClass {
termFrequencyExporter.export(request.crawlId, request.destId);
break;
case SAMPLE_DATA:
sampleDataExporter.export(request.crawlId, request.destId, request.size, request.name);
sampleDataExporter.export(request.crawlId, request.destId, request.size, request.ctFilter, request.name);
break;
case ADJACENCIES:
websiteAdjacenciesCalculator.export();

View File

@@ -16,6 +16,7 @@ public class ExportTaskRequest {
public FileStorageId destId;
public int size;
public String name;
public String ctFilter;
public ExportTaskRequest(Task task) {
this.task = task;
@@ -42,12 +43,13 @@ public class ExportTaskRequest {
return request;
}
public static ExportTaskRequest sampleData(FileStorageId crawlId, FileStorageId destId, int size, String name) {
public static ExportTaskRequest sampleData(FileStorageId crawlId, FileStorageId destId, String ctFilter, int size, String name) {
ExportTaskRequest request = new ExportTaskRequest(Task.SAMPLE_DATA);
request.crawlId = crawlId;
request.destId = destId;
request.size = size;
request.name = name;
request.ctFilter = ctFilter;
return request;
}

View File

@@ -321,9 +321,10 @@ public class ControlNodeActionsService {
private Object exportSampleData(Request req, Response rsp) {
FileStorageId source = parseSourceFileStorageId(req.queryParams("source"));
int size = Integer.parseInt(req.queryParams("size"));
String ctFilter = req.queryParams("ctFilter");
String name = req.queryParams("name");
exportClient.exportSampleData(Integer.parseInt(req.params("id")), source, size, name);
exportClient.exportSampleData(Integer.parseInt(req.params("id")), source, size, ctFilter, name);
return "";
}

View File

@@ -35,6 +35,11 @@
<div><input type="text" name="size" id="size" pattern="\d+" /></div>
<small class="text-muted">How many domains to include in the sample set</small>
</div>
<div class="mb-3">
<label for="ctFilter">Content Type Filter</label>
<div><input type="text" name="ctFilter" id="ctFilter" /></div>
<small class="text-muted">If set, includes only documents with the specified content type value</small>
</div>
<div class="mb-3">
<label for="name">Name</label>
<div><input type="text" name="name" id="name" /></div>