1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

9 Commits

Author SHA1 Message Date
Viktor Lofgren
251006d4f9 (url) Fix urlencoding issues with certain symbols
Problems primarily cropped up with sideloaded wikipedia articles, though the search engine has been returning inconsistently URLEncoded search results for a while, though browsers and servers have seemingly magically fixed the issues in many scenarios.

This addresses Issue #195 and Issue #131.
2025-05-03 23:48:45 +02:00
Viktor Lofgren
c3e99dc12a (service) Limit logging from ad hoc task heartbeats
Certain usage patterns of the ad hoc task heartbeats would lead to an incredible amount of log noise, as it would log each update.

Limit log updates to increments of 10% to avoid this problem.
2025-05-03 12:39:58 +02:00
Viktor
aaaa2de022 Merge pull request #196 from MarginaliaSearch/filter-export-sample-data
Add the ability to filter sample data based on content type
2025-05-02 13:23:49 +02:00
Viktor Lofgren
fc1388422a (actor) Add the ability to filter sample data based on content type
This will help in extracting relevant test sets for PDF processing.
2025-05-02 13:09:22 +02:00
Viktor Lofgren
b07080db16 (crawler) Don't retry requests when encountering UnknownHostException 2025-05-01 16:07:34 +02:00
Viktor Lofgren
e9d86dca4a (crawler) Add timeout to wrap-up phase of WarcInputBuffer. 2025-05-01 15:57:47 +02:00
Viktor Lofgren
1d693f0efa (build) Upgrade JIB to 3.4.5 2025-04-30 15:26:52 +02:00
Viktor Lofgren
5874a163dc (build) Upgrade gradle to 8.14 2025-04-30 15:26:37 +02:00
Viktor Lofgren
5ec7a1deab (crawler) Fix 80%-ish progress crawler stall
Since the crawl tasks are started in two phases, first when generating them in one loop, and then in a second loop that drains the task list; if the first loop contains a long-running crawl task that is triggered late, the rest of the crawl may halt until that task is finish.

Fixed the problem by draining and re-trying also in the first loop.
2025-04-29 12:23:51 +02:00
31 changed files with 286 additions and 137 deletions

View File

@@ -5,7 +5,7 @@ plugins {
// This is a workaround for a bug in the Jib plugin that causes it to stall randomly // This is a workaround for a bug in the Jib plugin that causes it to stall randomly
// https://github.com/GoogleContainerTools/jib/issues/3347 // https://github.com/GoogleContainerTools/jib/issues/3347
id 'com.google.cloud.tools.jib' version '3.4.4' apply(false) id 'com.google.cloud.tools.jib' version '3.4.5' apply(false)
} }
group 'marginalia' group 'marginalia'
@@ -47,7 +47,7 @@ ext {
dockerImageBase='container-registry.oracle.com/graalvm/jdk:24' dockerImageBase='container-registry.oracle.com/graalvm/jdk:24'
dockerImageTag='latest' dockerImageTag='latest'
dockerImageRegistry='marginalia' dockerImageRegistry='marginalia'
jibVersion = '3.4.4' jibVersion = '3.4.5'
} }
idea { idea {

View File

@@ -1,16 +1,14 @@
package nu.marginalia.model; package nu.marginalia.model;
import nu.marginalia.util.QueryParams; import nu.marginalia.util.QueryParams;
import org.apache.commons.lang3.StringUtils;
import javax.annotation.Nullable; import javax.annotation.Nullable;
import java.io.Serializable; import java.io.Serializable;
import java.net.MalformedURLException; import java.net.*;
import java.net.URI; import java.nio.charset.StandardCharsets;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Objects; import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import java.util.regex.Pattern;
public class EdgeUrl implements Serializable { public class EdgeUrl implements Serializable {
public final String proto; public final String proto;
@@ -33,7 +31,7 @@ public class EdgeUrl implements Serializable {
private static URI parseURI(String url) throws URISyntaxException { private static URI parseURI(String url) throws URISyntaxException {
try { try {
return new URI(urlencodeFixer(url)); return EdgeUriFactory.uriFromString(url);
} catch (URISyntaxException ex) { } catch (URISyntaxException ex) {
throw new URISyntaxException("Failed to parse URI '" + url + "'", ex.getMessage()); throw new URISyntaxException("Failed to parse URI '" + url + "'", ex.getMessage());
} }
@@ -51,58 +49,6 @@ public class EdgeUrl implements Serializable {
} }
} }
private static Pattern badCharPattern = Pattern.compile("[ \t\n\"<>\\[\\]()',|]");
/* Java's URI parser is a bit too strict in throwing exceptions when there's an error.
Here on the Internet, standards are like the picture on the box of the frozen pizza,
and what you get is more like what's on the inside, we try to patch things instead,
just give it a best-effort attempt att cleaning out broken or unnecessary constructions
like bad or missing URLEncoding
*/
public static String urlencodeFixer(String url) throws URISyntaxException {
var s = new StringBuilder();
String goodChars = "&.?:/-;+$#";
String hexChars = "0123456789abcdefABCDEF";
int pathIdx = findPathIdx(url);
if (pathIdx < 0) { // url looks like http://marginalia.nu
return url + "/";
}
s.append(url, 0, pathIdx);
// We don't want the fragment, and multiple fragments breaks the Java URIParser for some reason
int end = url.indexOf("#");
if (end < 0) end = url.length();
for (int i = pathIdx; i < end; i++) {
int c = url.charAt(i);
if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
s.appendCodePoint(c);
} else if (c == '%' && i + 2 < end) {
int cn = url.charAt(i + 1);
int cnn = url.charAt(i + 2);
if (hexChars.indexOf(cn) >= 0 && hexChars.indexOf(cnn) >= 0) {
s.appendCodePoint(c);
} else {
s.append("%25");
}
} else {
s.append(String.format("%%%02X", c));
}
}
return s.toString();
}
private static int findPathIdx(String url) throws URISyntaxException {
int colonIdx = url.indexOf(':');
if (colonIdx < 0 || colonIdx + 2 >= url.length()) {
throw new URISyntaxException(url, "Lacking protocol");
}
return url.indexOf('/', colonIdx + 2);
}
public EdgeUrl(URI URI) { public EdgeUrl(URI URI) {
try { try {
@@ -247,3 +193,123 @@ public class EdgeUrl implements Serializable {
} }
} }
/* Java's URI parser is a bit too strict in throwing exceptions when there's an error.
Here on the Internet, standards are like the picture on the box of the frozen pizza,
and what you get is more like what's on the inside, we try to patch things instead,
just give it a best-effort attempt att cleaning out broken or unnecessary constructions
like bad or missing URLEncoding
*/
class EdgeUriFactory {
public static URI uriFromString(String url) throws URISyntaxException {
var s = new StringBuilder();
int pathIdx = findPathIdx(url);
if (pathIdx < 0) { // url looks like http://marginalia.nu
return new URI(url + "/");
}
s.append(url, 0, pathIdx);
// We don't want the fragment, and multiple fragments breaks the Java URIParser for some reason
int end = url.indexOf("#");
if (end < 0) end = url.length();
int queryIdx = url.indexOf('?');
if (queryIdx < 0) queryIdx = end;
recombinePaths(s, url.substring(pathIdx, queryIdx));
if (queryIdx < end) {
recombineQueryString(s, url.substring(queryIdx + 1, end));
}
return new URI(s.toString());
}
private static void recombinePaths(StringBuilder sb, String path) {
if (path == null || path.isEmpty()) {
return;
}
String[] pathParts = StringUtils.split(path, '/');
if (pathParts.length == 0) {
sb.append('/');
return;
}
for (String pathPart : pathParts) {
if (pathPart.isEmpty()) continue;
if (needsUrlEncode(pathPart)) {
sb.append('/');
sb.append(URLEncoder.encode(pathPart, StandardCharsets.UTF_8));
} else {
sb.append('/');
sb.append(pathPart);
}
}
}
private static void recombineQueryString(StringBuilder sb, String param) {
if (param == null || param.isEmpty()) {
return;
}
sb.append('?');
String[] pathParts = StringUtils.split(param, '&');
boolean first = true;
for (String pathPart : pathParts) {
if (pathPart.isEmpty()) continue;
if (first) {
first = false;
} else {
sb.append('&');
}
if (needsUrlEncode(pathPart)) {
sb.append(URLEncoder.encode(pathPart, StandardCharsets.UTF_8));
} else {
sb.append(pathPart);
}
}
}
/** Test if the url element needs URL encoding.
* <p></p>
* Note we may have been given an already encoded path element,
* so we include % and + in the list of good characters
*/
private static boolean needsUrlEncode(String urlElement) {
for (int i = 0; i < urlElement.length(); i++) {
char c = urlElement.charAt(i);
if (c >= 'a' && c <= 'z') continue;
if (c >= 'A' && c <= 'Z') continue;
if (c >= '0' && c <= '9') continue;
if ("-_.~+?=&".indexOf(c) >= 0) continue;
if (c == '%' && i + 2 < urlElement.length()) {
char c1 = urlElement.charAt(i + 1);
char c2 = urlElement.charAt(i + 2);
if (c1 >= '0' && c1 <= '9' && c2 >= '0' && c2 <= '9') {
i += 2;
continue;
}
}
return true;
}
return false;
}
private static int findPathIdx(String url) throws URISyntaxException {
int colonIdx = url.indexOf(':');
if (colonIdx < 0 || colonIdx + 3 >= url.length()) {
throw new URISyntaxException(url, "Lacking protocol");
}
return url.indexOf('/', colonIdx + 3);
}
}

View File

@@ -1,6 +1,6 @@
package nu.marginalia.model; package nu.marginalia.model;
import nu.marginalia.model.EdgeUrl; import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import java.net.URISyntaxException; import java.net.URISyntaxException;
@@ -21,25 +21,25 @@ class EdgeUrlTest {
new EdgeUrl("https://memex.marginalia.nu/#here") new EdgeUrl("https://memex.marginalia.nu/#here")
); );
} }
@Test @Test
public void testParam() throws URISyntaxException { void testUriFromString() throws URISyntaxException {
System.out.println(new EdgeUrl("https://memex.marginalia.nu/index.php?id=1").toString()); Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.uriFromString("https://www.example.com/#heredoc").toString());
System.out.println(new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString()); Assertions.assertEquals("https://www.example.com/%25-sign", EdgeUriFactory.uriFromString("https://www.example.com/%-sign").toString());
} Assertions.assertEquals("https://www.example.com/%22-sign", EdgeUriFactory.uriFromString("https://www.example.com/%22-sign").toString());
@Test Assertions.assertEquals("https://www.example.com/%0A+%22huh%22", EdgeUriFactory.uriFromString("https://www.example.com/\n \"huh\"").toString());
void urlencodeFixer() throws URISyntaxException { Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", EdgeUriFactory.uriFromString("https://en.wikipedia.org/wiki/Sámi").toString());
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/#heredoc"));
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign"));
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%22-sign"));
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/\n \"huh\""));
} }
@Test @Test
void testParms() throws URISyntaxException { void testParms() throws URISyntaxException {
System.out.println(new EdgeUrl("https://search.marginalia.nu/?id=123")); Assertions.assertEquals("id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").param);
System.out.println(new EdgeUrl("https://search.marginalia.nu/?t=123")); Assertions.assertEquals("t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").param);
System.out.println(new EdgeUrl("https://search.marginalia.nu/?v=123")); Assertions.assertEquals("v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").param);
System.out.println(new EdgeUrl("https://search.marginalia.nu/?m=123")); Assertions.assertEquals("id=1", new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").param);
System.out.println(new EdgeUrl("https://search.marginalia.nu/?follow=123")); Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").param);
Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").param);
Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?m=123").param);
Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?follow=123").param);
} }
} }

View File

@@ -59,16 +59,13 @@ public class ProcessAdHocTaskHeartbeatImpl implements AutoCloseable, ProcessAdHo
*/ */
@Override @Override
public void progress(String step, int stepProgress, int stepCount) { public void progress(String step, int stepProgress, int stepCount) {
int lastProgress = this.progress;
this.step = step; this.step = step;
// off by one since we calculate the progress based on the number of steps,
// and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
// final progress being 80% and not 100%)
this.progress = (int) Math.round(100. * stepProgress / (double) stepCount); this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);
logger.info("ProcessTask {} progress: {}%", taskBase, progress); if (this.progress / 10 != lastProgress / 10) {
logger.info("ProcessTask {} progress: {}%", taskBase, progress);
}
} }
/** Wrap a collection to provide heartbeat progress updates as it's iterated through */ /** Wrap a collection to provide heartbeat progress updates as it's iterated through */

View File

@@ -57,16 +57,13 @@ public class ServiceAdHocTaskHeartbeatImpl implements AutoCloseable, ServiceAdHo
*/ */
@Override @Override
public void progress(String step, int stepProgress, int stepCount) { public void progress(String step, int stepProgress, int stepCount) {
int lastProgress = this.progress;
this.step = step; this.step = step;
// off by one since we calculate the progress based on the number of steps,
// and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
// final progress being 80% and not 100%)
this.progress = (int) Math.round(100. * stepProgress / (double) stepCount); this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);
logger.info("ServiceTask {} progress: {}%", taskBase, progress); if (this.progress / 10 != lastProgress / 10) {
logger.info("ProcessTask {} progress: {}%", taskBase, progress);
}
} }
public void shutDown() { public void shutDown() {

View File

@@ -48,12 +48,13 @@ public class ExecutorExportClient {
return msgId; return msgId;
} }
public void exportSampleData(int node, FileStorageId fid, int size, String name) { public void exportSampleData(int node, FileStorageId fid, int size, String ctFilter, String name) {
channelPool.call(ExecutorExportApiBlockingStub::exportSampleData) channelPool.call(ExecutorExportApiBlockingStub::exportSampleData)
.forNode(node) .forNode(node)
.run(RpcExportSampleData.newBuilder() .run(RpcExportSampleData.newBuilder()
.setFileStorageId(fid.id()) .setFileStorageId(fid.id())
.setSize(size) .setSize(size)
.setCtFilter(ctFilter)
.setName(name) .setName(name)
.build()); .build());
} }

View File

@@ -100,6 +100,7 @@ message RpcExportSampleData {
int64 fileStorageId = 1; int64 fileStorageId = 1;
int32 size = 2; int32 size = 2;
string name = 3; string name = 3;
string ctFilter = 4;
} }
message RpcDownloadSampleData { message RpcDownloadSampleData {
string sampleSet = 1; string sampleSet = 1;

View File

@@ -26,32 +26,32 @@ public class ExportSampleDataActor extends RecordActorPrototype {
private final MqOutbox exportTasksOutbox; private final MqOutbox exportTasksOutbox;
private final Logger logger = LoggerFactory.getLogger(getClass()); private final Logger logger = LoggerFactory.getLogger(getClass());
public record Export(FileStorageId crawlId, int size, String name) implements ActorStep {} public record Export(FileStorageId crawlId, int size, String ctFilter, String name) implements ActorStep {}
public record Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) implements ActorStep { public record Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name, long msgId) implements ActorStep {
public Run(FileStorageId crawlId, FileStorageId destId, int size, String name) { public Run(FileStorageId crawlId, FileStorageId destId, int size, String name, String ctFilter) {
this(crawlId, destId, size, name, -1); this(crawlId, destId, size, name, ctFilter,-1);
} }
} }
@Override @Override
public ActorStep transition(ActorStep self) throws Exception { public ActorStep transition(ActorStep self) throws Exception {
return switch(self) { return switch(self) {
case Export(FileStorageId crawlId, int size, String name) -> { case Export(FileStorageId crawlId, int size, String ctFilter, String name) -> {
var storage = storageService.allocateStorage(FileStorageType.EXPORT, var storage = storageService.allocateStorage(FileStorageType.EXPORT,
"crawl-sample-export", "crawl-sample-export",
"Crawl Data Sample " + name + "/" + size + " " + LocalDateTime.now() "Crawl Data Sample " + name + "/" + size + " " + LocalDateTime.now()
); );
if (storage == null) yield new Error("Bad storage id"); if (storage == null) yield new Error("Bad storage id");
yield new Run(crawlId, storage.id(), size, name); yield new Run(crawlId, storage.id(), size, ctFilter, name);
} }
case Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) when msgId < 0 -> { case Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name, long msgId) when msgId < 0 -> {
storageService.setFileStorageState(destId, FileStorageState.NEW); storageService.setFileStorageState(destId, FileStorageState.NEW);
long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, size, name)); long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, ctFilter, size, name));
yield new Run(crawlId, destId, size, name, newMsgId); yield new Run(crawlId, destId, size, ctFilter, name, newMsgId);
} }
case Run(_, FileStorageId destId, _, _, long msgId) -> { case Run(_, FileStorageId destId, _, _, _, long msgId) -> {
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId); var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
if (rsp.state() != MqMessageState.OK) { if (rsp.state() != MqMessageState.OK) {
@@ -70,7 +70,7 @@ public class ExportSampleDataActor extends RecordActorPrototype {
@Override @Override
public String describe() { public String describe() {
return "Export RSS/Atom feeds from crawl data"; return "Export sample crawl data";
} }
@Inject @Inject

View File

@@ -49,6 +49,7 @@ public class ExecutorExportGrpcService
new ExportSampleDataActor.Export( new ExportSampleDataActor.Export(
FileStorageId.of(request.getFileStorageId()), FileStorageId.of(request.getFileStorageId()),
request.getSize(), request.getSize(),
request.getCtFilter(),
request.getName() request.getName()
) )
); );

View File

@@ -264,17 +264,16 @@ public class CrawlerMain extends ProcessMainClass {
if (workLog.isJobFinished(crawlSpec.domain)) if (workLog.isJobFinished(crawlSpec.domain))
continue; continue;
var task = new CrawlTask( var task = new CrawlTask(crawlSpec, anchorTagsSource, outputDir, warcArchiver, domainStateDb, workLog);
crawlSpec,
anchorTagsSource,
outputDir,
warcArchiver,
domainStateDb,
workLog);
// Try to run immediately, to avoid unnecessarily keeping the entire work set in RAM // Try to run immediately, to avoid unnecessarily keeping the entire work set in RAM
if (!trySubmitDeferredTask(task)) { if (!trySubmitDeferredTask(task)) {
// Otherwise add to the taskList for deferred execution
// Drain the retry queue to the taskList, and try to submit any tasks that are in the retry queue
retryQueue.drainTo(taskList);
taskList.removeIf(this::trySubmitDeferredTask);
// Then add this new task to the retry queue
taskList.add(task); taskList.add(task);
} }
} }

View File

@@ -51,6 +51,7 @@ import javax.net.ssl.SSLException;
import java.io.IOException; import java.io.IOException;
import java.net.SocketTimeoutException; import java.net.SocketTimeoutException;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import java.net.UnknownHostException;
import java.security.NoSuchAlgorithmException; import java.security.NoSuchAlgorithmException;
import java.time.Duration; import java.time.Duration;
import java.time.Instant; import java.time.Instant;
@@ -635,14 +636,12 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
@Override @Override
public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) { public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
if (exception instanceof SocketTimeoutException) { // Timeouts are not recoverable return switch (exception) {
return false; case SocketTimeoutException ste -> false;
} case SSLException ssle -> false;
if (exception instanceof SSLException) { // SSL exceptions are unlikely to be recoverable case UnknownHostException uhe -> false;
return false; default -> executionCount <= 3;
} };
return executionCount <= 3;
} }
@Override @Override

View File

@@ -57,6 +57,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
return new ErrorBuffer(); return new ErrorBuffer();
} }
Instant start = Instant.now();
InputStream is = null; InputStream is = null;
try { try {
is = entity.getContent(); is = entity.getContent();
@@ -71,8 +72,25 @@ public abstract class WarcInputBuffer implements AutoCloseable {
} }
} }
finally { finally {
// We're required to consume the stream to avoid leaking connections,
// but we also don't want to get stuck on slow or malicious connections
// forever, so we set a time limit on this phase and call abort() if it's exceeded.
try { try {
is.skip(Long.MAX_VALUE); while (is != null) {
// Consume some data
if (is.skip(65536) == 0) {
// Note that skip may return 0 if the stream is empty
// or for other unspecified reasons, so we need to check
// with read() as well to determine if the stream is done
if (is.read() == -1)
is = null;
}
// Check if the time limit has been exceeded
else if (Duration.between(start, Instant.now()).compareTo(timeLimit) > 0) {
request.abort();
is = null;
}
}
} }
catch (IOException e) { catch (IOException e) {
// Ignore the exception // Ignore the exception

View File

@@ -53,6 +53,8 @@ dependencies {
implementation libs.commons.compress implementation libs.commons.compress
implementation libs.commons.codec implementation libs.commons.codec
implementation libs.jsoup implementation libs.jsoup
implementation libs.slop
implementation libs.jwarc

View File

@@ -3,11 +3,15 @@ package nu.marginalia.extractor;
import com.google.inject.Inject; import com.google.inject.Inject;
import nu.marginalia.process.log.WorkLog; import nu.marginalia.process.log.WorkLog;
import nu.marginalia.process.log.WorkLogEntry; import nu.marginalia.process.log.WorkLogEntry;
import nu.marginalia.slop.SlopCrawlDataRecord;
import nu.marginalia.slop.SlopTablePacker;
import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage; import nu.marginalia.storage.model.FileStorage;
import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageId;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream; import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.compress.utils.IOUtils; import org.apache.commons.compress.utils.IOUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
@@ -27,7 +31,7 @@ public class SampleDataExporter {
public SampleDataExporter(FileStorageService storageService) { public SampleDataExporter(FileStorageService storageService) {
this.storageService = storageService; this.storageService = storageService;
} }
public void export(FileStorageId crawlId, FileStorageId destId, int size, String name) throws SQLException, IOException { public void export(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name) throws SQLException, IOException {
FileStorage destStorage = storageService.getStorage(destId); FileStorage destStorage = storageService.getStorage(destId);
Path inputDir = storageService.getStorage(crawlId).asPath(); Path inputDir = storageService.getStorage(crawlId).asPath();
@@ -54,6 +58,7 @@ public class SampleDataExporter {
Path newCrawlerLogFile = Files.createTempFile(destStorage.asPath(), "crawler", ".log", Path newCrawlerLogFile = Files.createTempFile(destStorage.asPath(), "crawler", ".log",
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
try (var bw = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) { try (var bw = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
for (var item : entriesAll) { for (var item : entriesAll) {
bw.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n"); bw.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
@@ -72,7 +77,22 @@ public class SampleDataExporter {
Path crawlDataPath = inputDir.resolve(item.relPath()); Path crawlDataPath = inputDir.resolve(item.relPath());
if (!Files.exists(crawlDataPath)) continue; if (!Files.exists(crawlDataPath)) continue;
addFileToTar(stream, crawlDataPath, item.relPath()); if (StringUtils.isBlank(ctFilter)) {
addFileToTar(stream, crawlDataPath, item.relPath());
}
else /* filter != null */ {
boolean didFilterData = false;
try {
crawlDataPath = filterEntries(crawlDataPath, ctFilter);
didFilterData = true;
addFileToTar(stream, crawlDataPath, item.relPath());
}
finally {
if (didFilterData) {
Files.deleteIfExists(crawlDataPath);
}
}
}
} }
addFileToTar(stream, newCrawlerLogFile, "crawler.log"); addFileToTar(stream, newCrawlerLogFile, "crawler.log");
@@ -86,6 +106,46 @@ public class SampleDataExporter {
Files.move(tmpTarFile, destStorage.asPath().resolve("crawl-data.tar"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING); Files.move(tmpTarFile, destStorage.asPath().resolve("crawl-data.tar"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
} }
/** Filters the entries in the crawl data file based on the content type.
* @param crawlDataPath The path to the crawl data file.
* @param contentTypeFilter The content type to filter by.
* @return The path to the filtered crawl data file, or null if an error occurred.
*/
private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException {
Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered");
Path tempFile = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered.slop.zip");
Files.createDirectory(tempDir);
try (var writer = new SlopCrawlDataRecord.Writer(tempDir);
var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) {
@Override
public boolean filter(String url, int status, String contentType) {
if (contentTypeFilter.equals(contentType))
return true;
else if (contentType.startsWith("x-marginalia/"))
// This is a metadata entry, typically domain or redirect information
// let's keep those to not confuse the consumer of the data, which might
// expect at least the domain summary
return true;
return false;
}
}
) {
while (reader.hasRemaining()) {
writer.write(reader.get());
}
SlopTablePacker.packToSlopZip(tempDir, tempFile);
}
finally {
FileUtils.deleteDirectory(tempDir.toFile());
}
return tempFile;
}
private void addFileToTar(TarArchiveOutputStream outputStream, Path file, String fileName) throws IOException { private void addFileToTar(TarArchiveOutputStream outputStream, Path file, String fileName) throws IOException {
var entry = outputStream.createArchiveEntry(file.toFile(), fileName); var entry = outputStream.createArchiveEntry(file.toFile(), fileName);
entry.setSize(Files.size(file)); entry.setSize(Files.size(file));

View File

@@ -92,7 +92,7 @@ public class ExportTasksMain extends ProcessMainClass {
termFrequencyExporter.export(request.crawlId, request.destId); termFrequencyExporter.export(request.crawlId, request.destId);
break; break;
case SAMPLE_DATA: case SAMPLE_DATA:
sampleDataExporter.export(request.crawlId, request.destId, request.size, request.name); sampleDataExporter.export(request.crawlId, request.destId, request.size, request.ctFilter, request.name);
break; break;
case ADJACENCIES: case ADJACENCIES:
websiteAdjacenciesCalculator.export(); websiteAdjacenciesCalculator.export();

View File

@@ -16,6 +16,7 @@ public class ExportTaskRequest {
public FileStorageId destId; public FileStorageId destId;
public int size; public int size;
public String name; public String name;
public String ctFilter;
public ExportTaskRequest(Task task) { public ExportTaskRequest(Task task) {
this.task = task; this.task = task;
@@ -42,12 +43,13 @@ public class ExportTaskRequest {
return request; return request;
} }
public static ExportTaskRequest sampleData(FileStorageId crawlId, FileStorageId destId, int size, String name) { public static ExportTaskRequest sampleData(FileStorageId crawlId, FileStorageId destId, String ctFilter, int size, String name) {
ExportTaskRequest request = new ExportTaskRequest(Task.SAMPLE_DATA); ExportTaskRequest request = new ExportTaskRequest(Task.SAMPLE_DATA);
request.crawlId = crawlId; request.crawlId = crawlId;
request.destId = destId; request.destId = destId;
request.size = size; request.size = size;
request.name = name; request.name = name;
request.ctFilter = ctFilter;
return request; return request;
} }

View File

@@ -3,7 +3,7 @@ plugins {
id 'application' id 'application'
id 'jvm-test-suite' id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4' id 'com.google.cloud.tools.jib' version '3.4.5'
} }
java { java {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application' id 'application'
id 'jvm-test-suite' id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4' id 'com.google.cloud.tools.jib' version '3.4.5'
} }
application { application {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application' id 'application'
id 'jvm-test-suite' id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4' id 'com.google.cloud.tools.jib' version '3.4.5'
} }
application { application {

View File

@@ -5,7 +5,7 @@ plugins {
id 'application' id 'application'
id 'jvm-test-suite' id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4' id 'com.google.cloud.tools.jib' version '3.4.5'
} }
application { application {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application' id 'application'
id 'jvm-test-suite' id 'jvm-test-suite'
id 'gg.jte.gradle' version '3.1.15' id 'gg.jte.gradle' version '3.1.15'
id 'com.google.cloud.tools.jib' version '3.4.4' id 'com.google.cloud.tools.jib' version '3.4.5'
} }
application { application {

View File

@@ -2,7 +2,7 @@ plugins {
id 'java' id 'java'
id 'application' id 'application'
id 'jvm-test-suite' id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4' id 'com.google.cloud.tools.jib' version '3.4.5'
} }
java { java {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application' id 'application'
id 'jvm-test-suite' id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4' id 'com.google.cloud.tools.jib' version '3.4.5'
} }
application { application {

View File

@@ -2,7 +2,7 @@ plugins {
id 'java' id 'java'
id 'application' id 'application'
id 'jvm-test-suite' id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4' id 'com.google.cloud.tools.jib' version '3.4.5'
} }
java { java {

View File

@@ -321,9 +321,10 @@ public class ControlNodeActionsService {
private Object exportSampleData(Request req, Response rsp) { private Object exportSampleData(Request req, Response rsp) {
FileStorageId source = parseSourceFileStorageId(req.queryParams("source")); FileStorageId source = parseSourceFileStorageId(req.queryParams("source"));
int size = Integer.parseInt(req.queryParams("size")); int size = Integer.parseInt(req.queryParams("size"));
String ctFilter = req.queryParams("ctFilter");
String name = req.queryParams("name"); String name = req.queryParams("name");
exportClient.exportSampleData(Integer.parseInt(req.params("id")), source, size, name); exportClient.exportSampleData(Integer.parseInt(req.params("id")), source, size, ctFilter, name);
return ""; return "";
} }

View File

@@ -35,6 +35,11 @@
<div><input type="text" name="size" id="size" pattern="\d+" /></div> <div><input type="text" name="size" id="size" pattern="\d+" /></div>
<small class="text-muted">How many domains to include in the sample set</small> <small class="text-muted">How many domains to include in the sample set</small>
</div> </div>
<div class="mb-3">
<label for="ctFilter">Content Type Filter</label>
<div><input type="text" name="ctFilter" id="ctFilter" /></div>
<small class="text-muted">If set, includes only documents with the specified content type value</small>
</div>
<div class="mb-3"> <div class="mb-3">
<label for="name">Name</label> <label for="name">Name</label>
<div><input type="text" name="name" id="name" /></div> <div><input type="text" name="name" id="name" /></div>

View File

@@ -3,7 +3,7 @@ plugins {
id 'application' id 'application'
id 'jvm-test-suite' id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4' id 'com.google.cloud.tools.jib' version '3.4.5'
} }
application { application {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application' id 'application'
id 'jvm-test-suite' id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4' id 'com.google.cloud.tools.jib' version '3.4.5'
} }
application { application {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application' id 'application'
id 'jvm-test-suite' id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4' id 'com.google.cloud.tools.jib' version '3.4.5'
} }
application { application {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application' id 'application'
id 'jvm-test-suite' id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4' id 'com.google.cloud.tools.jib' version '3.4.5'
} }
java { java {

View File

@@ -1,5 +1,5 @@
distributionBase=GRADLE_USER_HOME distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-8.10-bin.zip distributionUrl=https\://services.gradle.org/distributions/gradle-8.14-bin.zip
zipStoreBase=GRADLE_USER_HOME zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists zipStorePath=wrapper/dists