1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

12 Commits

Author SHA1 Message Date
Viktor Lofgren
4279a7f1aa (url) Fix urlencoding issues with certain symbols
Minor fix with previously urlencoded codepoints, we need to account for the fact that they are encoded in hexadecimal.
2025-05-03 23:51:39 +02:00
Viktor Lofgren
251006d4f9 (url) Fix urlencoding issues with certain symbols
Problems primarily cropped up with sideloaded Wikipedia articles, although the search engine has been returning inconsistently URL-encoded search results for a while; in many scenarios, browsers and servers have seemingly magically fixed the issues.

This addresses Issue #195 and Issue #131.
2025-05-03 23:48:45 +02:00
Viktor Lofgren
c3e99dc12a (service) Limit logging from ad hoc task heartbeats
Certain usage patterns of the ad hoc task heartbeats would lead to an incredible amount of log noise, as it would log each update.

Limit log updates to increments of 10% to avoid this problem.
2025-05-03 12:39:58 +02:00
Viktor
aaaa2de022 Merge pull request #196 from MarginaliaSearch/filter-export-sample-data
Add the ability to filter sample data based on content type
2025-05-02 13:23:49 +02:00
Viktor Lofgren
fc1388422a (actor) Add the ability to filter sample data based on content type
This will help in extracting relevant test sets for PDF processing.
2025-05-02 13:09:22 +02:00
Viktor Lofgren
b07080db16 (crawler) Don't retry requests when encountering UnknownHostException 2025-05-01 16:07:34 +02:00
Viktor Lofgren
e9d86dca4a (crawler) Add timeout to wrap-up phase of WarcInputBuffer. 2025-05-01 15:57:47 +02:00
Viktor Lofgren
1d693f0efa (build) Upgrade JIB to 3.4.5 2025-04-30 15:26:52 +02:00
Viktor Lofgren
5874a163dc (build) Upgrade gradle to 8.14 2025-04-30 15:26:37 +02:00
Viktor Lofgren
5ec7a1deab (crawler) Fix 80%-ish progress crawler stall
The crawl tasks are started in two phases: first when generating them in one loop, and then in a second loop that drains the task list. If the first loop contains a long-running crawl task that is triggered late, the rest of the crawl may halt until that task is finished.

Fixed the problem by draining and re-trying also in the first loop.
2025-04-29 12:23:51 +02:00
Viktor Lofgren
7fea2808ed (search) Fix error view
Fix rendering error when query was null

Fix border on error message.
2025-04-27 12:12:56 +02:00
Viktor Lofgren
8da74484f0 (search) Remove unused count modifier from the footer help 2025-04-27 12:08:34 +02:00
34 changed files with 292 additions and 143 deletions

View File

@@ -5,7 +5,7 @@ plugins {
// This is a workaround for a bug in the Jib plugin that causes it to stall randomly
// https://github.com/GoogleContainerTools/jib/issues/3347
id 'com.google.cloud.tools.jib' version '3.4.4' apply(false)
id 'com.google.cloud.tools.jib' version '3.4.5' apply(false)
}
group 'marginalia'
@@ -47,7 +47,7 @@ ext {
dockerImageBase='container-registry.oracle.com/graalvm/jdk:24'
dockerImageTag='latest'
dockerImageRegistry='marginalia'
jibVersion = '3.4.4'
jibVersion = '3.4.5'
}
idea {

View File

@@ -1,16 +1,14 @@
package nu.marginalia.model;
import nu.marginalia.util.QueryParams;
import org.apache.commons.lang3.StringUtils;
import javax.annotation.Nullable;
import java.io.Serializable;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.*;
import java.nio.charset.StandardCharsets;
import java.util.Objects;
import java.util.Optional;
import java.util.regex.Pattern;
public class EdgeUrl implements Serializable {
public final String proto;
@@ -33,7 +31,7 @@ public class EdgeUrl implements Serializable {
private static URI parseURI(String url) throws URISyntaxException {
try {
return new URI(urlencodeFixer(url));
return EdgeUriFactory.uriFromString(url);
} catch (URISyntaxException ex) {
throw new URISyntaxException("Failed to parse URI '" + url + "'", ex.getMessage());
}
@@ -51,58 +49,6 @@ public class EdgeUrl implements Serializable {
}
}
private static Pattern badCharPattern = Pattern.compile("[ \t\n\"<>\\[\\]()',|]");
/* Java's URI parser is a bit too strict in throwing exceptions when there's an error.
Here on the Internet, standards are like the picture on the box of the frozen pizza,
and what you get is more like what's on the inside, we try to patch things instead,
just give it a best-effort attempt att cleaning out broken or unnecessary constructions
like bad or missing URLEncoding
*/
public static String urlencodeFixer(String url) throws URISyntaxException {
var s = new StringBuilder();
String goodChars = "&.?:/-;+$#";
String hexChars = "0123456789abcdefABCDEF";
int pathIdx = findPathIdx(url);
if (pathIdx < 0) { // url looks like http://marginalia.nu
return url + "/";
}
s.append(url, 0, pathIdx);
// We don't want the fragment, and multiple fragments breaks the Java URIParser for some reason
int end = url.indexOf("#");
if (end < 0) end = url.length();
for (int i = pathIdx; i < end; i++) {
int c = url.charAt(i);
if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
s.appendCodePoint(c);
} else if (c == '%' && i + 2 < end) {
int cn = url.charAt(i + 1);
int cnn = url.charAt(i + 2);
if (hexChars.indexOf(cn) >= 0 && hexChars.indexOf(cnn) >= 0) {
s.appendCodePoint(c);
} else {
s.append("%25");
}
} else {
s.append(String.format("%%%02X", c));
}
}
return s.toString();
}
private static int findPathIdx(String url) throws URISyntaxException {
int colonIdx = url.indexOf(':');
if (colonIdx < 0 || colonIdx + 2 >= url.length()) {
throw new URISyntaxException(url, "Lacking protocol");
}
return url.indexOf('/', colonIdx + 2);
}
public EdgeUrl(URI URI) {
try {
@@ -247,3 +193,127 @@ public class EdgeUrl implements Serializable {
}
}
/* Java's URI parser is a bit too strict in throwing exceptions when there's an error.
   Here on the Internet, standards are like the picture on the box of the frozen pizza,
   and what you get is more like what's on the inside, we try to patch things instead,
   just give it a best-effort attempt at cleaning out broken or unnecessary constructions
   like bad or missing URLEncoding
*/
class EdgeUriFactory {

    private EdgeUriFactory() {
        // Static utility class, not meant to be instantiated
    }

    /** Leniently parse a URL string into a URI, repairing missing URL encoding on the way.
     * <p></p>
     * The fragment is discarded, an empty path is normalized to "/", runs of
     * slashes in the path are collapsed, and a trailing slash is preserved.
     * Valid %XX escapes already present in the input are kept as they are.
     *
     * @param url the URL to parse, must contain a protocol (e.g. "https://")
     * @return the repaired URI
     * @throws URISyntaxException if the protocol is missing, or if the URL can't be
     *                            parsed even after the repair attempt
     */
    public static URI uriFromString(String url) throws URISyntaxException {
        int authorityIdx = findAuthorityIdx(url);

        // We don't want the fragment, and multiple fragments breaks the Java URIParser for some reason
        int end = url.indexOf('#', authorityIdx);
        if (end < 0) {
            end = url.length();
        }

        // A '?' that appears inside the fragment is not a query separator
        int queryIdx = url.indexOf('?', authorityIdx);
        if (queryIdx < 0 || queryIdx > end) {
            queryIdx = end;
        }

        // A '/' that appears inside the query or fragment does not start the path;
        // in that case (or when there is no '/' at all) the path is empty,
        // e.g. http://marginalia.nu or http://marginalia.nu?id=1
        int pathIdx = url.indexOf('/', authorityIdx);
        if (pathIdx < 0 || pathIdx > queryIdx) {
            pathIdx = queryIdx;
        }

        var s = new StringBuilder(url.length() + 8);
        s.append(url, 0, pathIdx);

        recombinePaths(s, url.substring(pathIdx, queryIdx));

        if (queryIdx < end) {
            recombineQueryString(s, url.substring(queryIdx + 1, end));
        }

        return new URI(s.toString());
    }

    /** Append the path to the builder, URL encoding each path segment that needs it.
     *  An empty path is emitted as "/".
     */
    private static void recombinePaths(StringBuilder sb, String path) {
        if (path == null || path.isEmpty()) {
            sb.append('/');
            return;
        }

        boolean anySegment = false;
        for (String segment : path.split("/")) {
            if (segment.isEmpty()) {
                continue; // collapses leading and repeated slashes
            }
            sb.append('/').append(encodeIfNeeded(segment));
            anySegment = true;
        }

        // "/dir/" and "/dir" are different resources, so the trailing slash must survive;
        // a path consisting solely of slashes becomes "/"
        if (!anySegment || path.endsWith("/")) {
            sb.append('/');
        }
    }

    /** Append the query string to the builder, URL encoding keys and values that need it.
     *  The '=' separating a key from its value is never encoded.
     */
    private static void recombineQueryString(StringBuilder sb, String param) {
        if (param == null || param.isEmpty()) {
            return;
        }

        sb.append('?');

        boolean first = true;
        for (String pair : param.split("&")) {
            if (pair.isEmpty()) {
                continue;
            }

            if (first) {
                first = false;
            } else {
                sb.append('&');
            }

            // Encode key and value separately, otherwise the separator itself
            // would be turned into %3D whenever either side needs encoding
            int eqIdx = pair.indexOf('=');
            if (eqIdx < 0) {
                sb.append(encodeIfNeeded(pair));
            } else {
                sb.append(encodeIfNeeded(pair.substring(0, eqIdx)));
                sb.append('=');
                sb.append(encodeIfNeeded(pair.substring(eqIdx + 1)));
            }
        }
    }

    /** URL encode the element if it contains anything that requires it,
     *  leaving valid %XX escapes already present in the element untouched
     *  so that they are not double encoded.
     */
    private static String encodeIfNeeded(String urlElement) {
        if (!needsUrlEncode(urlElement)) {
            return urlElement;
        }

        var out = new StringBuilder(urlElement.length() + 16);
        int runStart = 0; // start of the current run of not-yet-encoded text
        int i = 0;
        while (i < urlElement.length()) {
            if (isPercentEscape(urlElement, i)) {
                if (runStart < i) {
                    out.append(URLEncoder.encode(urlElement.substring(runStart, i), StandardCharsets.UTF_8));
                }
                out.append(urlElement, i, i + 3);
                i += 3;
                runStart = i;
            } else {
                i++;
            }
        }
        if (runStart < urlElement.length()) {
            out.append(URLEncoder.encode(urlElement.substring(runStart), StandardCharsets.UTF_8));
        }
        return out.toString();
    }

    /** Test if the url element needs URL encoding.
     * <p></p>
     * Note we may have been given an already encoded path element,
     * so we include % and + in the list of good characters
     */
    private static boolean needsUrlEncode(String urlElement) {
        for (int i = 0; i < urlElement.length(); i++) {
            char c = urlElement.charAt(i);

            if (c >= 'a' && c <= 'z') continue;
            if (c >= 'A' && c <= 'Z') continue;
            if (c >= '0' && c <= '9') continue;
            if ("-_.~+?=&".indexOf(c) >= 0) continue;
            if (isPercentEscape(urlElement, i)) {
                i += 2; // skip the two hex digits of the escape
                continue;
            }

            return true;
        }

        return false;
    }

    /** Test if the string has a valid %XX escape (two hexadecimal digits) at index idx */
    private static boolean isPercentEscape(String s, int idx) {
        return s.charAt(idx) == '%'
                && idx + 2 < s.length()
                && isHexDigit(s.charAt(idx + 1))
                && isHexDigit(s.charAt(idx + 2));
    }

    private static boolean isHexDigit(char c) {
        return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
    }

    /** Find the index where the authority (host) part of the URL starts,
     *  i.e. just past the "://" following the protocol.
     *
     * @throws URISyntaxException if there is no ':' or nothing follows the protocol
     */
    private static int findAuthorityIdx(String url) throws URISyntaxException {
        int colonIdx = url.indexOf(':');
        if (colonIdx < 0 || colonIdx + 3 >= url.length()) {
            throw new URISyntaxException(url, "Lacking protocol");
        }
        return colonIdx + 3;
    }
}

View File

@@ -1,6 +1,6 @@
package nu.marginalia.model;
import nu.marginalia.model.EdgeUrl;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.net.URISyntaxException;
@@ -21,25 +21,25 @@ class EdgeUrlTest {
new EdgeUrl("https://memex.marginalia.nu/#here")
);
}
@Test
public void testParam() throws URISyntaxException {
System.out.println(new EdgeUrl("https://memex.marginalia.nu/index.php?id=1").toString());
System.out.println(new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
}
@Test
void urlencodeFixer() throws URISyntaxException {
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/#heredoc"));
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign"));
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%22-sign"));
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/\n \"huh\""));
void testUriFromString() throws URISyntaxException {
Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.uriFromString("https://www.example.com/#heredoc").toString());
Assertions.assertEquals("https://www.example.com/%25-sign", EdgeUriFactory.uriFromString("https://www.example.com/%-sign").toString());
Assertions.assertEquals("https://www.example.com/%22-sign", EdgeUriFactory.uriFromString("https://www.example.com/%22-sign").toString());
Assertions.assertEquals("https://www.example.com/%0A+%22huh%22", EdgeUriFactory.uriFromString("https://www.example.com/\n \"huh\"").toString());
Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", EdgeUriFactory.uriFromString("https://en.wikipedia.org/wiki/Sámi").toString());
}
@Test
void testParms() throws URISyntaxException {
System.out.println(new EdgeUrl("https://search.marginalia.nu/?id=123"));
System.out.println(new EdgeUrl("https://search.marginalia.nu/?t=123"));
System.out.println(new EdgeUrl("https://search.marginalia.nu/?v=123"));
System.out.println(new EdgeUrl("https://search.marginalia.nu/?m=123"));
System.out.println(new EdgeUrl("https://search.marginalia.nu/?follow=123"));
Assertions.assertEquals("id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").param);
Assertions.assertEquals("t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").param);
Assertions.assertEquals("v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").param);
Assertions.assertEquals("id=1", new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").param);
Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").param);
Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").param);
Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?m=123").param);
Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?follow=123").param);
}
}

View File

@@ -59,16 +59,13 @@ public class ProcessAdHocTaskHeartbeatImpl implements AutoCloseable, ProcessAdHo
*/
@Override
public void progress(String step, int stepProgress, int stepCount) {
int lastProgress = this.progress;
this.step = step;
// off by one since we calculate the progress based on the number of steps,
// and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
// final progress being 80% and not 100%)
this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);
logger.info("ProcessTask {} progress: {}%", taskBase, progress);
if (this.progress / 10 != lastProgress / 10) {
logger.info("ProcessTask {} progress: {}%", taskBase, progress);
}
}
/** Wrap a collection to provide heartbeat progress updates as it's iterated through */

View File

@@ -57,16 +57,13 @@ public class ServiceAdHocTaskHeartbeatImpl implements AutoCloseable, ServiceAdHo
*/
@Override
public void progress(String step, int stepProgress, int stepCount) {
int lastProgress = this.progress;
this.step = step;
// off by one since we calculate the progress based on the number of steps,
// and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
// final progress being 80% and not 100%)
this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);
logger.info("ServiceTask {} progress: {}%", taskBase, progress);
if (this.progress / 10 != lastProgress / 10) {
    // Only log when the progress moves into a new 10% bucket, to limit log noise.
    // This is the service heartbeat, so the message is tagged ServiceTask
    // (as it was before the rate limiting was added), not ProcessTask.
    logger.info("ServiceTask {} progress: {}%", taskBase, progress);
}
}
public void shutDown() {

View File

@@ -48,12 +48,13 @@ public class ExecutorExportClient {
return msgId;
}
public void exportSampleData(int node, FileStorageId fid, int size, String name) {
public void exportSampleData(int node, FileStorageId fid, int size, String ctFilter, String name) {
channelPool.call(ExecutorExportApiBlockingStub::exportSampleData)
.forNode(node)
.run(RpcExportSampleData.newBuilder()
.setFileStorageId(fid.id())
.setSize(size)
.setCtFilter(ctFilter)
.setName(name)
.build());
}

View File

@@ -100,6 +100,7 @@ message RpcExportSampleData {
int64 fileStorageId = 1;
int32 size = 2;
string name = 3;
string ctFilter = 4;
}
message RpcDownloadSampleData {
string sampleSet = 1;

View File

@@ -26,32 +26,32 @@ public class ExportSampleDataActor extends RecordActorPrototype {
private final MqOutbox exportTasksOutbox;
private final Logger logger = LoggerFactory.getLogger(getClass());
public record Export(FileStorageId crawlId, int size, String name) implements ActorStep {}
public record Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) implements ActorStep {
public Run(FileStorageId crawlId, FileStorageId destId, int size, String name) {
this(crawlId, destId, size, name, -1);
public record Export(FileStorageId crawlId, int size, String ctFilter, String name) implements ActorStep {}
/** Actor step that dispatches the sample data export and then awaits its completion.
 *  A negative msgId means the export request has not been sent yet.
 */
public record Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name, long msgId) implements ActorStep {
    /** Create a Run step for which no export request has been sent yet (msgId = -1).
     *  The parameter order matches the record components; previously the names of
     *  ctFilter and name were swapped here, and it only worked because callers
     *  passed the arguments positionally in the canonical order.
     */
    public Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name) {
        this(crawlId, destId, size, ctFilter, name, -1);
    }
}
@Override
public ActorStep transition(ActorStep self) throws Exception {
return switch(self) {
case Export(FileStorageId crawlId, int size, String name) -> {
case Export(FileStorageId crawlId, int size, String ctFilter, String name) -> {
var storage = storageService.allocateStorage(FileStorageType.EXPORT,
"crawl-sample-export",
"Crawl Data Sample " + name + "/" + size + " " + LocalDateTime.now()
);
if (storage == null) yield new Error("Bad storage id");
yield new Run(crawlId, storage.id(), size, name);
yield new Run(crawlId, storage.id(), size, ctFilter, name);
}
case Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) when msgId < 0 -> {
case Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name, long msgId) when msgId < 0 -> {
storageService.setFileStorageState(destId, FileStorageState.NEW);
long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, size, name));
yield new Run(crawlId, destId, size, name, newMsgId);
long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, ctFilter, size, name));
yield new Run(crawlId, destId, size, ctFilter, name, newMsgId);
}
case Run(_, FileStorageId destId, _, _, long msgId) -> {
case Run(_, FileStorageId destId, _, _, _, long msgId) -> {
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
if (rsp.state() != MqMessageState.OK) {
@@ -70,7 +70,7 @@ public class ExportSampleDataActor extends RecordActorPrototype {
@Override
public String describe() {
return "Export RSS/Atom feeds from crawl data";
return "Export sample crawl data";
}
@Inject

View File

@@ -49,6 +49,7 @@ public class ExecutorExportGrpcService
new ExportSampleDataActor.Export(
FileStorageId.of(request.getFileStorageId()),
request.getSize(),
request.getCtFilter(),
request.getName()
)
);

View File

@@ -264,17 +264,16 @@ public class CrawlerMain extends ProcessMainClass {
if (workLog.isJobFinished(crawlSpec.domain))
continue;
var task = new CrawlTask(
crawlSpec,
anchorTagsSource,
outputDir,
warcArchiver,
domainStateDb,
workLog);
var task = new CrawlTask(crawlSpec, anchorTagsSource, outputDir, warcArchiver, domainStateDb, workLog);
// Try to run immediately, to avoid unnecessarily keeping the entire work set in RAM
if (!trySubmitDeferredTask(task)) {
// Otherwise add to the taskList for deferred execution
// Drain the retry queue to the taskList, and try to submit any tasks that are in the retry queue
retryQueue.drainTo(taskList);
taskList.removeIf(this::trySubmitDeferredTask);
// Then add this new task to the retry queue
taskList.add(task);
}
}

View File

@@ -51,6 +51,7 @@ import javax.net.ssl.SSLException;
import java.io.IOException;
import java.net.SocketTimeoutException;
import java.net.URISyntaxException;
import java.net.UnknownHostException;
import java.security.NoSuchAlgorithmException;
import java.time.Duration;
import java.time.Instant;
@@ -635,14 +636,12 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
@Override
public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
if (exception instanceof SocketTimeoutException) { // Timeouts are not recoverable
return false;
}
if (exception instanceof SSLException) { // SSL exceptions are unlikely to be recoverable
return false;
}
return executionCount <= 3;
return switch (exception) {
case SocketTimeoutException ste -> false;
case SSLException ssle -> false;
case UnknownHostException uhe -> false;
default -> executionCount <= 3;
};
}
@Override

View File

@@ -57,6 +57,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
return new ErrorBuffer();
}
Instant start = Instant.now();
InputStream is = null;
try {
is = entity.getContent();
@@ -71,8 +72,25 @@ public abstract class WarcInputBuffer implements AutoCloseable {
}
}
finally {
// We're required to consume the stream to avoid leaking connections,
// but we also don't want to get stuck on slow or malicious connections
// forever, so we set a time limit on this phase and call abort() if it's exceeded.
try {
is.skip(Long.MAX_VALUE);
while (is != null) {
    // Consume some data.  Note that skip may return 0 if the stream is empty
    // or for other unspecified reasons, so we need to check
    // with read() as well to determine if the stream is done
    if (is.skip(65536) == 0 && is.read() == -1) {
        is = null;
    }
    // Check if the time limit has been exceeded.  This must happen on every
    // iteration that did not hit end-of-stream, including the ones where skip()
    // returned 0 but read() still produced a byte; otherwise a connection that
    // trickles data could keep us here past the time limit.
    else if (Duration.between(start, Instant.now()).compareTo(timeLimit) > 0) {
        request.abort();
        is = null;
    }
}
}
catch (IOException e) {
// Ignore the exception

View File

@@ -53,6 +53,8 @@ dependencies {
implementation libs.commons.compress
implementation libs.commons.codec
implementation libs.jsoup
implementation libs.slop
implementation libs.jwarc

View File

@@ -3,11 +3,15 @@ package nu.marginalia.extractor;
import com.google.inject.Inject;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.process.log.WorkLogEntry;
import nu.marginalia.slop.SlopCrawlDataRecord;
import nu.marginalia.slop.SlopTablePacker;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;
import nu.marginalia.storage.model.FileStorageId;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.compress.utils.IOUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.nio.file.Files;
@@ -27,7 +31,7 @@ public class SampleDataExporter {
public SampleDataExporter(FileStorageService storageService) {
this.storageService = storageService;
}
public void export(FileStorageId crawlId, FileStorageId destId, int size, String name) throws SQLException, IOException {
public void export(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name) throws SQLException, IOException {
FileStorage destStorage = storageService.getStorage(destId);
Path inputDir = storageService.getStorage(crawlId).asPath();
@@ -54,6 +58,7 @@ public class SampleDataExporter {
Path newCrawlerLogFile = Files.createTempFile(destStorage.asPath(), "crawler", ".log",
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
try (var bw = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
for (var item : entriesAll) {
bw.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
@@ -72,7 +77,22 @@ public class SampleDataExporter {
Path crawlDataPath = inputDir.resolve(item.relPath());
if (!Files.exists(crawlDataPath)) continue;
addFileToTar(stream, crawlDataPath, item.relPath());
if (StringUtils.isBlank(ctFilter)) {
addFileToTar(stream, crawlDataPath, item.relPath());
}
else /* filter != null */ {
boolean didFilterData = false;
try {
crawlDataPath = filterEntries(crawlDataPath, ctFilter);
didFilterData = true;
addFileToTar(stream, crawlDataPath, item.relPath());
}
finally {
if (didFilterData) {
Files.deleteIfExists(crawlDataPath);
}
}
}
}
addFileToTar(stream, newCrawlerLogFile, "crawler.log");
@@ -86,6 +106,46 @@ public class SampleDataExporter {
Files.move(tmpTarFile, destStorage.asPath().resolve("crawl-data.tar"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
}
/** Filters the entries in the crawl data file based on the content type.
 * <p></p>
 * The matching records are written to a temporary directory next to the input file,
 * which is then packed into a temporary slop zip file.  The temporary directory is
 * always removed; the zip file is removed if the filtering fails.
 *
 * @param crawlDataPath The path to the crawl data file.
 * @param contentTypeFilter The content type to filter by.
 * @return The path to the filtered crawl data file.  The caller is responsible for deleting it.
 * @throws IOException if the data can not be read, written or packed
 */
private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException {
    Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered");
    Path tempFile = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered.slop.zip");

    Files.createDirectory(tempDir);

    try (var writer = new SlopCrawlDataRecord.Writer(tempDir);
         var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) {
             @Override
             public boolean filter(String url, int status, String contentType) {
                 if (contentTypeFilter.equals(contentType))
                     return true;
                 else if (contentType != null && contentType.startsWith("x-marginalia/"))
                     // This is a metadata entry, typically domain or redirect information
                     // let's keep those to not confuse the consumer of the data, which might
                     // expect at least the domain summary
                     return true;
                 return false;
             }
         }
    ) {
        while (reader.hasRemaining()) {
            writer.write(reader.get());
        }
        // NOTE(review): packing happens while the writer is still open -- confirm
        // that the writer does not hold back data until close()
        SlopTablePacker.packToSlopZip(tempDir, tempFile);
    }
    catch (IOException | RuntimeException ex) {
        // Don't leave a partially written zip file behind; the caller only
        // cleans up the file when this method returns normally
        try {
            Files.deleteIfExists(tempFile);
        }
        catch (IOException cleanupEx) {
            ex.addSuppressed(cleanupEx);
        }
        throw ex;
    }
    finally {
        FileUtils.deleteDirectory(tempDir.toFile());
    }

    return tempFile;
}
private void addFileToTar(TarArchiveOutputStream outputStream, Path file, String fileName) throws IOException {
var entry = outputStream.createArchiveEntry(file.toFile(), fileName);
entry.setSize(Files.size(file));

View File

@@ -92,7 +92,7 @@ public class ExportTasksMain extends ProcessMainClass {
termFrequencyExporter.export(request.crawlId, request.destId);
break;
case SAMPLE_DATA:
sampleDataExporter.export(request.crawlId, request.destId, request.size, request.name);
sampleDataExporter.export(request.crawlId, request.destId, request.size, request.ctFilter, request.name);
break;
case ADJACENCIES:
websiteAdjacenciesCalculator.export();

View File

@@ -16,6 +16,7 @@ public class ExportTaskRequest {
public FileStorageId destId;
public int size;
public String name;
public String ctFilter;
public ExportTaskRequest(Task task) {
this.task = task;
@@ -42,12 +43,13 @@ public class ExportTaskRequest {
return request;
}
public static ExportTaskRequest sampleData(FileStorageId crawlId, FileStorageId destId, int size, String name) {
public static ExportTaskRequest sampleData(FileStorageId crawlId, FileStorageId destId, String ctFilter, int size, String name) {
ExportTaskRequest request = new ExportTaskRequest(Task.SAMPLE_DATA);
request.crawlId = crawlId;
request.destId = destId;
request.size = size;
request.name = name;
request.ctFilter = ctFilter;
return request;
}

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
java {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
application {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
application {

View File

@@ -5,7 +5,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
application {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'gg.jte.gradle' version '3.1.15'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
application {

View File

@@ -26,7 +26,7 @@
<!-- Main content -->
<main class="flex-1 p-4 max-w-2xl space-y-4">
<div class="border dark:border-gray-600 rounded bg-white text-black dark:bg-gray-800 dark:text-white text-m p-4">
<div class="border border-gray-300 dark:border-gray-600 rounded bg-white text-black dark:bg-gray-800 dark:text-white text-m p-4">
<div class="flex space-x-3 place-items-baseline">
<i class="fa fa-circle-exclamation text-red-800"></i>
<div class="grow">${model.errorTitle()}</div>

View File

@@ -80,10 +80,6 @@
<tr><td>rank&gt;50</td><td>The ranking of the website is at least 50 in a span of 1 - 255</td></tr>
<tr><td>rank&lt;50</td><td>The ranking of the website is at most 50 in a span of 1 - 255</td></tr>
<tr><td>count&gt;10</td><td> The search term must appear in at least 10 results form the domain</td></tr>
<tr><td>count&lt;10</td><td> The search term must appear in at most 10 results from the domain</td></tr>
<tr><td>format:html5</td><td>Filter documents using the HTML5 standard. This is typically modern websites.</td></tr>
<tr><td>format:xhtml</td><td>Filter documents using the XHTML standard</td></tr>
<tr><td>format:html123</td><td>Filter documents using the HTML standards 1, 2, and 3. This is typically very old websites. </td></tr>

View File

@@ -7,7 +7,7 @@
<form class="flex-1 max-w-2xl" action="/search">
<div class="flex">
@if (query.isBlank())
@if (query != null && query.isBlank())
<%-- Add autofocus if the query is blank --%>
<input type="text"
class="shadow-inner flex-1 dark:bg-black dark:text-gray-100 bg-gray-50 border dark:border-gray-600 border-gray-300 text-gray-900 text-sm rounded-sm block w-full p-2.5"

View File

@@ -2,7 +2,7 @@ plugins {
id 'java'
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
java {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
application {

View File

@@ -2,7 +2,7 @@ plugins {
id 'java'
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
java {

View File

@@ -321,9 +321,10 @@ public class ControlNodeActionsService {
private Object exportSampleData(Request req, Response rsp) {
FileStorageId source = parseSourceFileStorageId(req.queryParams("source"));
int size = Integer.parseInt(req.queryParams("size"));
String ctFilter = req.queryParams("ctFilter");
String name = req.queryParams("name");
exportClient.exportSampleData(Integer.parseInt(req.params("id")), source, size, name);
exportClient.exportSampleData(Integer.parseInt(req.params("id")), source, size, ctFilter, name);
return "";
}

View File

@@ -35,6 +35,11 @@
<div><input type="text" name="size" id="size" pattern="\d+" /></div>
<small class="text-muted">How many domains to include in the sample set</small>
</div>
<div class="mb-3">
<label for="ctFilter">Content Type Filter</label>
<div><input type="text" name="ctFilter" id="ctFilter" /></div>
<small class="text-muted">If set, includes only documents with the specified content type value</small>
</div>
<div class="mb-3">
<label for="name">Name</label>
<div><input type="text" name="name" id="name" /></div>

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
application {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
application {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
application {

View File

@@ -3,7 +3,7 @@ plugins {
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.4'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
java {

View File

@@ -1,5 +1,5 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-8.10-bin.zip
distributionUrl=https\://services.gradle.org/distributions/gradle-8.14-bin.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists