1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

15 Commits

Author SHA1 Message Date
Viktor Lofgren
18700e1919 (sample) Fix bug where slop files would not be saved despite containing data 2025-05-06 13:38:21 +02:00
Viktor Lofgren
120b431998 (crawler) Fix outdated assumptions about content types and http status codes always being 200 when good.
We now sometimes get 206 when good.
2025-05-06 13:18:30 +02:00
Viktor Lofgren
71dad99326 (crawler) Revisitor should not demand a 200, but support a 206 as well 2025-05-06 13:11:52 +02:00
Viktor Lofgren
c1e8afdf86 (crawler) Remove domains from pending crawl tasks queue when retrying 2025-05-06 12:56:30 +02:00
Viktor Lofgren
fa32dddc24 (sample-actor) Make content type matching lenient with regard to ct parameters such as charset 2025-05-06 12:48:09 +02:00
Viktor Lofgren
a266fcbf30 (sample-actor) Clean up debris from previous runs to avoid errors on re-runs 2025-05-05 13:16:37 +02:00
Viktor Lofgren
6e47e58e0e (sample-actor) Add progress tracking to sample export actor 2025-05-05 13:04:14 +02:00
Viktor Lofgren
9dc43d8b4a (sample-actor) Update the actor export sample actor to not generate empty files when the filter is not applicable. 2025-05-05 12:56:12 +02:00
Viktor Lofgren
83967e3305 (sample-actor) Update the actor export sample actor to not generate empty files when the filter is not applicable. 2025-05-05 12:50:21 +02:00
Viktor Lofgren
4db980a291 (jooby-service) Set an upper limit on the number of worker threads 2025-05-05 12:40:31 +02:00
Viktor Lofgren
089b177868 (deploy) Executor partition 4. 2025-05-05 12:21:27 +02:00
Viktor Lofgren
9c8e9a68d5 (deploy) Executor partition 4. 2025-05-05 12:00:05 +02:00
Viktor Lofgren
413d5cc788 (url, minor) Fix typo in test 2025-05-04 16:28:30 +02:00
Viktor Lofgren
58539b92ac (search) Don't show addresses with URLencoding in the UI 2025-05-04 16:26:39 +02:00
Viktor Lofgren
fe72f16df1 (url) Add additional tests for parameter handling 2025-05-04 16:23:39 +02:00
11 changed files with 94 additions and 50 deletions

View File

@@ -121,6 +121,28 @@ public class EdgeUrl implements Serializable {
return sb.toString(); return sb.toString();
} }
/**
 * Assembles a display form of this URL by concatenating, in order: the
 * protocol, {@code "://"}, the domain, {@code ":port"} when a port is set,
 * the path, and {@code "?param"} when a parameter string is set.
 * <p>
 * The components are appended exactly as stored; this method applies no
 * escaping or encoding of its own.
 *
 * @return the assembled URL string
 */
public String toDisplayString() {
    final StringBuilder display = new StringBuilder(256)
            .append(proto)
            .append("://")
            .append(domain);

    // The port is optional; emit the ':' separator only when one is present
    if (null != port) {
        display.append(':').append(port);
    }

    display.append(path);

    // Likewise, the '?' separator is only emitted alongside a parameter string
    if (null != param) {
        display.append('?').append(param);
    }

    return display.toString();
}
public String dir() { public String dir() {
return path.replaceAll("/[^/]+$", "/"); return path.replaceAll("/[^/]+$", "/");
} }

View File

@@ -59,18 +59,31 @@ class EdgeUrlTest {
Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").toString()); Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").toString());
Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", new EdgeUrl("https://en.wikipedia.org/wiki/Sámi").toString()); Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", new EdgeUrl("https://en.wikipedia.org/wiki/Sámi").toString());
Assertions.assertEquals("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k", new EdgeUrl("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k").toString());
Assertions.assertEquals("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k", new EdgeUrl("https://en.wikipedia.org/wiki/Sámi").toString());
} }
@Test @Test
void testParms() throws URISyntaxException { void testParms() throws URISyntaxException {
Assertions.assertEquals("id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").param); Assertions.assertEquals("id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").param);
Assertions.assertEquals("https://search.marginalia.nu/?id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").toString());
Assertions.assertEquals("t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").param); Assertions.assertEquals("t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").param);
Assertions.assertEquals("https://search.marginalia.nu/?t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").toString());
Assertions.assertEquals("v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").param); Assertions.assertEquals("v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").param);
Assertions.assertEquals("https://search.marginalia.nu/?v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").toString());
Assertions.assertEquals("id=1", new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").param); Assertions.assertEquals("id=1", new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").param);
Assertions.assertEquals("https://memex.marginalia.nu/showthread.php?id=1",
new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").param); Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").param);
Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").toString());
Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").param); Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").param);
Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").toString());
Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?m=123").param); Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?m=123").param);
Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?follow=123").param); Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?follow=123").param);
} }

View File

@@ -122,6 +122,11 @@ public class JoobyService {
// single digit percentage difference since HTML already compresses very well with level = 1. // single digit percentage difference since HTML already compresses very well with level = 1.
options.setCompressionLevel(1); options.setCompressionLevel(1);
// Set a cap on the number of worker threads, as Jooby's default value does not seem to consider
// multi-tenant servers with high thread counts, and spins up an exorbitant number of threads in that
// scenario
options.setWorkerThreads(Math.min(128, options.getWorkerThreads()));
jooby.setServerOptions(options); jooby.setServerOptions(options);

View File

@@ -448,13 +448,7 @@ public class CrawlerMain extends ProcessMainClass {
// We don't have a lock, so we can't run this task // We don't have a lock, so we can't run this task
// we return to avoid blocking the pool for too long // we return to avoid blocking the pool for too long
if (lock.isEmpty()) { if (lock.isEmpty()) {
if (retryQueue.remainingCapacity() > 0) { pendingCrawlTasks.remove(domain);
// Sleep a moment to avoid busy looping via the retry queue
// in the case when few tasks remain and almost all are ineligible for
// immediate restart
Thread.sleep(5);
}
retryQueue.put(this); retryQueue.put(this);
return; return;
} }

View File

@@ -74,7 +74,7 @@ public class CrawlerRevisitor {
// If the reference document is empty or the HTTP status is not 200, we'll skip it since it's // If the reference document is empty or the HTTP status is neither 200 nor 206, we'll skip it since it's
// unlikely to produce anything meaningful for us. // unlikely to produce anything meaningful for us.
if (doc.httpStatus != 200) if (doc.httpStatus != 200 && doc.httpStatus != 206)
continue; continue;
if (!doc.hasBody()) if (!doc.hasBody())
continue; continue;

View File

@@ -58,7 +58,7 @@ public record DocumentWithReference(
if (null == doc) if (null == doc)
return ContentTags.empty(); return ContentTags.empty();
if (doc.documentBodyBytes.length == 0 || doc.httpStatus != 200) if (doc.documentBodyBytes.length == 0 || (doc.httpStatus != 200 && doc.httpStatus != 206))
return ContentTags.empty(); return ContentTags.empty();
String lastmod = doc.getLastModified(); String lastmod = doc.getLastModified();

View File

@@ -1,5 +1,7 @@
package nu.marginalia; package nu.marginalia;
import org.apache.commons.lang3.StringUtils;
import java.util.Set; import java.util.Set;
public class ContentTypes { public class ContentTypes {
@@ -11,9 +13,9 @@ public class ContentTypes {
"text/plain"); "text/plain");
public static boolean isAccepted(String contentTypeHeader) { public static boolean isAccepted(String contentTypeHeader) {
String lcHeader = contentTypeHeader.toLowerCase(); String lcHeader = StringUtils.substringBefore(contentTypeHeader.toLowerCase(), ';');
for (var type : acceptedContentTypes) { for (var type : acceptedContentTypes) {
if (lcHeader.startsWith(type)) { if (lcHeader.equals(type)) {
return true; return true;
} }
} }
@@ -21,7 +23,7 @@ public class ContentTypes {
} }
public static boolean isBinary(String contentTypeHeader) { public static boolean isBinary(String contentTypeHeader) {
String lcHeader = contentTypeHeader.toLowerCase(); String lcHeader = StringUtils.substringBefore(contentTypeHeader.toLowerCase(), ';');
return lcHeader.startsWith("application/pdf"); return lcHeader.startsWith("application/pdf");
} }

View File

@@ -277,7 +277,8 @@ public record SlopCrawlDataRecord(String domain,
try (var table = new SlopTable(path)) { try (var table = new SlopTable(path)) {
ShortColumn.Reader statusReader = statusColumn.open(table); ShortColumn.Reader statusReader = statusColumn.open(table);
while (statusReader.hasRemaining()) { while (statusReader.hasRemaining()) {
if (statusReader.get() == 200) { int status = statusReader.get();
if (status == 200 || status == 206) {
cnt++; cnt++;
} }
} }

View File

@@ -1,6 +1,7 @@
package nu.marginalia.extractor; package nu.marginalia.extractor;
import com.google.inject.Inject; import com.google.inject.Inject;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.process.log.WorkLog; import nu.marginalia.process.log.WorkLog;
import nu.marginalia.process.log.WorkLogEntry; import nu.marginalia.process.log.WorkLogEntry;
import nu.marginalia.slop.SlopCrawlDataRecord; import nu.marginalia.slop.SlopCrawlDataRecord;
@@ -20,17 +21,18 @@ import java.nio.file.StandardCopyOption;
import java.nio.file.StandardOpenOption; import java.nio.file.StandardOpenOption;
import java.nio.file.attribute.PosixFilePermissions; import java.nio.file.attribute.PosixFilePermissions;
import java.sql.SQLException; import java.sql.SQLException;
import java.util.ArrayList; import java.util.*;
import java.util.Collections;
import java.util.List;
public class SampleDataExporter { public class SampleDataExporter {
private final FileStorageService storageService; private final FileStorageService storageService;
private final ProcessHeartbeat processHeartbeat;
@Inject @Inject
public SampleDataExporter(FileStorageService storageService) { public SampleDataExporter(FileStorageService storageService, ProcessHeartbeat processHeartbeat) {
this.storageService = storageService; this.storageService = storageService;
this.processHeartbeat = processHeartbeat;
} }
public void export(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name) throws SQLException, IOException { public void export(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name) throws SQLException, IOException {
FileStorage destStorage = storageService.getStorage(destId); FileStorage destStorage = storageService.getStorage(destId);
Path inputDir = storageService.getStorage(crawlId).asPath(); Path inputDir = storageService.getStorage(crawlId).asPath();
@@ -59,12 +61,6 @@ public class SampleDataExporter {
Path newCrawlerLogFile = Files.createTempFile(destStorage.asPath(), "crawler", ".log", Path newCrawlerLogFile = Files.createTempFile(destStorage.asPath(), "crawler", ".log",
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
try (var bw = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
for (var item : entriesAll) {
bw.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
}
}
Path newManifestJsonFile = Files.createTempFile(destStorage.asPath(), "manifest", ".json", Path newManifestJsonFile = Files.createTempFile(destStorage.asPath(), "manifest", ".json",
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
Files.writeString(newManifestJsonFile, " { \"description\": \"" + name.replace("[\"\\]", "_") + "\",\n \"type\": \"CRAWL_DATA\" }\n"); Files.writeString(newManifestJsonFile, " { \"description\": \"" + name.replace("[\"\\]", "_") + "\",\n \"type\": \"CRAWL_DATA\" }\n");
@@ -72,24 +68,31 @@ public class SampleDataExporter {
var tmpTarFile = Files.createTempFile(destStorage.asPath(), "data", ".tar", var tmpTarFile = Files.createTempFile(destStorage.asPath(), "data", ".tar",
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))) { try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING));
for (var item : entriesAll) { var logWriter = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
var hb = processHeartbeat.createAdHocTaskHeartbeat("Generating Sample")
) {
for (var item : hb.wrap("Scanning", entriesAll)) {
Path crawlDataPath = inputDir.resolve(item.relPath()); Path crawlDataPath = inputDir.resolve(item.relPath());
if (!Files.exists(crawlDataPath)) continue; if (!Files.exists(crawlDataPath)) continue;
if (StringUtils.isBlank(ctFilter)) { if (StringUtils.isBlank(ctFilter)) {
addFileToTar(stream, crawlDataPath, item.relPath()); addFileToTar(stream, crawlDataPath, item.relPath());
logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
} }
else /* filter != null */ { else /* filter != null */ {
boolean didFilterData = false; Path filteredData = null;
try { try {
crawlDataPath = filterEntries(crawlDataPath, ctFilter); filteredData = filterEntries(crawlDataPath, ctFilter);
didFilterData = true; addFileToTar(stream, filteredData, item.relPath());
addFileToTar(stream, crawlDataPath, item.relPath()); logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
}
catch (NoSuchElementException ex) {
// Ignore
} }
finally { finally {
if (didFilterData) { if (filteredData != null) {
Files.deleteIfExists(crawlDataPath); Files.deleteIfExists(filteredData);
} }
} }
} }
@@ -106,34 +109,36 @@ public class SampleDataExporter {
Files.move(tmpTarFile, destStorage.asPath().resolve("crawl-data.tar"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING); Files.move(tmpTarFile, destStorage.asPath().resolve("crawl-data.tar"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
} }
/** Filters the entries in the crawl data file based on the content type. /** Filters the entries in the crawl data file based on the content type. */
* @param crawlDataPath The path to the crawl data file. private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException, NoSuchElementException {
* @param contentTypeFilter The content type to filter by.
* @return The path to the filtered crawl data file, or null if an error occurred.
*/
private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException {
Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered"); Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered");
Path tempFile = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered.slop.zip"); Path tempFile = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered.slop.zip");
// We may have debris from a previous run, so let's clean it up
if (Files.isDirectory(tempDir)) {
FileUtils.deleteDirectory(tempDir.toFile());
}
Files.createDirectory(tempDir); Files.createDirectory(tempDir);
try (var writer = new SlopCrawlDataRecord.Writer(tempDir); try (var writer = new SlopCrawlDataRecord.Writer(tempDir);
var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) { var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) {
@Override @Override
public boolean filter(String url, int status, String contentType) { public boolean filter(String url, int status, String contentType) {
if (contentTypeFilter.equals(contentType)) return Objects.equals(StringUtils.substringBefore(contentType, ';'), contentTypeFilter)
return true; || contentType.startsWith("x-marginalia/"); // metadata records
else if (contentType.startsWith("x-marginalia/"))
// This is a metadata entry, typically domain or redirect information
// let's keep those to not confuse the consumer of the data, which might
// expect at least the domain summary
return true;
return false;
} }
} }
) { ) {
boolean wroteEntry = false;
while (reader.hasRemaining()) { while (reader.hasRemaining()) {
writer.write(reader.get()); var entry = reader.get();
writer.write(entry);
wroteEntry = wroteEntry || Objects.equals(StringUtils.substringBefore(entry.contentType(), ';'), contentTypeFilter);
}
if (!wroteEntry) {
throw new NoSuchElementException("No relevant entries");
} }
SlopTablePacker.packToSlopZip(tempDir, tempFile); SlopTablePacker.packToSlopZip(tempDir, tempFile);

View File

@@ -180,7 +180,7 @@ public class UrlDetails implements Comparable<UrlDetails> {
* semantically meaningful codepoints into entity codes */ * semantically meaningful codepoints into entity codes */
public String displayUrl() { public String displayUrl() {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
String urlStr = url.toString(); String urlStr = url.toDisplayString();
for (int i = 0; i < urlStr.length(); i++) { for (int i = 0; i < urlStr.length(); i++) {
char c = urlStr.charAt(i); char c = urlStr.charAt(i);

View File

@@ -4,4 +4,6 @@
2025-01-07: Deploy executor. 2025-01-07: Deploy executor.
2025-04-24: Deploy executor. 2025-04-24: Deploy executor.
2025-04-24: Deploy assistant. 2025-04-24: Deploy assistant.
2025-04-24: Deploy qs, search and api-services. 2025-05-04: Deploy qs, search and api-services.
2025-05-05: Deploy executor partition 4.
2025-05-05: Deploy control.