1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

22 Commits

Author SHA1 Message Date
Viktor Lofgren
b513809710 (*) Stopgap fix for metrics server initialization errors bringing down services 2025-02-14 17:09:48 +01:00
Viktor Lofgren
7519b28e21 (search) Correct exception from misbehaving bots feeding invalid urls 2025-02-14 17:05:24 +01:00
Viktor Lofgren
3eac4dd57f (search) Correct exception in error handler when page is missing 2025-02-14 17:00:21 +01:00
Viktor Lofgren
4c2810720a (search) Add redirect handler for full URLs in the /site endpoint 2025-02-14 16:31:11 +01:00
Viktor Lofgren
8480ba8daa (live-capture) Code cleanup 2025-02-04 14:05:36 +01:00
Viktor Lofgren
fbba392491 (live-capture) Send a UA-string from the browserless fetcher as well
The change also introduces a somewhat convoluted wiremock test to intercept and verify that these headers are in fact sent
2025-02-04 13:36:49 +01:00
Viktor Lofgren
530eb35949 (update-rss) Do not fail the feed fetcher control actor if it takes a long time to complete. 2025-02-03 11:35:32 +01:00
Viktor Lofgren
c2dd2175a2 (search) Add new query expansion rule contracting WORD NUM pairs into WORD-NUM and WORDNUM 2025-02-01 13:13:30 +01:00
Viktor Lofgren
b8581b0f56 (crawler) Safe sanitization of headers during warc->slop conversion
The warc->slop converter was rejecting some items because they had headers that were representable in the Warc code's MessageHeader map implementation, but illegal in the HttpHeaders' implementation.

Fixing this by manually filtering these out.  Ostensibly the constructor has a filtering predicate, but this annoyingly runs too late and fails to prevent the problem.
2025-01-31 12:47:42 +01:00
Viktor Lofgren
2ea34767d8 (crawler) Use the response URL when resolving relative links
The crawler was incorrectly using the request URL as the base URL when resolving relative links.  This caused problems when encountering redirects.

 For example if we fetch /log, redirecting to  /log/ and find links to foo/, and bar/; these would resolve to /foo and /bar, and not /log/foo and /log/bar.
2025-01-31 12:40:13 +01:00
Viktor Lofgren
e9af838231 (actor) Fix migration actor final steps 2025-01-30 11:48:21 +01:00
Viktor Lofgren
ae0cad47c4 (actor) Utility method for getting a json prototype for actor states
If we can hook this into the control gui somehow, it'll make for a nice QOL upgrade when manually interacting with the actors.
2025-01-29 15:20:25 +01:00
Viktor Lofgren
5fbc8ef998 (misc) Tidying 2025-01-29 15:17:04 +01:00
Viktor Lofgren
32c6dd9e6a (actor) Delete old data in the migration actor 2025-01-29 14:51:46 +01:00
Viktor Lofgren
6ece6a6cfb (actor) Improve resilience for the migration actor 2025-01-29 14:43:09 +01:00
Viktor Lofgren
39cd1c18f8 Automatically run npm install tailwindcss@3 via setup.sh, as the new default version of the package is incompatible with the project 2025-01-29 12:21:08 +01:00
Viktor
eb65daaa88 Merge pull request #151 from Lionstiger/master
fix small grammar error in footerLegal.jte
2025-01-28 21:49:50 +01:00
Viktor
0bebdb6e33 Merge branch 'master' into master 2025-01-28 21:49:36 +01:00
Viktor Lofgren
1e50e392c6 (actor) Improve logging and error handling for data migration actor 2025-01-28 15:34:36 +01:00
Viktor Lofgren
fb673de370 (crawler) Change the header 'User-agent' to 'User-Agent' 2025-01-28 15:34:16 +01:00
Viktor Lofgren
eee73ab16c (crawler) Be more lenient when performing a domain probe 2025-01-28 15:24:30 +01:00
Magnus Wulf
72384ad6ca fix small grammar error 2025-01-27 15:04:57 +01:00
27 changed files with 295 additions and 140 deletions

View File

@@ -24,58 +24,4 @@ public class LanguageModels {
this.fasttextLanguageModel = fasttextLanguageModel; this.fasttextLanguageModel = fasttextLanguageModel;
this.segments = segments; this.segments = segments;
} }
public static LanguageModelsBuilder builder() {
return new LanguageModelsBuilder();
}
public static class LanguageModelsBuilder {
private Path termFrequencies;
private Path openNLPSentenceDetectionData;
private Path posRules;
private Path posDict;
private Path fasttextLanguageModel;
private Path segments;
LanguageModelsBuilder() {
}
public LanguageModelsBuilder termFrequencies(Path termFrequencies) {
this.termFrequencies = termFrequencies;
return this;
}
public LanguageModelsBuilder openNLPSentenceDetectionData(Path openNLPSentenceDetectionData) {
this.openNLPSentenceDetectionData = openNLPSentenceDetectionData;
return this;
}
public LanguageModelsBuilder posRules(Path posRules) {
this.posRules = posRules;
return this;
}
public LanguageModelsBuilder posDict(Path posDict) {
this.posDict = posDict;
return this;
}
public LanguageModelsBuilder fasttextLanguageModel(Path fasttextLanguageModel) {
this.fasttextLanguageModel = fasttextLanguageModel;
return this;
}
public LanguageModelsBuilder segments(Path segments) {
this.segments = segments;
return this;
}
public LanguageModels build() {
return new LanguageModels(this.termFrequencies, this.openNLPSentenceDetectionData, this.posRules, this.posDict, this.fasttextLanguageModel, this.segments);
}
public String toString() {
return "LanguageModels.LanguageModelsBuilder(termFrequencies=" + this.termFrequencies + ", openNLPSentenceDetectionData=" + this.openNLPSentenceDetectionData + ", posRules=" + this.posRules + ", posDict=" + this.posDict + ", fasttextLanguageModel=" + this.fasttextLanguageModel + ", segments=" + this.segments + ")";
}
}
} }

View File

@@ -10,7 +10,9 @@ import java.nio.charset.StandardCharsets;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.time.LocalDateTime; import java.time.LocalDateTime;
import java.util.*; import java.util.HashSet;
import java.util.Optional;
import java.util.Set;
import java.util.function.Function; import java.util.function.Function;
/** WorkLog is a journal of work done by a process, /** WorkLog is a journal of work done by a process,
@@ -61,6 +63,12 @@ public class WorkLog implements AutoCloseable, Closeable {
return new WorkLoadIterable<>(logFile, mapper); return new WorkLoadIterable<>(logFile, mapper);
} }
public static int countEntries(Path crawlerLog) throws IOException{
try (var linesStream = Files.lines(crawlerLog)) {
return (int) linesStream.filter(WorkLogEntry::isJobId).count();
}
}
// Use synchro over concurrent set to avoid competing writes // Use synchro over concurrent set to avoid competing writes
// - correct is better than fast here, it's sketchy enough to use // - correct is better than fast here, it's sketchy enough to use
// a PrintWriter // a PrintWriter

View File

@@ -6,6 +6,7 @@ import nu.marginalia.service.ServiceId;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.InetAddress; import java.net.InetAddress;
import java.net.NetworkInterface; import java.net.NetworkInterface;
import java.util.Enumeration; import java.util.Enumeration;
@@ -115,7 +116,7 @@ public class ServiceConfigurationModule extends AbstractModule {
} }
} }
public static String getLocalNetworkIP() throws Exception { public static String getLocalNetworkIP() throws IOException {
Enumeration<NetworkInterface> nets = NetworkInterface.getNetworkInterfaces(); Enumeration<NetworkInterface> nets = NetworkInterface.getNetworkInterfaces();
while (nets.hasMoreElements()) { while (nets.hasMoreElements()) {

View File

@@ -6,25 +6,34 @@ import nu.marginalia.service.module.ServiceConfiguration;
import org.eclipse.jetty.server.Server; import org.eclipse.jetty.server.Server;
import org.eclipse.jetty.servlet.ServletContextHandler; import org.eclipse.jetty.servlet.ServletContextHandler;
import org.eclipse.jetty.servlet.ServletHolder; import org.eclipse.jetty.servlet.ServletHolder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.InetSocketAddress; import java.net.InetSocketAddress;
public class MetricsServer { public class MetricsServer {
private static Logger logger = LoggerFactory.getLogger(MetricsServer.class);
@Inject @Inject
public MetricsServer(ServiceConfiguration configuration) throws Exception { public MetricsServer(ServiceConfiguration configuration) {
// If less than zero, we forego setting up a metrics server // If less than zero, we forego setting up a metrics server
if (configuration.metricsPort() < 0) if (configuration.metricsPort() < 0)
return; return;
Server server = new Server(new InetSocketAddress(configuration.bindAddress(), configuration.metricsPort())); try {
Server server = new Server(new InetSocketAddress(configuration.bindAddress(), configuration.metricsPort()));
ServletContextHandler context = new ServletContextHandler(); ServletContextHandler context = new ServletContextHandler();
context.setContextPath("/"); context.setContextPath("/");
server.setHandler(context); server.setHandler(context);
context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics"); context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");
server.start(); server.start();
}
catch (Exception|NoSuchMethodError ex) {
logger.error("Failed to set up metrics server", ex);
}
} }
} }

View File

@@ -14,6 +14,8 @@ import nu.marginalia.mq.persistence.MqPersistence;
import nu.marginalia.nodecfg.NodeConfigurationService; import nu.marginalia.nodecfg.NodeConfigurationService;
import nu.marginalia.nodecfg.model.NodeProfile; import nu.marginalia.nodecfg.model.NodeProfile;
import nu.marginalia.service.module.ServiceConfiguration; import nu.marginalia.service.module.ServiceConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.time.Duration; import java.time.Duration;
import java.time.LocalDateTime; import java.time.LocalDateTime;
@@ -29,6 +31,7 @@ public class UpdateRssActor extends RecordActorPrototype {
private final NodeConfigurationService nodeConfigurationService; private final NodeConfigurationService nodeConfigurationService;
private final MqPersistence persistence; private final MqPersistence persistence;
private static final Logger logger = LoggerFactory.getLogger(UpdateRssActor.class);
@Inject @Inject
public UpdateRssActor(Gson gson, public UpdateRssActor(Gson gson,
@@ -101,8 +104,8 @@ public class UpdateRssActor extends RecordActorPrototype {
case UpdateRefresh(int count, long msgId) -> { case UpdateRefresh(int count, long msgId) -> {
MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12)); MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
if (msg == null) { if (msg == null) {
// Retry the update logger.warn("UpdateRefresh is taking a very long time");
yield new Error("Failed to update feeds: message not found"); yield new UpdateRefresh(count, msgId);
} else if (msg.state() != MqMessageState.OK) { } else if (msg.state() != MqMessageState.OK) {
// Retry the update // Retry the update
yield new Error("Failed to update feeds: " + msg.state()); yield new Error("Failed to update feeds: " + msg.state());
@@ -119,8 +122,8 @@ public class UpdateRssActor extends RecordActorPrototype {
case UpdateClean(long msgId) -> { case UpdateClean(long msgId) -> {
MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12)); MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
if (msg == null) { if (msg == null) {
// Retry the update logger.warn("UpdateClean is taking a very long time");
yield new Error("Failed to update feeds: message not found"); yield new UpdateClean(msgId);
} else if (msg.state() != MqMessageState.OK) { } else if (msg.state() != MqMessageState.OK) {
// Retry the update // Retry the update
yield new Error("Failed to update feeds: " + msg.state()); yield new Error("Failed to update feeds: " + msg.state());

View File

@@ -8,6 +8,7 @@ import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.io.CrawlerOutputFile; import nu.marginalia.io.CrawlerOutputFile;
import nu.marginalia.process.log.WorkLog; import nu.marginalia.process.log.WorkLog;
import nu.marginalia.process.log.WorkLogEntry; import nu.marginalia.process.log.WorkLogEntry;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.slop.SlopCrawlDataRecord; import nu.marginalia.slop.SlopCrawlDataRecord;
import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage; import nu.marginalia.storage.model.FileStorage;
@@ -18,6 +19,7 @@ import org.slf4j.LoggerFactory;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.Map; import java.util.Map;
import java.util.Optional; import java.util.Optional;
import java.util.function.Function; import java.util.function.Function;
@@ -26,14 +28,15 @@ import java.util.function.Function;
public class MigrateCrawlDataActor extends RecordActorPrototype { public class MigrateCrawlDataActor extends RecordActorPrototype {
private final FileStorageService fileStorageService; private final FileStorageService fileStorageService;
private final ServiceHeartbeat serviceHeartbeat;
private static final Logger logger = LoggerFactory.getLogger(MigrateCrawlDataActor.class); private static final Logger logger = LoggerFactory.getLogger(MigrateCrawlDataActor.class);
@Inject @Inject
public MigrateCrawlDataActor(Gson gson, FileStorageService fileStorageService) { public MigrateCrawlDataActor(Gson gson, FileStorageService fileStorageService, ServiceHeartbeat serviceHeartbeat) {
super(gson); super(gson);
this.fileStorageService = fileStorageService; this.fileStorageService = fileStorageService;
this.serviceHeartbeat = serviceHeartbeat;
} }
public record Run(long fileStorageId) implements ActorStep {} public record Run(long fileStorageId) implements ActorStep {}
@@ -49,33 +52,50 @@ public class MigrateCrawlDataActor extends RecordActorPrototype {
Path crawlerLog = root.resolve("crawler.log"); Path crawlerLog = root.resolve("crawler.log");
Path newCrawlerLog = Files.createTempFile(root, "crawler", ".migrate.log"); Path newCrawlerLog = Files.createTempFile(root, "crawler", ".migrate.log");
try (WorkLog workLog = new WorkLog(newCrawlerLog)) { int totalEntries = WorkLog.countEntries(crawlerLog);
try (WorkLog workLog = new WorkLog(newCrawlerLog);
var heartbeat = serviceHeartbeat.createServiceAdHocTaskHeartbeat("Migrating")
) {
int entryIdx = 0;
for (Map.Entry<WorkLogEntry, Path> item : WorkLog.iterableMap(crawlerLog, new CrawlDataLocator(root))) { for (Map.Entry<WorkLogEntry, Path> item : WorkLog.iterableMap(crawlerLog, new CrawlDataLocator(root))) {
var entry = item.getKey(); final WorkLogEntry entry = item.getKey();
var path = item.getValue(); final Path inputPath = item.getValue();
logger.info("Converting {}", entry.id()); Path outputPath = inputPath;
heartbeat.progress("Migrating" + inputPath.getFileName(), entryIdx++, totalEntries);
if (inputPath.toString().endsWith(".parquet")) {
if (path.toFile().getName().endsWith(".parquet")) {
String domain = entry.id(); String domain = entry.id();
String id = Integer.toHexString(domain.hashCode()); String id = Integer.toHexString(domain.hashCode());
Path outputFile = CrawlerOutputFile.createSlopPath(root, id, domain); outputPath = CrawlerOutputFile.createSlopPath(root, id, domain);
SlopCrawlDataRecord.convertFromParquet(path, outputFile); if (Files.exists(inputPath)) {
try {
SlopCrawlDataRecord.convertFromParquet(inputPath, outputPath);
Files.deleteIfExists(inputPath);
} catch (Exception ex) {
outputPath = inputPath; // don't update the work log on error
logger.error("Failed to convert " + inputPath, ex);
}
}
else if (!Files.exists(inputPath) && !Files.exists(outputPath)) {
// if the input file is missing, and the output file is missing, we just write the log
// record identical to the old one
outputPath = inputPath;
}
}
workLog.setJobToFinished(entry.id(), outputFile.toString(), entry.cnt()); // Write a log entry for the (possibly) converted file
} workLog.setJobToFinished(entry.id(), outputPath.toString(), entry.cnt());
else {
workLog.setJobToFinished(entry.id(), path.toString(), entry.cnt());
}
} }
} }
Path oldCrawlerLog = Files.createTempFile(root, "crawler-", ".migrate.old.log"); Path oldCrawlerLog = Files.createTempFile(root, "crawler-", ".migrate.old.log");
Files.move(crawlerLog, oldCrawlerLog); Files.move(crawlerLog, oldCrawlerLog, StandardCopyOption.REPLACE_EXISTING);
Files.move(newCrawlerLog, crawlerLog); Files.move(newCrawlerLog, crawlerLog);
yield new End(); yield new End();

View File

@@ -34,6 +34,7 @@ dependencies {
implementation libs.bundles.slf4j implementation libs.bundles.slf4j
implementation libs.commons.lang3 implementation libs.commons.lang3
implementation libs.commons.io implementation libs.commons.io
implementation libs.wiremock
implementation libs.prometheus implementation libs.prometheus
implementation libs.guava implementation libs.guava

View File

@@ -1,6 +1,7 @@
package nu.marginalia.livecapture; package nu.marginalia.livecapture;
import com.google.gson.Gson; import com.google.gson.Gson;
import nu.marginalia.WmsaHome;
import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.model.gson.GsonFactory;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@@ -12,6 +13,7 @@ import java.net.http.HttpRequest;
import java.net.http.HttpResponse; import java.net.http.HttpResponse;
import java.time.Duration; import java.time.Duration;
import java.util.Map; import java.util.Map;
import java.util.Optional;
/** Client for local browserless.io API */ /** Client for local browserless.io API */
public class BrowserlessClient implements AutoCloseable { public class BrowserlessClient implements AutoCloseable {
@@ -27,13 +29,16 @@ public class BrowserlessClient implements AutoCloseable {
private final URI browserlessURI; private final URI browserlessURI;
private final Gson gson = GsonFactory.get(); private final Gson gson = GsonFactory.get();
private final String userAgent = WmsaHome.getUserAgent().uaString();
public BrowserlessClient(URI browserlessURI) { public BrowserlessClient(URI browserlessURI) {
this.browserlessURI = browserlessURI; this.browserlessURI = browserlessURI;
} }
public String content(String url, GotoOptions gotoOptions) throws IOException, InterruptedException { public Optional<String> content(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
Map<String, Object> requestData = Map.of( Map<String, Object> requestData = Map.of(
"url", url, "url", url,
"userAgent", userAgent,
"gotoOptions", gotoOptions "gotoOptions", gotoOptions
); );
@@ -49,10 +54,10 @@ public class BrowserlessClient implements AutoCloseable {
if (rsp.statusCode() >= 300) { if (rsp.statusCode() >= 300) {
logger.info("Failed to fetch content for {}, status {}", url, rsp.statusCode()); logger.info("Failed to fetch content for {}, status {}", url, rsp.statusCode());
return null; return Optional.empty();
} }
return rsp.body(); return Optional.of(rsp.body());
} }
public byte[] screenshot(String url, GotoOptions gotoOptions, ScreenshotOptions screenshotOptions) public byte[] screenshot(String url, GotoOptions gotoOptions, ScreenshotOptions screenshotOptions)
@@ -60,6 +65,7 @@ public class BrowserlessClient implements AutoCloseable {
Map<String, Object> requestData = Map.of( Map<String, Object> requestData = Map.of(
"url", url, "url", url,
"userAgent", userAgent,
"options", screenshotOptions, "options", screenshotOptions,
"gotoOptions", gotoOptions "gotoOptions", gotoOptions
); );
@@ -84,7 +90,7 @@ public class BrowserlessClient implements AutoCloseable {
} }
@Override @Override
public void close() throws Exception { public void close() {
httpClient.shutdownNow(); httpClient.shutdownNow();
} }

View File

@@ -1,5 +1,9 @@
package nu.marginalia.livecapture; package nu.marginalia.livecapture;
import com.github.tomakehurst.wiremock.WireMockServer;
import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
import nu.marginalia.WmsaHome;
import nu.marginalia.service.module.ServiceConfigurationModule;
import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Tag;
@@ -8,34 +12,86 @@ import org.testcontainers.containers.GenericContainer;
import org.testcontainers.junit.jupiter.Testcontainers; import org.testcontainers.junit.jupiter.Testcontainers;
import org.testcontainers.utility.DockerImageName; import org.testcontainers.utility.DockerImageName;
import java.io.IOException;
import java.net.URI; import java.net.URI;
import java.util.Map; import java.util.Map;
import static com.github.tomakehurst.wiremock.client.WireMock.*;
@Testcontainers @Testcontainers
@Tag("slow") @Tag("slow")
public class BrowserlessClientTest { public class BrowserlessClientTest {
static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome")) static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome"))
.withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN")) .withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
.withNetworkMode("bridge")
.withExposedPorts(3000); .withExposedPorts(3000);
static WireMockServer wireMockServer =
new WireMockServer(WireMockConfiguration.wireMockConfig()
.port(18089));
static String localIp;
static URI browserlessURI;
@BeforeAll @BeforeAll
public static void setup() { public static void setup() throws IOException {
container.start(); container.start();
browserlessURI = URI.create(String.format("http://%s:%d/",
container.getHost(),
container.getMappedPort(3000))
);
wireMockServer.start();
wireMockServer.stubFor(get("/").willReturn(aResponse().withStatus(200).withBody("Ok")));
localIp = ServiceConfigurationModule.getLocalNetworkIP();
}
@Tag("flaky")
@Test
public void testInspectContentUA__Flaky() throws Exception {
try (var client = new BrowserlessClient(browserlessURI)) {
client.content("http://" + localIp + ":18089/",
BrowserlessClient.GotoOptions.defaultValues()
);
}
wireMockServer.verify(getRequestedFor(urlEqualTo("/")).withHeader("User-Agent", equalTo(WmsaHome.getUserAgent().uaString())));
}
@Tag("flaky")
@Test
public void testInspectScreenshotUA__Flaky() throws Exception {
try (var client = new BrowserlessClient(browserlessURI)) {
client.screenshot("http://" + localIp + ":18089/",
BrowserlessClient.GotoOptions.defaultValues(),
BrowserlessClient.ScreenshotOptions.defaultValues()
);
}
wireMockServer.verify(getRequestedFor(urlEqualTo("/")).withHeader("User-Agent", equalTo(WmsaHome.getUserAgent().uaString())));
} }
@Test @Test
public void testContent() throws Exception { public void testContent() throws Exception {
try (var client = new BrowserlessClient(URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000)))) { try (var client = new BrowserlessClient(browserlessURI)) {
var content = client.content("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues()); var content = client.content("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues()).orElseThrow();
Assertions.assertNotNull(content, "Content should not be null");
Assertions.assertFalse(content.isBlank(), "Content should not be empty"); Assertions.assertFalse(content.isBlank(), "Content should not be empty");
} }
} }
@Test @Test
public void testScreenshot() throws Exception { public void testScreenshot() throws Exception {
try (var client = new BrowserlessClient(URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000)))) { try (var client = new BrowserlessClient(browserlessURI)) {
var screenshot = client.screenshot("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues(), BrowserlessClient.ScreenshotOptions.defaultValues()); var screenshot = client.screenshot("https://www.marginalia.nu/",
BrowserlessClient.GotoOptions.defaultValues(),
BrowserlessClient.ScreenshotOptions.defaultValues());
Assertions.assertNotNull(screenshot, "Screenshot should not be null"); Assertions.assertNotNull(screenshot, "Screenshot should not be null");
} }
} }

View File

@@ -134,6 +134,10 @@ public class QueryExpansion {
if (scoreCombo > scoreA + scoreB || scoreCombo > 1000) { if (scoreCombo > scoreA + scoreB || scoreCombo > 1000) {
graph.addVariantForSpan(prev, qw, joinedWord); graph.addVariantForSpan(prev, qw, joinedWord);
} }
else if (StringUtils.isAlpha(prev.word()) && StringUtils.isNumeric(qw.word())) { // join e.g. trs 80 to trs80 and trs-80
graph.addVariantForSpan(prev, qw, prev.word() + qw.word());
graph.addVariantForSpan(prev, qw, prev.word() + "-" + qw.word());
}
} }
prev = qw; prev = qw;

View File

@@ -213,6 +213,18 @@ public class QueryFactoryTest {
System.out.println(subquery); System.out.println(subquery);
} }
@Test
public void testContractionWordNum() {
var subquery = parseAndGetSpecs("glove 80");
Assertions.assertTrue(subquery.query.compiledQuery.contains(" glove "));
Assertions.assertTrue(subquery.query.compiledQuery.contains(" 80 "));
Assertions.assertTrue(subquery.query.compiledQuery.contains(" glove-80 "));
Assertions.assertTrue(subquery.query.compiledQuery.contains(" glove80 "));
}
@Test @Test
public void testCplusPlus() { public void testCplusPlus() {
var subquery = parseAndGetSpecs("std::vector::push_back vector"); var subquery = parseAndGetSpecs("std::vector::push_back vector");

View File

@@ -5,9 +5,7 @@ import nu.marginalia.actor.state.*;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.util.ArrayList; import java.util.*;
import java.util.Arrays;
import java.util.List;
public abstract class RecordActorPrototype implements ActorPrototype { public abstract class RecordActorPrototype implements ActorPrototype {
@@ -118,7 +116,7 @@ public abstract class RecordActorPrototype implements ActorPrototype {
} }
private String functionName(Class<? extends ActorStep> functionClass) { private String functionName(Class<? extends ActorStep> functionClass) {
return functionClass.getSimpleName().toUpperCase(); return ActorStep.functionName(functionClass);
} }
private ActorStep constructState(String message) throws ReflectiveOperationException { private ActorStep constructState(String message) throws ReflectiveOperationException {
@@ -145,4 +143,43 @@ public abstract class RecordActorPrototype implements ActorPrototype {
} }
} }
/** Get a list of JSON prototypes for each actor step declared by this actor */
@SuppressWarnings("unchecked")
public Map<String, String> getMessagePrototypes() {
Map<String, String> messagePrototypes = new HashMap<>();
for (var clazz : getClass().getDeclaredClasses()) {
if (!clazz.isRecord() || !ActorStep.class.isAssignableFrom(clazz))
continue;
StringJoiner sj = new StringJoiner(",\n\t", "{\n\t", "\n}");
renderToJsonPrototype(sj, (Class<? extends Record>) clazz);
messagePrototypes.put(ActorStep.functionName((Class<? extends ActorStep>) clazz), sj.toString());
}
return messagePrototypes;
}
@SuppressWarnings("unchecked")
private void renderToJsonPrototype(StringJoiner sj, Class<? extends Record> recordType) {
for (var field : recordType.getDeclaredFields()) {
String typeName = field.getType().getSimpleName();
if ("List".equals(typeName)) {
sj.add(String.format("\"%s\": [ ]", field.getName()));
}
else if (field.getType().isRecord()) {
var innerSj = new StringJoiner(",", "{", "}");
renderToJsonPrototype(innerSj, (Class<? extends Record>) field.getType());
sj.add(String.format("\"%s\": %s", field.getName(), sj));
}
else {
sj.add(String.format("\"%s\": \"%s\"", field.getName(), typeName));
}
}
}
} }

View File

@@ -1,3 +1,7 @@
package nu.marginalia.actor.state; package nu.marginalia.actor.state;
public interface ActorStep {} public interface ActorStep {
static String functionName(Class<? extends ActorStep> type) {
return type.getSimpleName().toUpperCase();
}
}

View File

@@ -35,6 +35,7 @@ import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.sql.SQLException; import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.List; import java.util.List;
import java.util.Optional; import java.util.Optional;
@@ -202,13 +203,19 @@ public class ConverterMain extends ProcessMainClass {
heartbeat.setProgress(processedDomains.get() / (double) totalDomains); heartbeat.setProgress(processedDomains.get() / (double) totalDomains);
logger.info("Processing small items"); logger.info("Processing small items");
int numBigTasks = 0;
// We separate the large and small domains to reduce the number of critical sections,
// as the large domains have a separate processing track that doesn't store everything
// in memory
final List<Path> bigTasks = new ArrayList<>();
// First process the small items // First process the small items
for (var dataPath : WorkLog.iterableMap(crawlDir.getLogFile(), for (var dataPath : WorkLog.iterableMap(crawlDir.getLogFile(),
new CrawlDataLocator(crawlDir.getDir(), batchingWorkLog))) new CrawlDataLocator(crawlDir.getDir(), batchingWorkLog)))
{ {
if (SerializableCrawlDataStream.getSizeHint(dataPath) >= SIDELOAD_THRESHOLD) { if (SerializableCrawlDataStream.getSizeHint(dataPath) >= SIDELOAD_THRESHOLD) {
numBigTasks ++; bigTasks.add(dataPath);
continue; continue;
} }
@@ -239,15 +246,8 @@ public class ConverterMain extends ProcessMainClass {
try (var hb = heartbeat.createAdHocTaskHeartbeat("Large Domains")) { try (var hb = heartbeat.createAdHocTaskHeartbeat("Large Domains")) {
int bigTaskIdx = 0; int bigTaskIdx = 0;
// Next the big items domain-by-domain // Next the big items domain-by-domain
for (var dataPath : WorkLog.iterableMap(crawlDir.getLogFile(), for (var dataPath : bigTasks) {
new CrawlDataLocator(crawlDir.getDir(), batchingWorkLog))) hb.progress(dataPath.toFile().getName(), bigTaskIdx++, bigTasks.size());
{
int sizeHint = SerializableCrawlDataStream.getSizeHint(dataPath);
if (sizeHint < SIDELOAD_THRESHOLD) {
continue;
}
hb.progress(dataPath.toFile().getName(), bigTaskIdx++, numBigTasks);
try { try {
// SerializableCrawlDataStream is autocloseable, we can't try-with-resources because then it will be // SerializableCrawlDataStream is autocloseable, we can't try-with-resources because then it will be
@@ -255,7 +255,7 @@ public class ConverterMain extends ProcessMainClass {
// will close it after it's consumed. // will close it after it's consumed.
var stream = SerializableCrawlDataStream.openDataStream(dataPath); var stream = SerializableCrawlDataStream.openDataStream(dataPath);
ConverterBatchWritableIf writable = processor.simpleProcessing(stream, sizeHint); ConverterBatchWritableIf writable = processor.simpleProcessing(stream, SerializableCrawlDataStream.getSizeHint(dataPath));
converterWriter.accept(writable); converterWriter.accept(writable);
} }

View File

@@ -116,7 +116,7 @@ public class AdblockSimulator {
// Refrain from cleaning up this code, it's very hot code and needs to be fast. // Refrain from cleaning up this code, it's very hot code and needs to be fast.
// This version is about 100x faster than the a "clean" first stab implementation. // This version is about 100x faster than a "clean" first stab implementation.
class RuleVisitor implements NodeFilter { class RuleVisitor implements NodeFilter {
public boolean sawAds; public boolean sawAds;

View File

@@ -23,7 +23,7 @@ public class DocumentGeneratorExtractor {
var tags = doc.select("meta[name=generator]"); var tags = doc.select("meta[name=generator]");
if (tags.size() == 0) { if (tags.isEmpty()) {
// Some sites have a comment in the head instead of a meta tag // Some sites have a comment in the head instead of a meta tag
return fingerprintServerTech(doc, responseHeaders); return fingerprintServerTech(doc, responseHeaders);
} }

View File

@@ -127,7 +127,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
} }
fullHtml.append("</div></body></html>"); fullHtml.append("</div></body></html>");
var doc = sideloaderProcessing return sideloaderProcessing
.processDocument(fullUrl, .processDocument(fullUrl,
fullHtml.toString(), fullHtml.toString(),
List.of("encyclopedia", "wiki"), List.of("encyclopedia", "wiki"),
@@ -137,8 +137,6 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
anchorTextKeywords.getAnchorTextKeywords(domainLinks, new EdgeUrl(fullUrl)), anchorTextKeywords.getAnchorTextKeywords(domainLinks, new EdgeUrl(fullUrl)),
LocalDate.now().getYear(), LocalDate.now().getYear(),
10_000_000); 10_000_000);
return doc;
} }
private String normalizeUtf8(String url) { private String normalizeUtf8(String url) {

View File

@@ -45,6 +45,7 @@ public class HttpFetcherImpl implements HttpFetcher {
private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic(); private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
private final Duration requestTimeout = Duration.ofSeconds(10); private final Duration requestTimeout = Duration.ofSeconds(10);
private final Duration probeTimeout = Duration.ofSeconds(30);
@Override @Override
public void setAllowAllContentTypes(boolean allowAllContentTypes) { public void setAllowAllContentTypes(boolean allowAllContentTypes) {
@@ -107,23 +108,27 @@ public class HttpFetcherImpl implements HttpFetcher {
.HEAD() .HEAD()
.uri(url.asURI()) .uri(url.asURI())
.header("User-agent", userAgentString) .header("User-agent", userAgentString)
.timeout(requestTimeout) .timeout(probeTimeout)
.build(); .build();
} catch (URISyntaxException e) { } catch (URISyntaxException e) {
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL"); return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
} }
try { for (int tries = 0;; tries++) {
var rsp = client.send(head, HttpResponse.BodyHandlers.discarding()); try {
EdgeUrl rspUri = new EdgeUrl(rsp.uri()); var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
EdgeUrl rspUri = new EdgeUrl(rsp.uri());
if (!Objects.equals(rspUri.domain, url.domain)) { if (!Objects.equals(rspUri.domain, url.domain)) {
return new DomainProbeResult.Redirect(rspUri.domain); return new DomainProbeResult.Redirect(rspUri.domain);
}
return new DomainProbeResult.Ok(rspUri);
} catch (Exception ex) {
if (tries > 3) {
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
}
// else try again ...
} }
return new DomainProbeResult.Ok(rspUri);
}
catch (Exception ex) {
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
} }
} }
@@ -143,7 +148,7 @@ public class HttpFetcherImpl implements HttpFetcher {
var headBuilder = HttpRequest.newBuilder() var headBuilder = HttpRequest.newBuilder()
.HEAD() .HEAD()
.uri(url.asURI()) .uri(url.asURI())
.header("User-agent", userAgentString) .header("User-Agent", userAgentString)
.header("Accept-Encoding", "gzip") .header("Accept-Encoding", "gzip")
.timeout(requestTimeout) .timeout(requestTimeout)
; ;
@@ -215,7 +220,7 @@ public class HttpFetcherImpl implements HttpFetcher {
var getBuilder = HttpRequest.newBuilder() var getBuilder = HttpRequest.newBuilder()
.GET() .GET()
.uri(url.asURI()) .uri(url.asURI())
.header("User-agent", userAgentString) .header("User-Agent", userAgentString)
.header("Accept-Encoding", "gzip") .header("Accept-Encoding", "gzip")
.header("Accept-Language", "en,*;q=0.5") .header("Accept-Language", "en,*;q=0.5")
.header("Accept", "text/html, application/xhtml+xml, text/*;q=0.8") .header("Accept", "text/html, application/xhtml+xml, text/*;q=0.8")
@@ -307,7 +312,7 @@ public class HttpFetcherImpl implements HttpFetcher {
.uri(sitemapUrl.asURI()) .uri(sitemapUrl.asURI())
.header("Accept-Encoding", "gzip") .header("Accept-Encoding", "gzip")
.header("Accept", "text/*, */*;q=0.9") .header("Accept", "text/*, */*;q=0.9")
.header("User-agent", userAgentString) .header("User-Agent", userAgentString)
.timeout(requestTimeout) .timeout(requestTimeout)
.build(); .build();
@@ -386,7 +391,7 @@ public class HttpFetcherImpl implements HttpFetcher {
.uri(url.asURI()) .uri(url.asURI())
.header("Accept-Encoding", "gzip") .header("Accept-Encoding", "gzip")
.header("Accept", "text/*, */*;q=0.9") .header("Accept", "text/*, */*;q=0.9")
.header("User-agent", userAgentString) .header("User-Agent", userAgentString)
.timeout(requestTimeout); .timeout(requestTimeout);
HttpFetchResult result = recorder.fetch(client, getRequest.build()); HttpFetchResult result = recorder.fetch(client, getRequest.build());

View File

@@ -381,8 +381,10 @@ public class CrawlerRetreiver implements AutoCloseable {
if (docOpt.isPresent()) { if (docOpt.isPresent()) {
var doc = docOpt.get(); var doc = docOpt.get();
crawlFrontier.enqueueLinksFromDocument(top, doc); var responseUrl = new EdgeUrl(ok.uri());
crawlFrontier.addVisited(new EdgeUrl(ok.uri()));
crawlFrontier.enqueueLinksFromDocument(responseUrl, doc);
crawlFrontier.addVisited(responseUrl);
} }
} }
else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) { else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) {

View File

@@ -12,8 +12,7 @@ import java.io.InputStream;
import java.net.InetAddress; import java.net.InetAddress;
import java.net.URI; import java.net.URI;
import java.net.http.HttpHeaders; import java.net.http.HttpHeaders;
import java.util.Arrays; import java.util.*;
import java.util.Optional;
/* FIXME: This interface has a very unfortunate name that is not very descriptive. /* FIXME: This interface has a very unfortunate name that is not very descriptive.
*/ */
@@ -65,7 +64,21 @@ public sealed interface HttpFetchResult {
) implements HttpFetchResult { ) implements HttpFetchResult {
public ResultOk(URI uri, int status, MessageHeaders headers, String ipAddress, byte[] bytes, int bytesStart, int length) { public ResultOk(URI uri, int status, MessageHeaders headers, String ipAddress, byte[] bytes, int bytesStart, int length) {
this(uri, status, HttpHeaders.of(headers.map(), (k,v) -> true), ipAddress, bytes, bytesStart, length); this(uri, status, convertHeaders(headers), ipAddress, bytes, bytesStart, length);
}
private static HttpHeaders convertHeaders(MessageHeaders messageHeaders) {
Map<String, List<String>> inputMap = messageHeaders.map();
Map<String, List<String>> filteredMap = new HashMap<>(Math.max(4, inputMap.size()));
inputMap.forEach((k, v) -> {
if (k.isBlank()) return;
if (!Character.isAlphabetic(k.charAt(0))) return;
filteredMap.put(k, v);
});
return HttpHeaders.of(filteredMap, (k,v) -> true);
} }
public boolean isOk() { public boolean isOk() {

View File

@@ -3,8 +3,10 @@ package nu.marginalia.search;
import com.google.inject.Inject; import com.google.inject.Inject;
import io.jooby.Context; import io.jooby.Context;
import io.jooby.Jooby; import io.jooby.Jooby;
import io.jooby.StatusCode;
import io.prometheus.client.Counter; import io.prometheus.client.Counter;
import io.prometheus.client.Histogram; import io.prometheus.client.Histogram;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.search.svc.*; import nu.marginalia.search.svc.*;
import nu.marginalia.service.discovery.property.ServicePartition; import nu.marginalia.service.discovery.property.ServicePartition;
import nu.marginalia.service.server.BaseServiceParams; import nu.marginalia.service.server.BaseServiceParams;
@@ -16,6 +18,7 @@ import java.util.List;
public class SearchService extends JoobyService { public class SearchService extends JoobyService {
private final WebsiteUrl websiteUrl;
private final SearchSiteSubscriptionService siteSubscriptionService; private final SearchSiteSubscriptionService siteSubscriptionService;
private static final Logger logger = LoggerFactory.getLogger(SearchService.class); private static final Logger logger = LoggerFactory.getLogger(SearchService.class);
@@ -33,6 +36,7 @@ public class SearchService extends JoobyService {
@Inject @Inject
public SearchService(BaseServiceParams params, public SearchService(BaseServiceParams params,
WebsiteUrl websiteUrl,
SearchFrontPageService frontPageService, SearchFrontPageService frontPageService,
SearchAddToCrawlQueueService addToCrawlQueueService, SearchAddToCrawlQueueService addToCrawlQueueService,
SearchSiteSubscriptionService siteSubscriptionService, SearchSiteSubscriptionService siteSubscriptionService,
@@ -51,6 +55,7 @@ public class SearchService extends JoobyService {
new SearchAddToCrawlQueueService_(addToCrawlQueueService), new SearchAddToCrawlQueueService_(addToCrawlQueueService),
new SearchBrowseService_(searchBrowseService) new SearchBrowseService_(searchBrowseService)
)); ));
this.websiteUrl = websiteUrl;
this.siteSubscriptionService = siteSubscriptionService; this.siteSubscriptionService = siteSubscriptionService;
} }
@@ -62,6 +67,10 @@ public class SearchService extends JoobyService {
final String startTimeAttribute = "start-time"; final String startTimeAttribute = "start-time";
jooby.get("/export-opml", siteSubscriptionService::exportOpml); jooby.get("/export-opml", siteSubscriptionService::exportOpml);
jooby.get("/site/https://*", this::handleSiteUrlRedirect);
jooby.get("/site/http://*", this::handleSiteUrlRedirect);
jooby.before((Context ctx) -> { jooby.before((Context ctx) -> {
ctx.setAttribute(startTimeAttribute, System.nanoTime()); ctx.setAttribute(startTimeAttribute, System.nanoTime());
}); });
@@ -80,5 +89,19 @@ public class SearchService extends JoobyService {
}); });
} }
/** Redirect handler for the case when the user passes
* an url like /site/https://example.com/, in this
* scenario we want to extract the domain name and redirect
* to /site/example.com/
*/
private Context handleSiteUrlRedirect(Context ctx) {
var pv = ctx.path("*").value();
int trailSlash = pv.indexOf('/');
if (trailSlash > 0) {
pv = pv.substring(0, trailSlash);
}
ctx.sendRedirect(StatusCode.TEMPORARY_REDIRECT, websiteUrl.withPath("site/" + pv));
return ctx;
}
} }

View File

@@ -86,8 +86,10 @@ public record SearchParameters(WebsiteUrl url,
public String renderUrl() { public String renderUrl() {
StringBuilder pathBuilder = new StringBuilder("/search?"); StringBuilder pathBuilder = new StringBuilder("/search?");
pathBuilder.append("query=").append(URLEncoder.encode(query, StandardCharsets.UTF_8));
if (query != null) {
pathBuilder.append("query=").append(URLEncoder.encode(query, StandardCharsets.UTF_8));
}
if (profile != SearchProfile.NO_FILTER) { if (profile != SearchProfile.NO_FILTER) {
pathBuilder.append("&profile=").append(URLEncoder.encode(profile.filterId, StandardCharsets.UTF_8)); pathBuilder.append("&profile=").append(URLEncoder.encode(profile.filterId, StandardCharsets.UTF_8));
} }

View File

@@ -56,7 +56,9 @@ public class SearchQueryService {
} }
catch (Exception ex) { catch (Exception ex) {
logger.error("Error", ex); logger.error("Error", ex);
return errorPageService.serveError(SearchParameters.defaultsForQuery(websiteUrl, query, page)); return errorPageService.serveError(
SearchParameters.defaultsForQuery(websiteUrl, query, Objects.requireNonNullElse(page, 1))
);
} }
} }

View File

@@ -9,7 +9,7 @@
<span> <span>
Access logs containing IP-addresses are retained for up to 24 hours, Access logs containing IP-addresses are retained for up to 24 hours,
anonymized logs with source addresses removed are sometimes kept longer anonymized logs with source addresses removed are sometimes kept longer
to help diagnosing bugs. to help diagnose bugs.
</span> </span>
</div> </div>
<div class="flex space-y-4 flex-col"> <div class="flex space-y-4 flex-col">
@@ -33,4 +33,4 @@
</span> </span>
</div> </div>
</footer> </footer>

View File

@@ -16,8 +16,6 @@ platforms, but for lack of suitable hardware, this can not be guaranteed.
The civilized way of installing this is to use [SDKMAN](https://sdkman.io/); The civilized way of installing this is to use [SDKMAN](https://sdkman.io/);
graalce is a good distribution choice but it doesn't matter too much. graalce is a good distribution choice but it doesn't matter too much.
**Tailwindcss** - Install NPM and run `npm install tailwindcss @tailwindcss/cli`
## Quick Set up ## Quick Set up
[https://docs.marginalia.nu/](https://docs.marginalia.nu/) has a more comprehensive guide for the install [https://docs.marginalia.nu/](https://docs.marginalia.nu/) has a more comprehensive guide for the install

View File

@@ -74,3 +74,7 @@ download_model model/tfreq-new-algo3.bin https://huggingface.co/MarginaliaNu/Mar
download_model model/lid.176.ftz https://huggingface.co/MarginaliaNu/MarginaliaModelData/resolve/c9339e4224f1dfad7f628809c32687e748198ae3/lid.176.ftz?download=true 340156704bb8c8e50c4abf35a7ec2569 download_model model/lid.176.ftz https://huggingface.co/MarginaliaNu/MarginaliaModelData/resolve/c9339e4224f1dfad7f628809c32687e748198ae3/lid.176.ftz?download=true 340156704bb8c8e50c4abf35a7ec2569
popd popd
pushd $(dirname $0)/..
npm install -D tailwindcss@3
popd

View File

@@ -239,6 +239,7 @@ dependencyResolutionManagement {
library('jooby-jte','io.jooby','jooby-jte').version(joobyVersion) library('jooby-jte','io.jooby','jooby-jte').version(joobyVersion)
library('jooby-apt','io.jooby','jooby-apt').version(joobyVersion) library('jooby-apt','io.jooby','jooby-apt').version(joobyVersion)
library('wiremock', 'org.wiremock','wiremock').version('3.11.0')
library('jte','gg.jte','jte').version('3.1.15') library('jte','gg.jte','jte').version('3.1.15')
bundle('jetty', ['jetty-server', 'jetty-util', 'jetty-servlet']) bundle('jetty', ['jetty-server', 'jetty-util', 'jetty-servlet'])