1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

13 Commits

Author SHA1 Message Date
Viktor Lofgren
a65d18f1d1 (client) Use virtual threads in a few more clients 2025-07-20 14:10:02 +02:00
Viktor Lofgren
6e214293e5 (ping) Fix backoff value overflow 2025-07-16 19:50:12 +02:00
Viktor Lofgren
52582a6d7d (experiment) Also add clients to loom experiment 2025-07-16 18:08:00 +02:00
Viktor Lofgren
ec0e39ad32 (experiment) Also add clients to loom experiment 2025-07-16 17:28:57 +02:00
Viktor Lofgren
6a15aee4b0 (ping) Fix arithmetic errors in backoff strategy due to long overflow 2025-07-16 17:23:36 +02:00
Viktor Lofgren
bd5111e8a2 (experimental) Add flag for using loom/virtual threads in gRPC executor 2025-07-16 17:12:07 +02:00
Viktor Lofgren
1ecbeb0272 (doc) Update ROADMAP.md 2025-07-14 13:38:34 +02:00
Viktor Lofgren
390f053406 (api) Add query parameter 'dc' for specifying the max number of results per domain 2025-07-14 10:09:30 +02:00
Viktor Lofgren
b03c43224c (search) Fix redirects in new search UI 2025-07-11 23:44:45 +02:00
Viktor Lofgren
9b4ce9e9eb (search) Fix !w redirect 2025-07-11 23:28:09 +02:00
Viktor
81ac02a695 Merge pull request #209 from us3r1d/master
added converter.insertFoundDomains property
2025-07-11 21:34:04 +02:00
krystal
47f624fb3b changed converter.insertFoundDomains to loader.insertFoundDomains 2025-07-11 12:13:45 -07:00
krystal
c866f19cbb added converter.insertFoundDomains property 2025-07-10 15:36:59 -07:00
16 changed files with 127 additions and 47 deletions

View File

@@ -48,10 +48,6 @@ filter for any API consumer.
I've talked to the stract dev, and he does not think it's a good idea to mimic their optics language, which is quite ad-hoc; instead, the suggestion is to work together to find some new common description language for this.
## Show favicons next to search results
This is expected from search engines. Basic proof of concept sketch of fetching this data has been done, but the feature is some way from being reality.
## Specialized crawler for GitHub
One of the search engine's biggest limitations right now is that it does not index GitHub at all. A specialized crawler that fetches at least the README.md would go a long way toward providing search capabilities in this domain.
@@ -66,6 +62,10 @@ The documents database probably should have some sort of flag indicating it's a
PDF parsing is known to be a bit of a security liability so some thought needs to be put in
that direction as well.
## Show favicons next to search results (COMPLETED 2025-03)
This is expected from search engines. Basic proof of concept sketch of fetching this data has been done, but the feature is some way from being reality.
## Web Design Overhaul (COMPLETED 2025-01)
The design is kinda clunky and hard to maintain, and needlessly outdated-looking.

View File

@@ -13,6 +13,7 @@ import nu.marginalia.service.discovery.property.ServicePartition;
import nu.marginalia.util.NamedExecutorFactory;
import java.util.concurrent.Executor;
import java.util.concurrent.Executors;
import java.util.function.Function;
@Singleton
@@ -20,10 +21,15 @@ public class GrpcChannelPoolFactory {
private final NodeConfigurationWatcher nodeConfigurationWatcher;
private final ServiceRegistryIf serviceRegistryIf;
private static final Executor executor = NamedExecutorFactory.createFixed("gRPC-Channel-Pool",
Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
private static final Executor offloadExecutor = NamedExecutorFactory.createFixed("gRPC-Offload-Pool",
Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
private static final Executor executor = useLoom
? Executors.newVirtualThreadPerTaskExecutor()
: NamedExecutorFactory.createFixed("gRPC-Channel-Pool", Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
private static final Executor offloadExecutor = useLoom
? Executors.newVirtualThreadPerTaskExecutor()
: NamedExecutorFactory.createFixed("gRPC-Offload-Pool", Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
@Inject
public GrpcChannelPoolFactory(NodeConfigurationWatcher nodeConfigurationWatcher,

View File

@@ -13,9 +13,14 @@ import nu.marginalia.util.NamedExecutorFactory;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
public class GrpcServer {
private final Server server;
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
public GrpcServer(ServiceConfiguration config,
ServiceRegistryIf serviceRegistry,
ServicePartition partition,
@@ -26,8 +31,13 @@ public class GrpcServer {
int nThreads = Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 16);
// Start the gRPC server
ExecutorService workExecutor = useLoom ?
Executors.newVirtualThreadPerTaskExecutor() :
NamedExecutorFactory.createFixed("nettyExecutor", nThreads);
var grpcServerBuilder = NettyServerBuilder.forAddress(new InetSocketAddress(config.bindAddress(), port))
.executor(NamedExecutorFactory.createFixed("nettyExecutor", nThreads))
.executor(workExecutor)
.workerEventLoopGroup(new NioEventLoopGroup(nThreads, NamedExecutorFactory.createFixed("Worker-ELG", nThreads)))
.bossEventLoopGroup(new NioEventLoopGroup(nThreads, NamedExecutorFactory.createFixed("Boss-ELG", nThreads)))
.channelType(NioServerSocketChannel.class);

View File

@@ -47,6 +47,8 @@ public class ScrapeFeedsActor extends RecordActorPrototype {
private final Path feedPath = WmsaHome.getHomePath().resolve("data/scrape-urls.txt");
private static boolean insertFoundDomains = Boolean.getBoolean("loader.insertFoundDomains");
public record Initial() implements ActorStep {}
@Resume(behavior = ActorResumeBehavior.RETRY)
public record Wait(String ts) implements ActorStep {}
@@ -57,6 +59,8 @@ public class ScrapeFeedsActor extends RecordActorPrototype {
public ActorStep transition(ActorStep self) throws Exception {
return switch(self) {
case Initial() -> {
if (!insertFoundDomains) yield new Error("Domain insertion prohibited, aborting");
if (nodeConfigurationService.get(nodeId).profile() != NodeProfile.REALTIME) {
yield new Error("Invalid node profile for RSS update");
}

View File

@@ -2,6 +2,8 @@ package nu.marginalia.api.domains;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.api.domains.model.DomainInformation;
import nu.marginalia.api.domains.model.SimilarDomain;
import nu.marginalia.service.client.GrpcChannelPoolFactory;
import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
import nu.marginalia.service.discovery.property.ServiceKey;
@@ -10,16 +12,19 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
import java.util.concurrent.*;
import nu.marginalia.api.domains.model.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
@Singleton
public class DomainInfoClient {
private static final Logger logger = LoggerFactory.getLogger(DomainInfoClient.class);
private final GrpcSingleNodeChannelPool<DomainInfoAPIGrpc.DomainInfoAPIBlockingStub> channelPool;
private final ExecutorService executor = Executors.newWorkStealingPool(8);
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
private static final ExecutorService executor = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newWorkStealingPool(8);
@Inject
public DomainInfoClient(GrpcChannelPoolFactory factory) {

View File

@@ -24,7 +24,9 @@ import java.util.function.BiConsumer;
@Singleton
public class FeedsClient {
private final ExecutorService executorService = Executors.newCachedThreadPool();
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
private static final ExecutorService executorService = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newCachedThreadPool();
private final GrpcSingleNodeChannelPool<FeedApiGrpc.FeedApiBlockingStub> channelPool;
private final MqOutbox updateFeedsOutbox;

View File

@@ -26,7 +26,9 @@ public class MathClient {
private static final Logger logger = LoggerFactory.getLogger(MathClient.class);
private final GrpcSingleNodeChannelPool<MathApiGrpc.MathApiBlockingStub> channelPool;
private final ExecutorService executor = Executors.newWorkStealingPool(8);
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
private static final ExecutorService executor = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newWorkStealingPool(8);
@Inject
public MathClient(GrpcChannelPoolFactory factory) {

View File

@@ -38,7 +38,9 @@ public class IndexClient {
.help("Count of results filtered by NSFW tier")
.register();
private static final ExecutorService executor = Executors.newCachedThreadPool();
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
private static final ExecutorService executor = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newCachedThreadPool();
@Inject
public IndexClient(GrpcChannelPoolFactory channelPoolFactory,

View File

@@ -40,6 +40,8 @@ public class LoaderMain extends ProcessMainClass {
private final KeywordLoaderService keywordLoaderService;
private final DocumentLoaderService documentLoaderService;
private static boolean insertFoundDomains = Boolean.getBoolean("loader.insertFoundDomains");
public static void main(String... args) {
try {
new org.mariadb.jdbc.Driver();
@@ -99,14 +101,29 @@ public class LoaderMain extends ProcessMainClass {
try {
var results = ForkJoinPool.commonPool()
.invokeAll(
List.of(
() -> linksService.loadLinks(domainIdRegistry, heartbeat, inputData),
() -> keywordLoaderService.loadKeywords(domainIdRegistry, heartbeat, inputData),
() -> documentLoaderService.loadDocuments(domainIdRegistry, heartbeat, inputData),
() -> domainService.loadDomainMetadata(domainIdRegistry, heartbeat, inputData)
)
);
.invokeAll(List.of());
if ( true == insertFoundDomains ) {
results = ForkJoinPool.commonPool()
.invokeAll(
List.of(
() -> linksService.loadLinks(domainIdRegistry, heartbeat, inputData),
() -> keywordLoaderService.loadKeywords(domainIdRegistry, heartbeat, inputData),
() -> documentLoaderService.loadDocuments(domainIdRegistry, heartbeat, inputData),
() -> domainService.loadDomainMetadata(domainIdRegistry, heartbeat, inputData)
)
);
}
else {
results = ForkJoinPool.commonPool()
.invokeAll(
List.of(
() -> keywordLoaderService.loadKeywords(domainIdRegistry, heartbeat, inputData),
() -> documentLoaderService.loadDocuments(domainIdRegistry, heartbeat, inputData),
() -> domainService.loadDomainMetadata(domainIdRegistry, heartbeat, inputData)
)
);
}
for (var result : results) {
if (result.state() == Future.State.FAILED) {

View File

@@ -25,6 +25,8 @@ import java.util.Set;
@Singleton
public class DomainLoaderService {
private static boolean insertFoundDomains = Boolean.getBoolean("loader.insertFoundDomains");
private final HikariDataSource dataSource;
private final Logger logger = LoggerFactory.getLogger(DomainLoaderService.class);
private final int nodeId;
@@ -84,25 +86,34 @@ public class DomainLoaderService {
// Add domains that are linked to from the domains we've just crawled, but with -1 affinity meaning they
// can be grabbed by any index node
try (var inserter = new DomainInserter(conn, -1);
var processHeartbeat = heartbeat.createAdHocTaskHeartbeat("INSERT_LINKED_DOMAINS")) {
// Add linked domains, but with -1 affinity meaning they can be grabbed by any index node
int pageIdx = 0;
if ( true == insertFoundDomains ) {
logger.info("Adding found domains");
for (SlopTable.Ref<SlopDomainLinkRecord> page : inputData.listDomainLinkPages()) {
processHeartbeat.progress("INSERT", pageIdx++, domainLinkPageRefs.size());
try (var inserter = new DomainInserter(conn, -1);
var processHeartbeat = heartbeat.createAdHocTaskHeartbeat("INSERT_LINKED_DOMAINS")) {
// Add linked domains, but with -1 affinity meaning they can be grabbed by any index node
int pageIdx = 0;
try (var reader = new SlopDomainLinkRecord.Reader(page)) {
while (reader.hasMore()) {
SlopDomainLinkRecord record = reader.next();
String domainName = record.dest();
if (domainNamesAll.add(domainName)) {
inserter.accept(new EdgeDomain(domainName));
for (SlopTable.Ref<SlopDomainLinkRecord> page : inputData.listDomainLinkPages()) {
processHeartbeat.progress("INSERT", pageIdx++, domainLinkPageRefs.size());
try (var reader = new SlopDomainLinkRecord.Reader(page)) {
while (reader.hasMore()) {
SlopDomainLinkRecord record = reader.next();
String domainName = record.dest();
if (domainNamesAll.add(domainName)) {
inserter.accept(new EdgeDomain(domainName));
}
}
}
}
}
}
else {
logger.info("Skipping found domains");
}
taskHeartbeat.progress(Steps.UPDATE_AFFINITY_AND_IP);

View File

@@ -61,14 +61,14 @@ public class BackoffStrategy {
};
double backoffMinutes = baseInterval.toMinutes()
* Math.pow(multiplier, backoffConsecutiveFailures - 1);
* Math.pow(multiplier, Math.clamp(backoffConsecutiveFailures, 1, 10));
Duration newDuration = Duration.ofMinutes(Math.round(0.5+backoffMinutes));
if (newDuration.compareTo(maxInterval) > 0) {
var backoffVal = Math.round(0.5+backoffMinutes);
if (backoffVal > maxInterval.toMinutes()) {
return maxInterval;
}
return newDuration;
return Duration.ofMinutes(backoffVal);
}
private Duration addJitter(Duration duration) {

View File

@@ -30,10 +30,11 @@ public class ApiSearchOperator {
public ApiSearchResults query(String query,
int count,
int domainCount,
int index,
NsfwFilterTier filterTier)
{
var rsp = queryClient.search(createParams(query, count, index, filterTier));
var rsp = queryClient.search(createParams(query, count, domainCount, index, filterTier));
return new ApiSearchResults("RESTRICTED", query,
rsp.results()
@@ -44,13 +45,13 @@ public class ApiSearchOperator {
.collect(Collectors.toList()));
}
private QueryParams createParams(String query, int count, int index, NsfwFilterTier filterTirer) {
private QueryParams createParams(String query, int count, int domainCount, int index, NsfwFilterTier filterTirer) {
SearchSetIdentifier searchSet = selectSearchSet(index);
return new QueryParams(
query,
RpcQueryLimits.newBuilder()
.setResultsByDomain(2)
.setResultsByDomain(Math.clamp(domainCount, 1, 100))
.setResultsTotal(Math.min(100, count))
.setTimeoutMs(150)
.setFetchSize(8192)

View File

@@ -119,6 +119,7 @@ public class ApiService extends SparkService {
}
int count = intParam(request, "count", 20);
int domainCount = intParam(request, "dc", 2);
int index = intParam(request, "index", 3);
int nsfw = intParam(request, "nsfw", 1);
@@ -137,7 +138,7 @@ public class ApiService extends SparkService {
.labels(license.key)
.time(() ->
searchOperator
.query(query, count, index, nsfwFilterTier)
.query(query, count, domainCount, index, nsfwFilterTier)
.withLicense(license.getLicense())
);
}

View File

@@ -20,7 +20,7 @@ public class BangCommand implements SearchCommandInterface {
{
bangsToPattern.put("!g", "https://www.google.com/search?q=%s");
bangsToPattern.put("!ddg", "https://duckduckgo.com/?q=%s");
bangsToPattern.put("!w", "https://search.marginalia.nu/search?query=%s+site:en.wikipedia.org&profile=wiki");
bangsToPattern.put("!w", "https://old-search.marginalia.nu/search?query=%s+site:en.wikipedia.org&profile=wiki");
}
@Override

View File

@@ -20,7 +20,7 @@ public class BangCommand implements SearchCommandInterface {
{
bangsToPattern.put("!g", "https://www.google.com/search?q=%s");
bangsToPattern.put("!ddg", "https://duckduckgo.com/?q=%s");
bangsToPattern.put("!w", "https://search.marginalia.nu/search?query=%s+site:en.wikipedia.org&profile=wiki");
bangsToPattern.put("!w", "/search?query=%s+site:en.wikipedia.org");
}
@Override
@@ -34,7 +34,7 @@ public class BangCommand implements SearchCommandInterface {
if (match.isPresent()) {
var url = String.format(redirectPattern, URLEncoder.encode(match.get(), StandardCharsets.UTF_8));
new MapModelAndView("redirect.jte", Map.of("url", url));
return Optional.of(new MapModelAndView("redirect.jte", Map.of("url", url)));
}
}

View File

@@ -0,0 +1,19 @@
package nu.marginalia.search.command.commands;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.search.command.SearchParameters;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
 * Checks how {@link BangCommand#process} responds to a query with and
 * without the {@code !w} bang prefix.
 */
class BangCommandTest {

    /**
     * A query starting with {@code !w} must yield a present result, while the
     * same search term without the bang must yield an empty one.
     */
    @Test
    void testWikipediaRedirect() {
        var command = new BangCommand();

        // Query carrying the "!w" bang: a result is expected.
        var bangQuery = SearchParameters.defaultsForQuery(new WebsiteUrl("test"), "!w plato", 1);
        assertTrue(command.process(bangQuery).isPresent());

        // Same term without any bang: the command should not produce a result.
        var plainQuery = SearchParameters.defaultsForQuery(new WebsiteUrl("test"), "plato", 1);
        assertFalse(command.process(plainQuery).isPresent());
    }
}