Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-10-06 07:32:38 +02:00)

Compare commits: deploy-027...deploy-028 (13 commits)
Commits:

a65d18f1d1
6e214293e5
52582a6d7d
ec0e39ad32
6a15aee4b0
bd5111e8a2
1ecbeb0272
390f053406
b03c43224c
9b4ce9e9eb
81ac02a695
47f624fb3b
c866f19cbb
(roadmap/TODO document)

@@ -48,10 +48,6 @@ filter for any API consumer.
 
 I've talked to the stract dev and he does not think it's a good idea to mimic their optics language, which is quite ad-hoc, but instead to work together to find some new common description language for this.
 
-## Show favicons next to search results
-
-This is expected from search engines. Basic proof of concept sketch of fetching this data has been done, but the feature is some way from being reality.
-
 ## Specialized crawler for github
 
 One of the search engine's biggest limitations right now is that it does not index github at all. A specialized crawler that fetches at least the readme.md would go a long way toward providing search capabilities in this domain.
@@ -66,6 +62,10 @@ The documents database probably should have some sort of flag indicating it's a
 PDF parsing is known to be a bit of a security liability so some thought needs to be put in
 that direction as well.
 
+## Show favicons next to search results (COMPLETED 2025-03)
+
+This is expected from search engines. Basic proof of concept sketch of fetching this data has been done, but the feature is some way from being reality.
+
 ## Web Design Overhaul (COMPLETED 2025-01)
 
 The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
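The favicon item (moved above to the completed section) turns on per-site icon fetching. A minimal sketch of that fetching step, assuming jsoup and purely illustrative (not the repository's actual proof of concept): prefer an explicit link rel="icon" if the page declares one, otherwise fall back to the /favicon.ico convention.

// Illustrative favicon lookup, assuming jsoup; not the repo's PoC code.
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class FaviconSketch {
    // Prefer an explicit <link rel="icon">, fall back to /favicon.ico
    public static String findFaviconUrl(String siteUrl) throws Exception {
        Document doc = Jsoup.connect(siteUrl).get();
        Element icon = doc.selectFirst("link[rel~=(?i)icon]");
        if (icon != null) {
            return icon.absUrl("href"); // resolves relative hrefs against the page URL
        }
        return siteUrl.replaceAll("/+$", "") + "/favicon.ico";
    }
}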
GrpcChannelPoolFactory.java

@@ -13,6 +13,7 @@ import nu.marginalia.service.discovery.property.ServicePartition;
 import nu.marginalia.util.NamedExecutorFactory;
 
 import java.util.concurrent.Executor;
+import java.util.concurrent.Executors;
 import java.util.function.Function;
 
 @Singleton
@@ -20,10 +21,15 @@ public class GrpcChannelPoolFactory {
 
     private final NodeConfigurationWatcher nodeConfigurationWatcher;
     private final ServiceRegistryIf serviceRegistryIf;
-    private static final Executor executor = NamedExecutorFactory.createFixed("gRPC-Channel-Pool",
-            Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
-    private static final Executor offloadExecutor = NamedExecutorFactory.createFixed("gRPC-Offload-Pool",
-            Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
+
+    private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
+
+    private static final Executor executor = useLoom
+            ? Executors.newVirtualThreadPerTaskExecutor()
+            : NamedExecutorFactory.createFixed("gRPC-Channel-Pool", Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
+    private static final Executor offloadExecutor = useLoom
+            ? Executors.newVirtualThreadPerTaskExecutor()
+            : NamedExecutorFactory.createFixed("gRPC-Offload-Pool", Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
 
     @Inject
     public GrpcChannelPoolFactory(NodeConfigurationWatcher nodeConfigurationWatcher,
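Boolean.getBoolean reads a JVM system property, so the Loom path above is opt-in via -Dsystem.experimentalUseLoom=true and defaults to the fixed platform-thread pools. A self-contained sketch of the same gating pattern; the class and task names here are illustrative, not from the repository:

// Sketch of the useLoom gating pattern (requires Java 21+ for virtual threads).
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class LoomGateSketch {
    // Boolean.getBoolean reads a JVM system property, so this is switched on
    // with: java -Dsystem.experimentalUseLoom=true ...
    private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");

    private static final ExecutorService executor = useLoom
            ? Executors.newVirtualThreadPerTaskExecutor()   // one virtual thread per task
            : Executors.newFixedThreadPool(
                    Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));

    public static void main(String[] args) throws Exception {
        var future = executor.submit(() -> "ran on " + Thread.currentThread());
        System.out.println(future.get());
        executor.shutdown();
    }
}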
GrpcServer.java

@@ -13,9 +13,14 @@ import nu.marginalia.util.NamedExecutorFactory;
 import java.io.IOException;
 import java.net.InetSocketAddress;
 import java.util.List;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
 
 public class GrpcServer {
     private final Server server;
+
+    private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
+
     public GrpcServer(ServiceConfiguration config,
                       ServiceRegistryIf serviceRegistry,
                       ServicePartition partition,
@@ -26,8 +31,13 @@ public class GrpcServer {
         int nThreads = Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 16);
 
         // Start the gRPC server
+
+        ExecutorService workExecutor = useLoom ?
+                Executors.newVirtualThreadPerTaskExecutor() :
+                NamedExecutorFactory.createFixed("nettyExecutor", nThreads);
+
         var grpcServerBuilder = NettyServerBuilder.forAddress(new InetSocketAddress(config.bindAddress(), port))
-                .executor(NamedExecutorFactory.createFixed("nettyExecutor", nThreads))
+                .executor(workExecutor)
                 .workerEventLoopGroup(new NioEventLoopGroup(nThreads, NamedExecutorFactory.createFixed("Worker-ELG", nThreads)))
                 .bossEventLoopGroup(new NioEventLoopGroup(nThreads, NamedExecutorFactory.createFixed("Boss-ELG", nThreads)))
                 .channelType(NioServerSocketChannel.class);
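Note that only the request executor is gated on useLoom here; the boss and worker NioEventLoopGroups stay on fixed platform threads, which fits Netty's model of long-lived, dedicated event-loop threads.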
ScrapeFeedsActor.java

@@ -47,6 +47,8 @@ public class ScrapeFeedsActor extends RecordActorPrototype {
 
     private final Path feedPath = WmsaHome.getHomePath().resolve("data/scrape-urls.txt");
 
+    private static boolean insertFoundDomains = Boolean.getBoolean("loader.insertFoundDomains");
+
     public record Initial() implements ActorStep {}
     @Resume(behavior = ActorResumeBehavior.RETRY)
     public record Wait(String ts) implements ActorStep {}
@@ -57,6 +59,8 @@ public class ScrapeFeedsActor extends RecordActorPrototype {
     public ActorStep transition(ActorStep self) throws Exception {
         return switch(self) {
             case Initial() -> {
+                if (!insertFoundDomains) yield new Error("Domain insertion prohibited, aborting");
+
                 if (nodeConfigurationService.get(nodeId).profile() != NodeProfile.REALTIME) {
                     yield new Error("Invalid node profile for RSS update");
                 }
DomainInfoClient.java

@@ -2,6 +2,8 @@ package nu.marginalia.api.domains;
 
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
+import nu.marginalia.api.domains.model.DomainInformation;
+import nu.marginalia.api.domains.model.SimilarDomain;
 import nu.marginalia.service.client.GrpcChannelPoolFactory;
 import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
 import nu.marginalia.service.discovery.property.ServiceKey;
@@ -10,16 +12,19 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.util.List;
-import java.util.concurrent.*;
-
-import nu.marginalia.api.domains.model.*;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
 
 @Singleton
 public class DomainInfoClient {
     private static final Logger logger = LoggerFactory.getLogger(DomainInfoClient.class);
 
     private final GrpcSingleNodeChannelPool<DomainInfoAPIGrpc.DomainInfoAPIBlockingStub> channelPool;
-    private final ExecutorService executor = Executors.newWorkStealingPool(8);
+
+    private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
+    private static final ExecutorService executor = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newWorkStealingPool(8);
+
 
     @Inject
     public DomainInfoClient(GrpcChannelPoolFactory factory) {
FeedsClient.java

@@ -24,7 +24,9 @@ import java.util.function.BiConsumer;
 
 @Singleton
 public class FeedsClient {
-    private final ExecutorService executorService = Executors.newCachedThreadPool();
+
+    private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
+    private static final ExecutorService executorService = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newCachedThreadPool();
 
     private final GrpcSingleNodeChannelPool<FeedApiGrpc.FeedApiBlockingStub> channelPool;
     private final MqOutbox updateFeedsOutbox;
MathClient.java

@@ -26,7 +26,9 @@ public class MathClient {
     private static final Logger logger = LoggerFactory.getLogger(MathClient.class);
 
     private final GrpcSingleNodeChannelPool<MathApiGrpc.MathApiBlockingStub> channelPool;
-    private final ExecutorService executor = Executors.newWorkStealingPool(8);
+
+    private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
+    private static final ExecutorService executor = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newWorkStealingPool(8);
 
     @Inject
     public MathClient(GrpcChannelPoolFactory factory) {
IndexClient.java

@@ -38,7 +38,9 @@ public class IndexClient {
             .help("Count of results filtered by NSFW tier")
             .register();
 
-    private static final ExecutorService executor = Executors.newCachedThreadPool();
+
+    private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
+    private static final ExecutorService executor = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newCachedThreadPool();
 
     @Inject
     public IndexClient(GrpcChannelPoolFactory channelPoolFactory,
LoaderMain.java

@@ -40,6 +40,8 @@ public class LoaderMain extends ProcessMainClass {
     private final KeywordLoaderService keywordLoaderService;
    private final DocumentLoaderService documentLoaderService;
 
+    private static boolean insertFoundDomains = Boolean.getBoolean("loader.insertFoundDomains");
+
     public static void main(String... args) {
         try {
             new org.mariadb.jdbc.Driver();
@@ -99,14 +101,29 @@ public class LoaderMain extends ProcessMainClass {
 
         try {
             var results = ForkJoinPool.commonPool()
-                    .invokeAll(
-                            List.of(
-                                    () -> linksService.loadLinks(domainIdRegistry, heartbeat, inputData),
-                                    () -> keywordLoaderService.loadKeywords(domainIdRegistry, heartbeat, inputData),
-                                    () -> documentLoaderService.loadDocuments(domainIdRegistry, heartbeat, inputData),
-                                    () -> domainService.loadDomainMetadata(domainIdRegistry, heartbeat, inputData)
-                            )
-                    );
+                    .invokeAll(List.of());
+
+            if ( true == insertFoundDomains ) {
+                results = ForkJoinPool.commonPool()
+                        .invokeAll(
+                                List.of(
+                                        () -> linksService.loadLinks(domainIdRegistry, heartbeat, inputData),
+                                        () -> keywordLoaderService.loadKeywords(domainIdRegistry, heartbeat, inputData),
+                                        () -> documentLoaderService.loadDocuments(domainIdRegistry, heartbeat, inputData),
+                                        () -> domainService.loadDomainMetadata(domainIdRegistry, heartbeat, inputData)
+                                )
+                        );
+            }
+            else {
+                results = ForkJoinPool.commonPool()
+                        .invokeAll(
+                                List.of(
+                                        () -> keywordLoaderService.loadKeywords(domainIdRegistry, heartbeat, inputData),
+                                        () -> documentLoaderService.loadDocuments(domainIdRegistry, heartbeat, inputData),
+                                        () -> domainService.loadDomainMetadata(domainIdRegistry, heartbeat, inputData)
+                                )
+                        );
+            }
 
             for (var result : results) {
                 if (result.state() == Future.State.FAILED) {
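The if/else above repeats three of the four loader tasks. A more compact equivalent, sketched under the assumption that the surrounding fields are unchanged and the tasks return Boolean as in the List.of version (this is not the committed code):

// Sketch: build the task list conditionally instead of duplicating it.
// Assumes java.util.ArrayList, java.util.List and java.util.concurrent.Callable are imported.
List<Callable<Boolean>> tasks = new ArrayList<>();
if (insertFoundDomains) {
    tasks.add(() -> linksService.loadLinks(domainIdRegistry, heartbeat, inputData));
}
tasks.add(() -> keywordLoaderService.loadKeywords(domainIdRegistry, heartbeat, inputData));
tasks.add(() -> documentLoaderService.loadDocuments(domainIdRegistry, heartbeat, inputData));
tasks.add(() -> domainService.loadDomainMetadata(domainIdRegistry, heartbeat, inputData));

var results = ForkJoinPool.commonPool().invokeAll(tasks);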
DomainLoaderService.java

@@ -25,6 +25,8 @@ import java.util.Set;
 @Singleton
 public class DomainLoaderService {
 
+    private static boolean insertFoundDomains = Boolean.getBoolean("loader.insertFoundDomains");
+
     private final HikariDataSource dataSource;
     private final Logger logger = LoggerFactory.getLogger(DomainLoaderService.class);
     private final int nodeId;
@@ -84,25 +86,34 @@ public class DomainLoaderService {
 
             // Add domains that are linked to from the domains we've just crawled, but with -1 affinity meaning they
             // can be grabbed by any index node
-            try (var inserter = new DomainInserter(conn, -1);
-                 var processHeartbeat = heartbeat.createAdHocTaskHeartbeat("INSERT_LINKED_DOMAINS")) {
-                // Add linked domains, but with -1 affinity meaning they can be grabbed by any index node
-                int pageIdx = 0;
+            if ( true == insertFoundDomains ) {
+                logger.info("Adding found domains");
 
-                for (SlopTable.Ref<SlopDomainLinkRecord> page : inputData.listDomainLinkPages()) {
-                    processHeartbeat.progress("INSERT", pageIdx++, domainLinkPageRefs.size());
+                try (var inserter = new DomainInserter(conn, -1);
+                     var processHeartbeat = heartbeat.createAdHocTaskHeartbeat("INSERT_LINKED_DOMAINS")) {
+                    // Add linked domains, but with -1 affinity meaning they can be grabbed by any index node
+                    int pageIdx = 0;
 
-                    try (var reader = new SlopDomainLinkRecord.Reader(page)) {
-                        while (reader.hasMore()) {
-                            SlopDomainLinkRecord record = reader.next();
-                            String domainName = record.dest();
-                            if (domainNamesAll.add(domainName)) {
-                                inserter.accept(new EdgeDomain(domainName));
-                            }
-                        }
-                    }
-                }
-            }
+                    for (SlopTable.Ref<SlopDomainLinkRecord> page : inputData.listDomainLinkPages()) {
+                        processHeartbeat.progress("INSERT", pageIdx++, domainLinkPageRefs.size());
+
+                        try (var reader = new SlopDomainLinkRecord.Reader(page)) {
+                            while (reader.hasMore()) {
+                                SlopDomainLinkRecord record = reader.next();
+                                String domainName = record.dest();
+                                if (domainNamesAll.add(domainName)) {
+                                    inserter.accept(new EdgeDomain(domainName));
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            else {
+                logger.info("Skipping found domains");
+            }
 
             taskHeartbeat.progress(Steps.UPDATE_AFFINITY_AND_IP);
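All of the new toggles in this range (loader.insertFoundDomains in ScrapeFeedsActor, LoaderMain and DomainLoaderService; system.experimentalUseLoom in the gRPC factory and clients) are read via Boolean.getBoolean, i.e. ordinary JVM system properties set with -D on the command line. Boolean.getBoolean returns false when the property is unset, so by default the loader now skips linked-domain insertion and the executors stay on platform threads.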
BackoffStrategy.java

@@ -61,14 +61,14 @@ public class BackoffStrategy {
         };
 
         double backoffMinutes = baseInterval.toMinutes()
-                * Math.pow(multiplier, backoffConsecutiveFailures - 1);
+                * Math.pow(multiplier, Math.clamp(backoffConsecutiveFailures, 1, 10));
 
-        Duration newDuration = Duration.ofMinutes(Math.round(0.5+backoffMinutes));
-        if (newDuration.compareTo(maxInterval) > 0) {
+        var backoffVal = Math.round(0.5+backoffMinutes);
+        if (backoffVal > maxInterval.toMinutes()) {
             return maxInterval;
         }
 
-        return newDuration;
+        return Duration.ofMinutes(backoffVal);
     }
 
     private Duration addJitter(Duration duration) {
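The rewrite does two things: it clamps the exponent so backoffMinutes can no longer explode for large failure counts (with the old code, a large enough count saturates Math.round at Long.MAX_VALUE, and Duration.ofMinutes then throws on overflow), and it compares plain minute counts before constructing a Duration. Note the exponent also changes from n-1 to clamp(n, 1, 10), so the first retry now backs off by base * multiplier rather than base. A worked sketch with illustrative values (5-minute base, multiplier 2, 60-minute cap; not the repository's configuration):

// Worked example of the new backoff arithmetic; values are illustrative.
import java.time.Duration;

public class BackoffSketch {
    public static void main(String[] args) {
        Duration baseInterval = Duration.ofMinutes(5);
        Duration maxInterval = Duration.ofMinutes(60);
        double multiplier = 2.0;

        for (int failures : new int[] {1, 3, 5, 60}) {
            double backoffMinutes = baseInterval.toMinutes()
                    * Math.pow(multiplier, Math.clamp(failures, 1, 10));

            long backoffVal = Math.round(0.5 + backoffMinutes);
            Duration d = backoffVal > maxInterval.toMinutes()
                    ? maxInterval
                    : Duration.ofMinutes(backoffVal);

            System.out.println(failures + " failures -> " + d);
        }
        // 1 failure -> 10 min (exponent clamp(1,1,10) = 1, so base*multiplier, not base)
        // 3 failures -> 40 min; 5 failures -> capped at 60 min
        // 60 failures -> exponent clamped to 10, then capped at 60 min; the old
        // code would have fed an astronomically large value to Duration.ofMinutes.
    }
}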
ApiSearchOperator.java

@@ -30,10 +30,11 @@ public class ApiSearchOperator {
 
     public ApiSearchResults query(String query,
                                   int count,
+                                  int domainCount,
                                   int index,
                                   NsfwFilterTier filterTier)
     {
-        var rsp = queryClient.search(createParams(query, count, index, filterTier));
+        var rsp = queryClient.search(createParams(query, count, domainCount, index, filterTier));
 
         return new ApiSearchResults("RESTRICTED", query,
                 rsp.results()
@@ -44,13 +45,13 @@ public class ApiSearchOperator {
                 .collect(Collectors.toList()));
     }
 
-    private QueryParams createParams(String query, int count, int index, NsfwFilterTier filterTirer) {
+    private QueryParams createParams(String query, int count, int domainCount, int index, NsfwFilterTier filterTirer) {
         SearchSetIdentifier searchSet = selectSearchSet(index);
 
         return new QueryParams(
                 query,
                 RpcQueryLimits.newBuilder()
-                        .setResultsByDomain(2)
+                        .setResultsByDomain(Math.clamp(domainCount, 1, 100))
                        .setResultsTotal(Math.min(100, count))
                        .setTimeoutMs(150)
                        .setFetchSize(8192)
ApiService.java

@@ -119,6 +119,7 @@ public class ApiService extends SparkService {
     }
 
         int count = intParam(request, "count", 20);
+        int domainCount = intParam(request, "dc", 2);
         int index = intParam(request, "index", 3);
         int nsfw = intParam(request, "nsfw", 1);
 
@@ -137,7 +138,7 @@ public class ApiService extends SparkService {
                 .labels(license.key)
                 .time(() ->
                         searchOperator
-                                .query(query, count, index, nsfwFilterTier)
+                                .query(query, count, domainCount, index, nsfwFilterTier)
                                 .withLicense(license.getLicense())
                 );
     }
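Together these two files expose the per-domain result limit, previously hard-coded to 2, as an API parameter: dc defaults to 2 and is clamped server-side to [1, 100], so existing consumers see no change. A hedged usage sketch, assuming the public API's documented https://api.marginalia.nu/{key}/search/{query} URL shape; the key and query below are illustrative:

// Illustrative call using the new "dc" parameter.
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class ApiDcSketch {
    public static void main(String[] args) throws Exception {
        var uri = URI.create(
                "https://api.marginalia.nu/public/search/plato?count=20&dc=5");
        var rsp = HttpClient.newHttpClient().send(
                HttpRequest.newBuilder(uri).GET().build(),
                HttpResponse.BodyHandlers.ofString());
        System.out.println(rsp.body());   // JSON; up to 5 results per domain
    }
}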
BangCommand.java

@@ -20,7 +20,7 @@ public class BangCommand implements SearchCommandInterface {
     {
         bangsToPattern.put("!g", "https://www.google.com/search?q=%s");
         bangsToPattern.put("!ddg", "https://duckduckgo.com/?q=%s");
-        bangsToPattern.put("!w", "https://search.marginalia.nu/search?query=%s+site:en.wikipedia.org&profile=wiki");
+        bangsToPattern.put("!w", "https://old-search.marginalia.nu/search?query=%s+site:en.wikipedia.org&profile=wiki");
     }
 
     @Override
@@ -20,7 +20,7 @@ public class BangCommand implements SearchCommandInterface {
     {
         bangsToPattern.put("!g", "https://www.google.com/search?q=%s");
         bangsToPattern.put("!ddg", "https://duckduckgo.com/?q=%s");
-        bangsToPattern.put("!w", "https://search.marginalia.nu/search?query=%s+site:en.wikipedia.org&profile=wiki");
+        bangsToPattern.put("!w", "/search?query=%s+site:en.wikipedia.org");
     }
 
     @Override
@@ -34,7 +34,7 @@ public class BangCommand implements SearchCommandInterface {
 
         if (match.isPresent()) {
             var url = String.format(redirectPattern, URLEncoder.encode(match.get(), StandardCharsets.UTF_8));
-            new MapModelAndView("redirect.jte", Map.of("url", url));
+            return Optional.of(new MapModelAndView("redirect.jte", Map.of("url", url)));
         }
     }
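The first two hunks appear to come from different commits in the range, successively repointing the !w bang (first at old-search.marginalia.nu, then at a relative /search URL). The third hunk fixes what looks like a genuine bug: the redirect MapModelAndView was constructed and then discarded, so bang queries never actually produced a redirect; it is now wrapped in Optional.of and returned, and the new test below exercises exactly this.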
BangCommandTest.java (new file)

@@ -0,0 +1,19 @@
+package nu.marginalia.search.command.commands;
+
+import nu.marginalia.WebsiteUrl;
+import nu.marginalia.search.command.SearchParameters;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+class BangCommandTest {
+
+    @Test
+    void testWikipediaRedirect() {
+        BangCommand bc = new BangCommand();
+
+        assertTrue(bc.process(SearchParameters.defaultsForQuery(new WebsiteUrl("test"), "!w plato", 1)).isPresent());
+        assertFalse(bc.process(SearchParameters.defaultsForQuery(new WebsiteUrl("test"), "plato", 1)).isPresent());
+    }
+}