mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
108 Commits
deploy-001
...
deploy-004
Author | SHA1 | Date | |
---|---|---|---|
|
03ba53ce51 | ||
|
d4a6684931 | ||
|
6f0485287a | ||
|
59e2dd4c26 | ||
|
ca1807caae | ||
|
26c20e18ac | ||
|
7c90b6b414 | ||
|
b63c54c4ce | ||
|
fecd2f4ec3 | ||
|
39e420de88 | ||
|
dc83619861 | ||
|
87d1c89701 | ||
|
a42a7769e2 | ||
|
202bda884f | ||
|
2315fdc731 | ||
|
b5469bd8a1 | ||
|
6a6318d04c | ||
|
55933f8d40 | ||
|
be6382e0d0 | ||
|
45e771f96b | ||
|
8dde502cc9 | ||
|
3e66767af3 | ||
|
9ec9d1b338 | ||
|
dcad0d7863 | ||
|
94e1aa0baf | ||
|
b62f043910 | ||
|
6ea22d0d21 | ||
|
8c69dc31b8 | ||
|
00734ea87f | ||
|
3009713db4 | ||
|
9b2ceaf37c | ||
|
8019c2ce18 | ||
|
a9e312b8b1 | ||
|
4da3563d8a | ||
|
48d0a3089a | ||
|
594df64b20 | ||
|
06efb5abfc | ||
|
78eb1417a7 | ||
|
8c8f2ad5ee | ||
|
f71e79d10f | ||
|
1b27c5cf06 | ||
|
67edc8f90d | ||
|
5f576b7d0c | ||
|
8b05c788fd | ||
|
236f033bc9 | ||
|
510fc75121 | ||
|
0376f2e6e3 | ||
|
0b65164f60 | ||
|
9be477de33 | ||
|
84f55b84ff | ||
|
ab5c30ad51 | ||
|
0c839453c5 | ||
|
5e4c5d03ae | ||
|
a5b0a1ae62 | ||
|
e9f71ee39b | ||
|
81cdd6385d | ||
|
e76c42329f | ||
|
e6ef4734ea | ||
|
df4bc1d7e9 | ||
|
2b222efa75 | ||
|
6d18e6d840 | ||
|
2a3c63f209 | ||
|
9f70cecaef | ||
|
c08203e2ed | ||
|
86497fd32f | ||
|
3b998573fd | ||
|
e161882ec7 | ||
|
357f349e30 | ||
|
e4769f541d | ||
|
2a173e2861 | ||
|
a6a900266c | ||
|
bdba53f055 | ||
|
bbdde789e7 | ||
|
eab61cd48a | ||
|
0ce2ba9ad9 | ||
|
3ddcebaa36 | ||
|
b91463383e | ||
|
7444a2f36c | ||
|
fdee07048d | ||
|
2fbf201761 | ||
|
4018e4c434 | ||
|
f3382b5bd8 | ||
|
9287ee0141 | ||
|
2769c8f869 | ||
|
ddb66f33ba | ||
|
79500b8fbc | ||
|
187eea43a4 | ||
|
a89ed6fa9f | ||
|
8d168be138 | ||
|
6e1aa7b391 | ||
|
deab9b9516 | ||
|
39d99a906a | ||
|
6f72e6e0d3 | ||
|
d786d79483 | ||
|
01510f6c2e | ||
|
7ba43e9e3f | ||
|
97bfcd1353 | ||
|
aa3c85c196 | ||
|
fb75a3827d | ||
|
7d546d0e2a | ||
|
8fcb6ffd7a | ||
|
f97de0c15a | ||
|
be9e192b78 | ||
|
75ae1c9526 | ||
|
33761a0236 | ||
|
19b69b1764 | ||
|
8b804359a9 | ||
|
f050bf5c4c |
1
.github/FUNDING.yml
vendored
1
.github/FUNDING.yml
vendored
@@ -1,5 +1,6 @@
|
||||
# These are supported funding model platforms
|
||||
|
||||
polar: marginalia-search
|
||||
github: MarginaliaSearch
|
||||
patreon: marginalia_nu
|
||||
open_collective: # Replace with a single Open Collective username
|
||||
|
1
.gitignore
vendored
1
.gitignore
vendored
@@ -7,3 +7,4 @@ build/
|
||||
lombok.config
|
||||
Dockerfile
|
||||
run
|
||||
jte-classes
|
@@ -48,6 +48,7 @@ ext {
|
||||
dockerImageTag='latest'
|
||||
dockerImageRegistry='marginalia'
|
||||
jibVersion = '3.4.3'
|
||||
|
||||
}
|
||||
|
||||
idea {
|
||||
|
@@ -8,17 +8,18 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Optional;
|
||||
import java.util.OptionalInt;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
|
||||
@Singleton
|
||||
public class DbDomainQueries {
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(DbDomainQueries.class);
|
||||
private final Cache<EdgeDomain, Integer> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||
|
||||
@Inject
|
||||
@@ -28,7 +29,7 @@ public class DbDomainQueries {
|
||||
}
|
||||
|
||||
|
||||
public Integer getDomainId(EdgeDomain domain) {
|
||||
public Integer getDomainId(EdgeDomain domain) throws NoSuchElementException {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
return domainIdCache.get(domain, () -> {
|
||||
@@ -42,6 +43,9 @@ public class DbDomainQueries {
|
||||
throw new NoSuchElementException();
|
||||
});
|
||||
}
|
||||
catch (UncheckedExecutionException ex) {
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
catch (ExecutionException ex) {
|
||||
throw new RuntimeException(ex.getCause());
|
||||
}
|
||||
@@ -98,4 +102,28 @@ public class DbDomainQueries {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
public List<EdgeDomain> otherSubdomains(EdgeDomain domain, int cnt) {
|
||||
List<EdgeDomain> ret = new ArrayList<>();
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE DOMAIN_TOP = ? LIMIT ?")) {
|
||||
stmt.setString(1, domain.topDomain);
|
||||
stmt.setInt(2, cnt);
|
||||
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
var sibling = new EdgeDomain(rs.getString(1));
|
||||
|
||||
if (sibling.equals(domain))
|
||||
continue;
|
||||
|
||||
ret.add(sibling);
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
logger.error("Failed to get domain neighbors");
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
@@ -42,6 +42,12 @@ dependencies {
|
||||
implementation libs.bundles.curator
|
||||
implementation libs.bundles.flyway
|
||||
|
||||
libs.bundles.jooby.get().each {
|
||||
implementation dependencies.create(it) {
|
||||
exclude group: 'org.slf4j'
|
||||
}
|
||||
}
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
implementation libs.bundles.mariadb
|
||||
|
||||
|
@@ -7,8 +7,6 @@ import nu.marginalia.service.discovery.property.PartitionTraits;
|
||||
import nu.marginalia.service.discovery.property.ServiceEndpoint;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
@@ -24,7 +22,7 @@ import java.util.function.Function;
|
||||
public class GrpcMultiNodeChannelPool<STUB> {
|
||||
private final ConcurrentHashMap<Integer, GrpcSingleNodeChannelPool<STUB>> pools =
|
||||
new ConcurrentHashMap<>();
|
||||
private static final Logger logger = LoggerFactory.getLogger(GrpcMultiNodeChannelPool.class);
|
||||
|
||||
private final ServiceRegistryIf serviceRegistryIf;
|
||||
private final ServiceKey<? extends PartitionTraits.Multicast> serviceKey;
|
||||
private final Function<ServiceEndpoint.InstanceAddress, ManagedChannel> channelConstructor;
|
||||
|
@@ -10,6 +10,8 @@ import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.slf4j.Marker;
|
||||
import org.slf4j.MarkerFactory;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.*;
|
||||
@@ -26,13 +28,13 @@ import java.util.function.Function;
|
||||
public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
|
||||
private final Map<InstanceAddress, ConnectionHolder> channels = new ConcurrentHashMap<>();
|
||||
|
||||
private final Marker grpcMarker = MarkerFactory.getMarker("GRPC");
|
||||
private static final Logger logger = LoggerFactory.getLogger(GrpcSingleNodeChannelPool.class);
|
||||
|
||||
private final ServiceRegistryIf serviceRegistryIf;
|
||||
private final Function<InstanceAddress, ManagedChannel> channelConstructor;
|
||||
private final Function<ManagedChannel, STUB> stubConstructor;
|
||||
|
||||
|
||||
public GrpcSingleNodeChannelPool(ServiceRegistryIf serviceRegistryIf,
|
||||
ServiceKey<? extends PartitionTraits.Unicast> serviceKey,
|
||||
Function<InstanceAddress, ManagedChannel> channelConstructor,
|
||||
@@ -48,8 +50,6 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
|
||||
serviceRegistryIf.registerMonitor(this);
|
||||
|
||||
onChange();
|
||||
|
||||
awaitChannel(Duration.ofSeconds(5));
|
||||
}
|
||||
|
||||
|
||||
@@ -62,10 +62,10 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
|
||||
for (var route : Sets.symmetricDifference(oldRoutes, newRoutes)) {
|
||||
ConnectionHolder oldChannel;
|
||||
if (newRoutes.contains(route)) {
|
||||
logger.info("Adding route {}", route);
|
||||
logger.info(grpcMarker, "Adding route {} => {}", serviceKey, route);
|
||||
oldChannel = channels.put(route, new ConnectionHolder(route));
|
||||
} else {
|
||||
logger.info("Expelling route {}", route);
|
||||
logger.info(grpcMarker, "Expelling route {} => {}", serviceKey, route);
|
||||
oldChannel = channels.remove(route);
|
||||
}
|
||||
if (oldChannel != null) {
|
||||
@@ -103,7 +103,7 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
|
||||
}
|
||||
|
||||
try {
|
||||
logger.info("Creating channel for {}:{}", serviceKey, address);
|
||||
logger.info(grpcMarker, "Creating channel for {} => {}", serviceKey, address);
|
||||
value = channelConstructor.apply(address);
|
||||
if (channel.compareAndSet(null, value)) {
|
||||
return value;
|
||||
@@ -114,7 +114,7 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
|
||||
}
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Failed to get channel for " + address, e);
|
||||
logger.error(grpcMarker, "Failed to get channel for " + address, e);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
@@ -206,7 +206,7 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
|
||||
}
|
||||
|
||||
for (var e : exceptions) {
|
||||
logger.error("Failed to call service {}", serviceKey, e);
|
||||
logger.error(grpcMarker, "Failed to call service {}", serviceKey, e);
|
||||
}
|
||||
|
||||
throw new ServiceNotAvailableException(serviceKey);
|
||||
|
@@ -4,6 +4,11 @@ import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
|
||||
public class ServiceNotAvailableException extends RuntimeException {
|
||||
public ServiceNotAvailableException(ServiceKey<?> key) {
|
||||
super("Service " + key + " not available");
|
||||
super(key.toString());
|
||||
}
|
||||
|
||||
@Override
|
||||
public StackTraceElement[] getStackTrace() { // Suppress stack trace
|
||||
return new StackTraceElement[0];
|
||||
}
|
||||
}
|
||||
|
@@ -48,5 +48,10 @@ public record ServiceEndpoint(String host, int port) {
|
||||
public int port() {
|
||||
return endpoint.port();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return endpoint().host() + ":" + endpoint.port() + " [" + instance + "]";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -48,6 +48,19 @@ public sealed interface ServiceKey<P extends ServicePartition> {
|
||||
{
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
final String shortName;
|
||||
|
||||
int periodIndex = name.lastIndexOf('.');
|
||||
|
||||
if (periodIndex >= 0) shortName = name.substring(periodIndex+1);
|
||||
else shortName = name;
|
||||
|
||||
return "rest:" + shortName;
|
||||
}
|
||||
|
||||
}
|
||||
record Grpc<P extends ServicePartition>(String name, P partition) implements ServiceKey<P> {
|
||||
public String baseName() {
|
||||
@@ -64,6 +77,18 @@ public sealed interface ServiceKey<P extends ServicePartition> {
|
||||
{
|
||||
return new Grpc<>(name, partition);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
final String shortName;
|
||||
|
||||
int periodIndex = name.lastIndexOf('.');
|
||||
|
||||
if (periodIndex >= 0) shortName = name.substring(periodIndex+1);
|
||||
else shortName = name;
|
||||
|
||||
return "grpc:" + shortName + "[" + partition.identifier() + "]";
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -0,0 +1,178 @@
|
||||
package nu.marginalia.service.server;
|
||||
|
||||
import io.jooby.*;
|
||||
import io.prometheus.client.Counter;
|
||||
import nu.marginalia.mq.inbox.MqInboxIf;
|
||||
import nu.marginalia.service.client.ServiceNotAvailableException;
|
||||
import nu.marginalia.service.discovery.property.ServiceEndpoint;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import nu.marginalia.service.server.jte.JteModule;
|
||||
import nu.marginalia.service.server.mq.ServiceMqSubscription;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.slf4j.Marker;
|
||||
import org.slf4j.MarkerFactory;
|
||||
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.List;
|
||||
|
||||
public class JoobyService {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
// Marker for filtering out sensitive content from the persistent logs
|
||||
private final Marker httpMarker = MarkerFactory.getMarker("HTTP");
|
||||
|
||||
private final Initialization initialization;
|
||||
|
||||
private final static Counter request_counter = Counter.build("wmsa_request_counter", "Request Counter")
|
||||
.labelNames("service", "node")
|
||||
.register();
|
||||
private final static Counter request_counter_good = Counter.build("wmsa_request_counter_good", "Good Requests")
|
||||
.labelNames("service", "node")
|
||||
.register();
|
||||
private final static Counter request_counter_bad = Counter.build("wmsa_request_counter_bad", "Bad Requests")
|
||||
.labelNames("service", "node")
|
||||
.register();
|
||||
private final static Counter request_counter_err = Counter.build("wmsa_request_counter_err", "Error Requests")
|
||||
.labelNames("service", "node")
|
||||
.register();
|
||||
private final String serviceName;
|
||||
private static volatile boolean initialized = false;
|
||||
|
||||
protected final MqInboxIf messageQueueInbox;
|
||||
private final int node;
|
||||
private GrpcServer grpcServer;
|
||||
|
||||
private ServiceConfiguration config;
|
||||
private final List<MvcExtension> joobyServices;
|
||||
private final ServiceEndpoint restEndpoint;
|
||||
|
||||
public JoobyService(BaseServiceParams params,
|
||||
ServicePartition partition,
|
||||
List<DiscoverableService> grpcServices,
|
||||
List<MvcExtension> joobyServices
|
||||
) throws Exception {
|
||||
|
||||
this.joobyServices = joobyServices;
|
||||
this.initialization = params.initialization;
|
||||
config = params.configuration;
|
||||
node = config.node();
|
||||
|
||||
String inboxName = config.serviceName();
|
||||
logger.info("Inbox name: {}", inboxName);
|
||||
|
||||
var serviceRegistry = params.serviceRegistry;
|
||||
|
||||
restEndpoint = serviceRegistry.registerService(ServiceKey.forRest(config.serviceId(), config.node()),
|
||||
config.instanceUuid(), config.externalAddress());
|
||||
|
||||
var mqInboxFactory = params.messageQueueInboxFactory;
|
||||
messageQueueInbox = mqInboxFactory.createSynchronousInbox(inboxName, config.node(), config.instanceUuid());
|
||||
messageQueueInbox.subscribe(new ServiceMqSubscription(this));
|
||||
|
||||
serviceName = System.getProperty("service-name");
|
||||
|
||||
initialization.addCallback(params.heartbeat::start);
|
||||
initialization.addCallback(messageQueueInbox::start);
|
||||
initialization.addCallback(() -> params.eventLog.logEvent("SVC-INIT", serviceName + ":" + config.node()));
|
||||
initialization.addCallback(() -> serviceRegistry.announceInstance(config.instanceUuid()));
|
||||
|
||||
Thread.setDefaultUncaughtExceptionHandler((t, e) -> {
|
||||
if (e instanceof ServiceNotAvailableException) {
|
||||
// reduce log spam for this common case
|
||||
logger.error("Service not available: {}", e.getMessage());
|
||||
}
|
||||
else {
|
||||
logger.error("Uncaught exception", e);
|
||||
}
|
||||
request_counter_err.labels(serviceName, Integer.toString(node)).inc();
|
||||
});
|
||||
|
||||
if (!initialization.isReady() && ! initialized ) {
|
||||
initialized = true;
|
||||
grpcServer = new GrpcServer(config, serviceRegistry, partition, grpcServices);
|
||||
grpcServer.start();
|
||||
}
|
||||
}
|
||||
|
||||
public void startJooby(Jooby jooby) {
|
||||
|
||||
logger.info("{} Listening to {}:{} ({})", getClass().getSimpleName(),
|
||||
restEndpoint.host(),
|
||||
restEndpoint.port(),
|
||||
config.externalAddress());
|
||||
|
||||
// FIXME: This won't work outside of docker, may need to submit a PR to jooby to allow classpaths here
|
||||
jooby.install(new JteModule(Path.of("/app/resources/jte"), Path.of("/app/classes/jte-precompiled")));
|
||||
jooby.assets("/*", Paths.get("/app/resources/static"));
|
||||
|
||||
var options = new ServerOptions();
|
||||
options.setHost(config.bindAddress());
|
||||
options.setPort(restEndpoint.port());
|
||||
|
||||
// Enable gzip compression of response data, but set compression to the lowest level
|
||||
// since it doesn't really save much more space to dial it up. It's typically a
|
||||
// single digit percentage difference since HTML already compresses very well with level = 1.
|
||||
options.setCompressionLevel(1);
|
||||
|
||||
|
||||
jooby.setServerOptions(options);
|
||||
|
||||
jooby.get("/internal/ping", ctx -> "pong");
|
||||
jooby.get("/internal/started", this::isInitialized);
|
||||
jooby.get("/internal/ready", this::isReady);
|
||||
|
||||
for (var service : joobyServices) {
|
||||
jooby.mvc(service);
|
||||
}
|
||||
|
||||
jooby.before(this::auditRequestIn);
|
||||
jooby.after(this::auditRequestOut);
|
||||
}
|
||||
|
||||
private Object isInitialized(Context ctx) {
|
||||
if (initialization.isReady()) {
|
||||
return "ok";
|
||||
}
|
||||
else {
|
||||
ctx.setResponseCode(StatusCode.FAILED_DEPENDENCY_CODE);
|
||||
return "bad";
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isReady() {
|
||||
return true;
|
||||
}
|
||||
|
||||
private String isReady(Context ctx) {
|
||||
if (isReady()) {
|
||||
return "ok";
|
||||
}
|
||||
else {
|
||||
ctx.setResponseCode(StatusCode.FAILED_DEPENDENCY_CODE);
|
||||
return "bad";
|
||||
}
|
||||
}
|
||||
|
||||
private void auditRequestIn(Context ctx) {
|
||||
request_counter.labels(serviceName, Integer.toString(node)).inc();
|
||||
}
|
||||
|
||||
private void auditRequestOut(Context ctx, Object result, Throwable failure) {
|
||||
if (ctx.getResponseCode().value() < 400) {
|
||||
request_counter_good.labels(serviceName, Integer.toString(node)).inc();
|
||||
}
|
||||
else {
|
||||
request_counter_bad.labels(serviceName, Integer.toString(node)).inc();
|
||||
}
|
||||
|
||||
if (failure != null) {
|
||||
logger.error("Request failed " + ctx.getMethod() + " " + ctx.getRequestURL(), failure);
|
||||
request_counter_err.labels(serviceName, Integer.toString(node)).inc();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@@ -16,7 +16,7 @@ import spark.Spark;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class Service {
|
||||
public class SparkService {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
// Marker for filtering out sensitive content from the persistent logs
|
||||
@@ -43,10 +43,10 @@ public class Service {
|
||||
private final int node;
|
||||
private GrpcServer grpcServer;
|
||||
|
||||
public Service(BaseServiceParams params,
|
||||
Runnable configureStaticFiles,
|
||||
ServicePartition partition,
|
||||
List<DiscoverableService> grpcServices) throws Exception {
|
||||
public SparkService(BaseServiceParams params,
|
||||
Runnable configureStaticFiles,
|
||||
ServicePartition partition,
|
||||
List<DiscoverableService> grpcServices) throws Exception {
|
||||
|
||||
this.initialization = params.initialization;
|
||||
var config = params.configuration;
|
||||
@@ -126,18 +126,18 @@ public class Service {
|
||||
}
|
||||
}
|
||||
|
||||
public Service(BaseServiceParams params,
|
||||
ServicePartition partition,
|
||||
List<DiscoverableService> grpcServices) throws Exception {
|
||||
public SparkService(BaseServiceParams params,
|
||||
ServicePartition partition,
|
||||
List<DiscoverableService> grpcServices) throws Exception {
|
||||
this(params,
|
||||
Service::defaultSparkConfig,
|
||||
SparkService::defaultSparkConfig,
|
||||
partition,
|
||||
grpcServices);
|
||||
}
|
||||
|
||||
public Service(BaseServiceParams params) throws Exception {
|
||||
public SparkService(BaseServiceParams params) throws Exception {
|
||||
this(params,
|
||||
Service::defaultSparkConfig,
|
||||
SparkService::defaultSparkConfig,
|
||||
ServicePartition.any(),
|
||||
List.of());
|
||||
}
|
@@ -0,0 +1,61 @@
|
||||
package nu.marginalia.service.server.jte;
|
||||
|
||||
import edu.umd.cs.findbugs.annotations.NonNull;
|
||||
import edu.umd.cs.findbugs.annotations.Nullable;
|
||||
import gg.jte.ContentType;
|
||||
import gg.jte.TemplateEngine;
|
||||
import gg.jte.resolve.DirectoryCodeResolver;
|
||||
import io.jooby.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
// Temporary workaround for a bug
|
||||
// APL-2.0 https://github.com/jooby-project/jooby
|
||||
public class JteModule implements Extension {
|
||||
private Path sourceDirectory;
|
||||
private Path classDirectory;
|
||||
private TemplateEngine templateEngine;
|
||||
|
||||
public JteModule(@NonNull Path sourceDirectory, @NonNull Path classDirectory) {
|
||||
this.sourceDirectory = (Path)Objects.requireNonNull(sourceDirectory, "Source directory is required.");
|
||||
this.classDirectory = (Path)Objects.requireNonNull(classDirectory, "Class directory is required.");
|
||||
}
|
||||
|
||||
public JteModule(@NonNull Path sourceDirectory) {
|
||||
this.sourceDirectory = (Path)Objects.requireNonNull(sourceDirectory, "Source directory is required.");
|
||||
}
|
||||
|
||||
public JteModule(@NonNull TemplateEngine templateEngine) {
|
||||
this.templateEngine = (TemplateEngine)Objects.requireNonNull(templateEngine, "Template engine is required.");
|
||||
}
|
||||
|
||||
public void install(@NonNull Jooby application) {
|
||||
if (this.templateEngine == null) {
|
||||
this.templateEngine = create(application.getEnvironment(), this.sourceDirectory, this.classDirectory);
|
||||
}
|
||||
|
||||
ServiceRegistry services = application.getServices();
|
||||
services.put(TemplateEngine.class, this.templateEngine);
|
||||
application.encoder(MediaType.html, new JteTemplateEngine(this.templateEngine));
|
||||
}
|
||||
|
||||
public static TemplateEngine create(@NonNull Environment environment, @NonNull Path sourceDirectory, @Nullable Path classDirectory) {
|
||||
boolean dev = environment.isActive("dev", new String[]{"test"});
|
||||
if (dev) {
|
||||
Objects.requireNonNull(sourceDirectory, "Source directory is required.");
|
||||
Path requiredClassDirectory = (Path)Optional.ofNullable(classDirectory).orElseGet(() -> sourceDirectory.resolve("jte-classes"));
|
||||
TemplateEngine engine = TemplateEngine.create(new DirectoryCodeResolver(sourceDirectory), requiredClassDirectory, ContentType.Html, environment.getClassLoader());
|
||||
Optional<List<String>> var10000 = Optional.ofNullable(System.getProperty("jooby.run.classpath")).map((it) -> it.split(File.pathSeparator)).map(Stream::of).map(Stream::toList);
|
||||
Objects.requireNonNull(engine);
|
||||
var10000.ifPresent(engine::setClassPath);
|
||||
return engine;
|
||||
} else {
|
||||
return classDirectory == null ? TemplateEngine.createPrecompiled(ContentType.Html) : TemplateEngine.createPrecompiled(classDirectory, ContentType.Html);
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,48 @@
|
||||
package nu.marginalia.service.server.jte;
|
||||
|
||||
import edu.umd.cs.findbugs.annotations.NonNull;
|
||||
import gg.jte.TemplateEngine;
|
||||
import io.jooby.Context;
|
||||
import io.jooby.MapModelAndView;
|
||||
import io.jooby.ModelAndView;
|
||||
import io.jooby.buffer.DataBuffer;
|
||||
import io.jooby.internal.jte.DataBufferOutput;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
||||
// Temporary workaround for a bug
|
||||
// APL-2.0 https://github.com/jooby-project/jooby
|
||||
class JteTemplateEngine implements io.jooby.TemplateEngine {
|
||||
private final TemplateEngine jte;
|
||||
private final List<String> extensions;
|
||||
|
||||
public JteTemplateEngine(TemplateEngine jte) {
|
||||
this.jte = jte;
|
||||
this.extensions = List.of(".jte", ".kte");
|
||||
}
|
||||
|
||||
|
||||
@NonNull @Override
|
||||
public List<String> extensions() {
|
||||
return extensions;
|
||||
}
|
||||
|
||||
@Override
|
||||
public DataBuffer render(Context ctx, ModelAndView modelAndView) {
|
||||
var buffer = ctx.getBufferFactory().allocateBuffer();
|
||||
var output = new DataBufferOutput(buffer, StandardCharsets.UTF_8);
|
||||
var attributes = ctx.getAttributes();
|
||||
if (modelAndView instanceof MapModelAndView mapModelAndView) {
|
||||
var mapModel = new HashMap<String, Object>();
|
||||
mapModel.putAll(attributes);
|
||||
mapModel.putAll(mapModelAndView.getModel());
|
||||
jte.render(modelAndView.getView(), mapModel, output);
|
||||
} else {
|
||||
jte.render(modelAndView.getView(), modelAndView.getModel(), output);
|
||||
}
|
||||
|
||||
return buffer;
|
||||
}
|
||||
}
|
@@ -3,7 +3,6 @@ package nu.marginalia.service.server.mq;
|
||||
import nu.marginalia.mq.MqMessage;
|
||||
import nu.marginalia.mq.inbox.MqInboxResponse;
|
||||
import nu.marginalia.mq.inbox.MqSubscription;
|
||||
import nu.marginalia.service.server.Service;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@@ -15,10 +14,10 @@ import java.util.Map;
|
||||
public class ServiceMqSubscription implements MqSubscription {
|
||||
private static final Logger logger = LoggerFactory.getLogger(ServiceMqSubscription.class);
|
||||
private final Map<String, Method> requests = new HashMap<>();
|
||||
private final Service service;
|
||||
private final Object service;
|
||||
|
||||
|
||||
public ServiceMqSubscription(Service service) {
|
||||
public ServiceMqSubscription(Object service) {
|
||||
this.service = service;
|
||||
|
||||
/* Wire up all methods annotated with @MqRequest and @MqNotification
|
||||
|
@@ -6,4 +6,8 @@ public record BrowseResultSet(Collection<BrowseResult> results, String focusDoma
|
||||
public BrowseResultSet(Collection<BrowseResult> results) {
|
||||
this(results, "");
|
||||
}
|
||||
|
||||
public boolean hasFocusDomain() {
|
||||
return focusDomain != null && !focusDomain.isBlank();
|
||||
}
|
||||
}
|
||||
|
@@ -38,6 +38,7 @@ public class DomainsProtobufCodec {
|
||||
sd.getIndexed(),
|
||||
sd.getActive(),
|
||||
sd.getScreenshot(),
|
||||
sd.getFeed(),
|
||||
SimilarDomain.LinkType.valueOf(sd.getLinkType().name())
|
||||
);
|
||||
}
|
||||
|
@@ -71,6 +71,23 @@ public class DomainInformation {
|
||||
return new String(Character.toChars(firstChar)) + new String(Character.toChars(secondChar));
|
||||
}
|
||||
|
||||
public String getAsnFlag() {
|
||||
if (asnCountry == null || asnCountry.codePointCount(0, asnCountry.length()) != 2) {
|
||||
return "";
|
||||
}
|
||||
String country = asnCountry;
|
||||
|
||||
if ("UK".equals(country)) {
|
||||
country = "GB";
|
||||
}
|
||||
|
||||
int offset = 0x1F1E6;
|
||||
int asciiOffset = 0x41;
|
||||
int firstChar = Character.codePointAt(country, 0) - asciiOffset + offset;
|
||||
int secondChar = Character.codePointAt(country, 1) - asciiOffset + offset;
|
||||
return new String(Character.toChars(firstChar)) + new String(Character.toChars(secondChar));
|
||||
}
|
||||
|
||||
public EdgeDomain getDomain() {
|
||||
return this.domain;
|
||||
}
|
||||
|
@@ -9,6 +9,7 @@ public record SimilarDomain(EdgeUrl url,
|
||||
boolean indexed,
|
||||
boolean active,
|
||||
boolean screenshot,
|
||||
boolean feed,
|
||||
LinkType linkType) {
|
||||
|
||||
public String getRankSymbols() {
|
||||
@@ -52,12 +53,12 @@ public record SimilarDomain(EdgeUrl url,
|
||||
return NONE;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
public String faIcon() {
|
||||
return switch (this) {
|
||||
case FOWARD -> "→";
|
||||
case BACKWARD -> "←";
|
||||
case BIDIRECTIONAL -> "⇆";
|
||||
case NONE -> "-";
|
||||
case FOWARD -> "fa-solid fa-arrow-right";
|
||||
case BACKWARD -> "fa-solid fa-arrow-left";
|
||||
case BIDIRECTIONAL -> "fa-solid fa-arrow-right-arrow-left";
|
||||
case NONE -> "";
|
||||
};
|
||||
}
|
||||
|
||||
|
@@ -101,6 +101,7 @@ message RpcSimilarDomain {
|
||||
bool active = 6;
|
||||
bool screenshot = 7;
|
||||
LINK_TYPE linkType = 8;
|
||||
bool feed = 9;
|
||||
|
||||
enum LINK_TYPE {
|
||||
BACKWARD = 0;
|
||||
|
@@ -9,6 +9,7 @@ import gnu.trove.map.hash.TIntIntHashMap;
|
||||
import gnu.trove.set.TIntSet;
|
||||
import gnu.trove.set.hash.TIntHashSet;
|
||||
import it.unimi.dsi.fastutil.ints.Int2DoubleArrayMap;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.api.domains.RpcSimilarDomain;
|
||||
import nu.marginalia.api.domains.model.SimilarDomain;
|
||||
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
|
||||
@@ -17,10 +18,14 @@ import org.roaringbitmap.RoaringBitmap;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.nio.file.Path;
|
||||
import java.sql.DriverManager;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.ScheduledExecutorService;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
@@ -32,12 +37,13 @@ public class SimilarDomainsService {
|
||||
private final HikariDataSource dataSource;
|
||||
private final AggregateLinkGraphClient linkGraphClient;
|
||||
|
||||
private volatile TIntIntHashMap domainIdToIdx = new TIntIntHashMap(100_000);
|
||||
private final TIntIntHashMap domainIdToIdx = new TIntIntHashMap(100_000);
|
||||
private volatile int[] domainIdxToId;
|
||||
|
||||
public volatile Int2DoubleArrayMap[] relatedDomains;
|
||||
public volatile TIntList[] domainNeighbors = null;
|
||||
public volatile RoaringBitmap screenshotDomains = null;
|
||||
public volatile RoaringBitmap feedDomains = null;
|
||||
public volatile RoaringBitmap activeDomains = null;
|
||||
public volatile RoaringBitmap indexedDomains = null;
|
||||
public volatile TIntDoubleHashMap domainRanks = null;
|
||||
@@ -82,6 +88,7 @@ public class SimilarDomainsService {
|
||||
domainNames = new String[domainIdToIdx.size()];
|
||||
domainNeighbors = new TIntList[domainIdToIdx.size()];
|
||||
screenshotDomains = new RoaringBitmap();
|
||||
feedDomains = new RoaringBitmap();
|
||||
activeDomains = new RoaringBitmap();
|
||||
indexedDomains = new RoaringBitmap();
|
||||
relatedDomains = new Int2DoubleArrayMap[domainIdToIdx.size()];
|
||||
@@ -145,10 +152,12 @@ public class SimilarDomainsService {
|
||||
activeDomains.add(idx);
|
||||
}
|
||||
|
||||
updateScreenshotInfo();
|
||||
|
||||
logger.info("Loaded {} domains", domainRanks.size());
|
||||
isReady = true;
|
||||
|
||||
// We can defer these as they only populate a roaringbitmap, and will degrade gracefully when not complete
|
||||
updateScreenshotInfo();
|
||||
updateFeedInfo();
|
||||
}
|
||||
}
|
||||
catch (SQLException throwables) {
|
||||
@@ -156,6 +165,42 @@ public class SimilarDomainsService {
|
||||
}
|
||||
}
|
||||
|
||||
private void updateFeedInfo() {
|
||||
Set<String> feedsDomainNames = new HashSet<>(500_000);
|
||||
Path readerDbPath = WmsaHome.getDataPath().resolve("rss-feeds.db").toAbsolutePath();
|
||||
String dbUrl = "jdbc:sqlite:" + readerDbPath;
|
||||
|
||||
logger.info("Opening feed db at " + dbUrl);
|
||||
|
||||
try (var conn = DriverManager.getConnection(dbUrl);
|
||||
var stmt = conn.createStatement()) {
|
||||
var rs = stmt.executeQuery("""
|
||||
select
|
||||
json_extract(feed, '$.domain') as domain
|
||||
from feed
|
||||
where json_array_length(feed, '$.items') > 0
|
||||
""");
|
||||
while (rs.next()) {
|
||||
feedsDomainNames.add(rs.getString(1));
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Failed to read RSS feed items", ex);
|
||||
}
|
||||
|
||||
for (int idx = 0; idx < domainNames.length; idx++) {
|
||||
String name = domainNames[idx];
|
||||
if (name == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (feedsDomainNames.contains(name)) {
|
||||
feedDomains.add(idx);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void updateScreenshotInfo() {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.createStatement()) {
|
||||
@@ -254,6 +299,7 @@ public class SimilarDomainsService {
|
||||
.setIndexed(indexedDomains.contains(idx))
|
||||
.setActive(activeDomains.contains(idx))
|
||||
.setScreenshot(screenshotDomains.contains(idx))
|
||||
.setFeed(feedDomains.contains(idx))
|
||||
.setLinkType(RpcSimilarDomain.LINK_TYPE.valueOf(linkType.name()))
|
||||
.build());
|
||||
|
||||
@@ -369,6 +415,7 @@ public class SimilarDomainsService {
|
||||
.setIndexed(indexedDomains.contains(idx))
|
||||
.setActive(activeDomains.contains(idx))
|
||||
.setScreenshot(screenshotDomains.contains(idx))
|
||||
.setFeed(feedDomains.contains(idx))
|
||||
.setLinkType(RpcSimilarDomain.LINK_TYPE.valueOf(linkType.name()))
|
||||
.build());
|
||||
|
||||
|
@@ -5,6 +5,7 @@ import com.google.inject.Singleton;
|
||||
import nu.marginalia.api.livecapture.LiveCaptureApiGrpc.LiveCaptureApiBlockingStub;
|
||||
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||
import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
|
||||
import nu.marginalia.service.client.ServiceNotAvailableException;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import org.slf4j.Logger;
|
||||
@@ -29,6 +30,9 @@ public class LiveCaptureClient {
|
||||
channelPool.call(LiveCaptureApiBlockingStub::requestScreengrab)
|
||||
.run(RpcDomainId.newBuilder().setDomainId(domainId).build());
|
||||
}
|
||||
catch (ServiceNotAvailableException e) {
|
||||
logger.info("requestScreengrab() failed since the service is not available");
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("API Exception", e);
|
||||
}
|
||||
|
@@ -27,7 +27,7 @@ dependencies {
|
||||
implementation project(':code:processes:crawling-process:ft-content-type')
|
||||
|
||||
implementation libs.jsoup
|
||||
implementation libs.rssreader
|
||||
implementation project(':third-party:rssreader')
|
||||
implementation libs.opencsv
|
||||
implementation libs.sqlite
|
||||
implementation libs.bundles.slf4j
|
||||
|
27
code/functions/live-capture/test-resources/nlnet.atom
Normal file
27
code/functions/live-capture/test-resources/nlnet.atom
Normal file
@@ -0,0 +1,27 @@
|
||||
<feed xmlns="http://www.w3.org/2005/Atom" xml:base="https://nlnet.nl">
|
||||
<title type="text">NLnet news</title>
|
||||
<updated>2025-01-01T00:00:00Z</updated>
|
||||
<id>https://nlnet.nl/feed.atom</id>
|
||||
<link rel="self" type="application/atom+xml" href="https://nlnet.nl/feed.atom"/>
|
||||
<entry>
|
||||
<id>https://nlnet.nl/news/2025/20250101-announcing-grantees-June-call.html</id>
|
||||
<author>
|
||||
<name>NLnet</name>
|
||||
</author>
|
||||
<title type="xhtml">
|
||||
<div xmlns="http://www.w3.org/1999/xhtml">50 Free and Open Source Projects Selected for NGI Zero grants</div>
|
||||
</title>
|
||||
<link href="/news/2025/20250101-announcing-grantees-June-call.html"/>
|
||||
<updated>2025-01-01T00:00:00Z</updated>
|
||||
<content type="xhtml">
|
||||
<div xmlns="http://www.w3.org/1999/xhtml">
|
||||
<p class="paralead">Happy 2025 everyone! On this first day of the fresh new year we are happy to announce 50 project teams were selected to receive NGI Zero grants. We are welcoming projects from 18 countries involving people and organisations of various types: individuals, associations, small and medium enterprises, foundations, universities, and informal collectives. The new projects are all across the different layers of the NGI technology stack: from trustworthy open hardware to services &amp; applications which provide autonomy for end-users.</p>
|
||||
<p>The 50 free and open source projects were selected across two funds. 19 teams will receive grants from the <a href="/commonsfund/">NGI Zero Commons Fund</a>, a broadly themed fund that supports people working on reclaiming the public nature of the internet. The other 31 projects will work within <a href="/core/">NGI Zero Core</a> which focuses on strengthening the open internet architecture. Both funds offer financial and practical support. The latter consisting of <a href="/NGI0/services/">support services</a> such as accessibility and security audits, advice on license compliance, help with testing, documentation or UX design.</p>
|
||||
<h2>If you applied for a grant</h2>
|
||||
<p>This is the selection for the <a href="https://nlnet.nl/news/2024/20240401-call.html">June call</a>. We always inform <em>all</em> applicants about the outcome of the review ahead of the public announcement, if the are selected or not. If you have not heard anything, you probably applied to a later call that is still under review. You can see which call you applied to by checking the application number assigned to the project when you applied. The second number in the sequence refers to the month of the call, so 06 in the case of the June call. (It should not happen, but if you did apply to the June call and did not hear anything, do contact us.)</p>
|
||||
<h2>Meet the new projects!</h2>
|
||||
</div>
|
||||
</content>
|
||||
</entry>
|
||||
|
||||
</feed>
|
@@ -1,8 +1,13 @@
|
||||
package nu.marginalia.rss.svc;
|
||||
|
||||
import com.apptasticsoftware.rssreader.Item;
|
||||
import com.apptasticsoftware.rssreader.RssReader;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
public class TestXmlSanitization {
|
||||
|
||||
@Test
|
||||
@@ -10,10 +15,24 @@ public class TestXmlSanitization {
|
||||
Assertions.assertEquals("&", FeedFetcherService.sanitizeEntities("&"));
|
||||
Assertions.assertEquals("<", FeedFetcherService.sanitizeEntities("<"));
|
||||
Assertions.assertEquals(">", FeedFetcherService.sanitizeEntities(">"));
|
||||
Assertions.assertEquals(""", FeedFetcherService.sanitizeEntities("""));
|
||||
Assertions.assertEquals("'", FeedFetcherService.sanitizeEntities("'"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNlnetTitleTag() {
|
||||
// The NLnet atom feed puts HTML tags in the entry/title tags, which breaks the vanilla RssReader code
|
||||
|
||||
// Verify we're able to consume and strip out the HTML tags
|
||||
RssReader r = new RssReader();
|
||||
|
||||
List<Item> items = r.read(ClassLoader.getSystemResourceAsStream("nlnet.atom")).toList();
|
||||
|
||||
Assertions.assertEquals(1, items.size());
|
||||
for (var item : items) {
|
||||
Assertions.assertEquals(Optional.of("50 Free and Open Source Projects Selected for NGI Zero grants"), item.getTitle());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testStrayAmpersand() {
|
||||
Assertions.assertEquals("Bed & Breakfast", FeedFetcherService.sanitizeEntities("Bed & Breakfast"));
|
||||
@@ -23,4 +42,9 @@ public class TestXmlSanitization {
|
||||
public void testTranslatedHtmlEntity() {
|
||||
Assertions.assertEquals("Foo -- Bar", FeedFetcherService.sanitizeEntities("Foo — Bar"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTranslatedHtmlEntityQuot() {
|
||||
Assertions.assertEquals("\"Bob\"", FeedFetcherService.sanitizeEntities(""Bob""));
|
||||
}
|
||||
}
|
||||
|
@@ -7,4 +7,8 @@ public record DictionaryResponse(String word, List<DictionaryEntry> entries) {
|
||||
this.word = word;
|
||||
this.entries = entries.stream().toList(); // Make an immutable copy
|
||||
}
|
||||
|
||||
public boolean hasEntries() {
|
||||
return !entries.isEmpty();
|
||||
}
|
||||
}
|
||||
|
@@ -9,10 +9,9 @@ import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||
import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.annotation.CheckReturnValue;
|
||||
import java.time.Duration;
|
||||
|
||||
@Singleton
|
||||
public class QueryClient {
|
||||
@@ -24,13 +23,14 @@ public class QueryClient {
|
||||
|
||||
private final GrpcSingleNodeChannelPool<QueryApiGrpc.QueryApiBlockingStub> queryApiPool;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@Inject
|
||||
public QueryClient(GrpcChannelPoolFactory channelPoolFactory) {
|
||||
public QueryClient(GrpcChannelPoolFactory channelPoolFactory) throws InterruptedException {
|
||||
this.queryApiPool = channelPoolFactory.createSingle(
|
||||
ServiceKey.forGrpcApi(QueryApiGrpc.class, ServicePartition.any()),
|
||||
QueryApiGrpc::newBlockingStub);
|
||||
|
||||
// Hold up initialization until we have a downstream connection
|
||||
this.queryApiPool.awaitChannel(Duration.ofSeconds(5));
|
||||
}
|
||||
|
||||
@CheckReturnValue
|
||||
|
@@ -71,6 +71,17 @@ public class QueryFactory {
|
||||
|
||||
String[] parts = StringUtils.split(str, '_');
|
||||
|
||||
// Trim down tokens to match the behavior of the tokenizer used in indexing
|
||||
for (int i = 0; i < parts.length; i++) {
|
||||
String part = parts[i];
|
||||
|
||||
if (part.endsWith("'s") && part.length() > 2) {
|
||||
part = part.substring(0, part.length()-2);
|
||||
}
|
||||
|
||||
parts[i] = part;
|
||||
}
|
||||
|
||||
if (parts.length > 1) {
|
||||
// Require that the terms appear in sequence
|
||||
queryBuilder.phraseConstraint(SearchPhraseConstraint.mandatory(parts));
|
||||
|
@@ -155,16 +155,25 @@ public class QueryParser {
|
||||
|
||||
// Remove trailing punctuation
|
||||
int lastChar = str.charAt(str.length() - 1);
|
||||
if (":.,!?$'".indexOf(lastChar) >= 0)
|
||||
entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 1), lt.displayStr()));
|
||||
if (":.,!?$'".indexOf(lastChar) >= 0) {
|
||||
str = str.substring(0, str.length() - 1);
|
||||
entity.replace(new QueryToken.LiteralTerm(str, lt.displayStr()));
|
||||
}
|
||||
|
||||
// Remove term elements that aren't indexed by the search engine
|
||||
if (str.endsWith("'s"))
|
||||
entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 2), lt.displayStr()));
|
||||
if (str.endsWith("()"))
|
||||
entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 2), lt.displayStr()));
|
||||
if (str.startsWith("$"))
|
||||
entity.replace(new QueryToken.LiteralTerm(str.substring(1), lt.displayStr()));
|
||||
if (str.endsWith("'s")) {
|
||||
str = str.substring(0, str.length() - 2);
|
||||
entity.replace(new QueryToken.LiteralTerm(str, lt.displayStr()));
|
||||
}
|
||||
if (str.endsWith("()")) {
|
||||
str = str.substring(0, str.length() - 2);
|
||||
entity.replace(new QueryToken.LiteralTerm(str, lt.displayStr()));
|
||||
}
|
||||
|
||||
while (str.startsWith("$") || str.startsWith("_")) {
|
||||
str = str.substring(1);
|
||||
entity.replace(new QueryToken.LiteralTerm(str, lt.displayStr()));
|
||||
}
|
||||
|
||||
if (entity.isBlank()) {
|
||||
entity.remove();
|
||||
@@ -224,9 +233,19 @@ public class QueryParser {
|
||||
entity.replace(new QueryToken.RankTerm(limit, str));
|
||||
} else if (str.startsWith("qs=")) {
|
||||
entity.replace(new QueryToken.QsTerm(str.substring(3)));
|
||||
} else if (str.contains(":")) {
|
||||
} else if (str.startsWith("site:")
|
||||
|| str.startsWith("format:")
|
||||
|| str.startsWith("file:")
|
||||
|| str.startsWith("tld:")
|
||||
|| str.startsWith("ip:")
|
||||
|| str.startsWith("as:")
|
||||
|| str.startsWith("asn:")
|
||||
|| str.startsWith("generator:")
|
||||
)
|
||||
{
|
||||
entity.replace(new QueryToken.AdviceTerm(str, t.displayStr()));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static SpecificationLimit parseSpecificationLimit(String str) {
|
||||
|
@@ -0,0 +1,32 @@
|
||||
package nu.marginalia.functions.searchquery.query_parser;
|
||||
|
||||
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
class QueryParserTest {
|
||||
|
||||
@Test
|
||||
// https://github.com/MarginaliaSearch/MarginaliaSearch/issues/140
|
||||
void parse__builtin_ffs() {
|
||||
QueryParser parser = new QueryParser();
|
||||
var tokens = parser.parse("__builtin_ffs");
|
||||
Assertions.assertEquals(List.of(new QueryToken.LiteralTerm("builtin_ffs", "__builtin_ffs")), tokens);
|
||||
}
|
||||
|
||||
@Test
|
||||
void trailingParens() {
|
||||
QueryParser parser = new QueryParser();
|
||||
var tokens = parser.parse("strcpy()");
|
||||
Assertions.assertEquals(List.of(new QueryToken.LiteralTerm("strcpy", "strcpy()")), tokens);
|
||||
}
|
||||
|
||||
@Test
|
||||
void trailingQuote() {
|
||||
QueryParser parser = new QueryParser();
|
||||
var tokens = parser.parse("bob's");
|
||||
Assertions.assertEquals(List.of(new QueryToken.LiteralTerm("bob", "bob's")), tokens);
|
||||
}
|
||||
}
|
@@ -208,6 +208,23 @@ public class QueryFactoryTest {
|
||||
System.out.println(subquery);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCplusPlus() {
|
||||
var subquery = parseAndGetSpecs("std::vector::push_back vector");
|
||||
System.out.println(subquery);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testQuotedApostrophe() {
|
||||
var subquery = parseAndGetSpecs("\"bob's cars\"");
|
||||
|
||||
System.out.println(subquery);
|
||||
|
||||
Assertions.assertTrue(subquery.query.compiledQuery.contains(" bob "));
|
||||
Assertions.assertFalse(subquery.query.compiledQuery.contains(" bob's "));
|
||||
Assertions.assertEquals("\"bob's cars\"", subquery.humanQuery);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExpansion9() {
|
||||
var subquery = parseAndGetSpecs("pie recipe");
|
||||
|
@@ -27,7 +27,7 @@ public class SentenceSegmentSplitter {
|
||||
else {
|
||||
// If we flatten unicode, we do this...
|
||||
// FIXME: This can almost definitely be cleaned up and simplified.
|
||||
wordBreakPattern = Pattern.compile("([^/_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))");
|
||||
wordBreakPattern = Pattern.compile("([^/<>$:_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -90,12 +90,17 @@ public class SentenceSegmentSplitter {
|
||||
for (int i = 0; i < ret.size(); i++) {
|
||||
String part = ret.get(i);
|
||||
|
||||
if (part.startsWith("<") && part.endsWith(">") && part.length() > 2) {
|
||||
ret.set(i, part.substring(1, part.length() - 1));
|
||||
}
|
||||
|
||||
if (part.startsWith("'") && part.length() > 1) {
|
||||
ret.set(i, part.substring(1));
|
||||
}
|
||||
if (part.endsWith("'") && part.length() > 1) {
|
||||
ret.set(i, part.substring(0, part.length()-1));
|
||||
}
|
||||
|
||||
while (part.endsWith(".")) {
|
||||
part = part.substring(0, part.length()-1);
|
||||
ret.set(i, part);
|
||||
|
@@ -28,6 +28,20 @@ class SentenceExtractorTest {
|
||||
System.out.println(dld);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testCplusplus() {
|
||||
var dld = sentenceExtractor.extractSentence("std::vector", EnumSet.noneOf(HtmlTag.class));
|
||||
assertEquals(1, dld.length());
|
||||
assertEquals("std::vector", dld.wordsLowerCase[0]);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testPHP() {
|
||||
var dld = sentenceExtractor.extractSentence("$_GET", EnumSet.noneOf(HtmlTag.class));
|
||||
assertEquals(1, dld.length());
|
||||
assertEquals("$_get", dld.wordsLowerCase[0]);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testPolishArtist() {
|
||||
var dld = sentenceExtractor.extractSentence("Uklański", EnumSet.noneOf(HtmlTag.class));
|
||||
|
@@ -152,7 +152,10 @@ public class DocumentPositionMapper {
|
||||
}
|
||||
|
||||
boolean matchesWordPattern(String s) {
|
||||
// this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4}
|
||||
if (s.length() > 48)
|
||||
return false;
|
||||
|
||||
// this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,8}
|
||||
|
||||
String wordPartSeparator = ".-_/:+*";
|
||||
|
||||
@@ -169,7 +172,7 @@ public class DocumentPositionMapper {
|
||||
if (i == 0)
|
||||
return false;
|
||||
|
||||
for (int j = 0; j < 5; j++) {
|
||||
for (int j = 0; j < 8; j++) {
|
||||
if (i == s.length()) return true;
|
||||
|
||||
if (wordPartSeparator.indexOf(s.charAt(i)) < 0) {
|
||||
|
@@ -30,9 +30,11 @@ class DocumentPositionMapperTest {
|
||||
Assertions.assertFalse(positionMapper.matchesWordPattern("1234567890abcdef"));
|
||||
|
||||
Assertions.assertTrue(positionMapper.matchesWordPattern("test-test-test-test-test"));
|
||||
Assertions.assertFalse(positionMapper.matchesWordPattern("test-test-test-test-test-test"));
|
||||
Assertions.assertFalse(positionMapper.matchesWordPattern("test-test-test-test-test-test-test-test-test"));
|
||||
Assertions.assertTrue(positionMapper.matchesWordPattern("192.168.1.100/24"));
|
||||
Assertions.assertTrue(positionMapper.matchesWordPattern("std::vector"));
|
||||
Assertions.assertTrue(positionMapper.matchesWordPattern("std::vector::push_back"));
|
||||
|
||||
Assertions.assertTrue(positionMapper.matchesWordPattern("c++"));
|
||||
Assertions.assertTrue(positionMapper.matchesWordPattern("m*a*s*h"));
|
||||
Assertions.assertFalse(positionMapper.matchesWordPattern("Stulpnagelstrasse"));
|
||||
|
@@ -0,0 +1,113 @@
|
||||
package nu.marginalia.converting.processor.plugin.specialization;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.converting.processor.logic.TitleExtractor;
|
||||
import nu.marginalia.converting.processor.summary.SummaryExtractor;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.logging.log4j.util.Strings;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
@Singleton
|
||||
public class CppreferenceSpecialization extends WikiSpecialization {
|
||||
|
||||
@Inject
|
||||
public CppreferenceSpecialization(SummaryExtractor summaryExtractor, TitleExtractor titleExtractor) {
|
||||
super(summaryExtractor, titleExtractor);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Document prune(Document original) {
|
||||
var doc = original.clone();
|
||||
|
||||
doc.getElementsByClass("t-nv").remove();
|
||||
doc.getElementsByClass("toc").remove();
|
||||
doc.getElementsByClass("mw-head").remove();
|
||||
doc.getElementsByClass("printfooter").remove();
|
||||
doc.getElementsByClass("cpp-footer-base").remove();
|
||||
|
||||
doc.title(doc.title() + " " + Strings.join(extractExtraTokens(doc.title()), ' '));
|
||||
|
||||
return doc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getSummary(Document doc, Set<String> importantWords) {
|
||||
|
||||
Element declTable = doc.getElementsByClass("t-dcl-begin").first();
|
||||
if (declTable != null) {
|
||||
var nextPar = declTable.nextElementSibling();
|
||||
if (nextPar != null) {
|
||||
return nextPar.text();
|
||||
}
|
||||
}
|
||||
|
||||
return super.getSummary(doc, importantWords);
|
||||
}
|
||||
|
||||
|
||||
public List<String> extractExtraTokens(String title) {
|
||||
|
||||
if (!title.contains("::")) {
|
||||
return List.of();
|
||||
}
|
||||
if (!title.contains("-")) {
|
||||
return List.of();
|
||||
}
|
||||
|
||||
title = StringUtils.split(title, '-')[0];
|
||||
|
||||
String name = title;
|
||||
for (;;) {
|
||||
int lbidx = name.indexOf('<');
|
||||
int rbidx = name.indexOf('>');
|
||||
|
||||
if (lbidx > 0 && rbidx > lbidx) {
|
||||
String className = name.substring(0, lbidx);
|
||||
String methodName = name.substring(rbidx + 1);
|
||||
name = className + methodName;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
List<String> tokens = new ArrayList<>();
|
||||
|
||||
for (var part : name.split("\\s*,\\s*")) {
|
||||
if (part.endsWith(")") && !part.endsWith("()")) {
|
||||
int parenStart = part.indexOf('(');
|
||||
if (parenStart > 0) { // foo(...) -> foo
|
||||
part = part.substring(0, parenStart);
|
||||
}
|
||||
else if (parenStart == 0) { // (foo) -> foo
|
||||
part = part.substring(1, part.length() - 1);
|
||||
}
|
||||
}
|
||||
|
||||
part = part.trim();
|
||||
if (part.contains("::")) {
|
||||
tokens.add(part);
|
||||
if (part.startsWith("std::")) {
|
||||
tokens.add(part.substring(5));
|
||||
|
||||
int ss = part.indexOf("::", 5);
|
||||
if (ss > 0) {
|
||||
tokens.add(part.substring(0, ss));
|
||||
tokens.add(part.substring(ss+2));
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tokens;
|
||||
}
|
||||
|
||||
|
||||
}
|
@@ -24,6 +24,7 @@ public class HtmlProcessorSpecializations {
|
||||
private final WikiSpecialization wikiSpecialization;
|
||||
private final BlogSpecialization blogSpecialization;
|
||||
private final GogStoreSpecialization gogStoreSpecialization;
|
||||
private final CppreferenceSpecialization cppreferenceSpecialization;
|
||||
private final DefaultSpecialization defaultSpecialization;
|
||||
|
||||
@Inject
|
||||
@@ -37,6 +38,7 @@ public class HtmlProcessorSpecializations {
|
||||
WikiSpecialization wikiSpecialization,
|
||||
BlogSpecialization blogSpecialization,
|
||||
GogStoreSpecialization gogStoreSpecialization,
|
||||
CppreferenceSpecialization cppreferenceSpecialization,
|
||||
DefaultSpecialization defaultSpecialization) {
|
||||
this.domainTypes = domainTypes;
|
||||
this.lemmySpecialization = lemmySpecialization;
|
||||
@@ -48,6 +50,7 @@ public class HtmlProcessorSpecializations {
|
||||
this.wikiSpecialization = wikiSpecialization;
|
||||
this.blogSpecialization = blogSpecialization;
|
||||
this.gogStoreSpecialization = gogStoreSpecialization;
|
||||
this.cppreferenceSpecialization = cppreferenceSpecialization;
|
||||
this.defaultSpecialization = defaultSpecialization;
|
||||
}
|
||||
|
||||
@@ -66,6 +69,10 @@ public class HtmlProcessorSpecializations {
|
||||
return mariadbKbSpecialization;
|
||||
}
|
||||
|
||||
if (url.domain.getTopDomain().equals("cppreference.com")) {
|
||||
return cppreferenceSpecialization;
|
||||
}
|
||||
|
||||
if (url.domain.toString().equals("store.steampowered.com")) {
|
||||
return steamStoreSpecialization;
|
||||
}
|
||||
@@ -86,6 +93,9 @@ public class HtmlProcessorSpecializations {
|
||||
if (generator.keywords().contains("javadoc")) {
|
||||
return javadocSpecialization;
|
||||
}
|
||||
|
||||
// Must be toward the end, as some specializations are for
|
||||
// wiki-generator content
|
||||
if (generator.type() == GeneratorType.WIKI) {
|
||||
return wikiSpecialization;
|
||||
}
|
||||
@@ -105,7 +115,7 @@ public class HtmlProcessorSpecializations {
|
||||
|
||||
boolean shouldIndex(EdgeUrl url);
|
||||
double lengthModifier();
|
||||
void amendWords(Document doc, DocumentKeywordsBuilder words);
|
||||
|
||||
default void amendWords(Document doc, DocumentKeywordsBuilder words) {}
|
||||
}
|
||||
}
|
||||
|
@@ -4,7 +4,6 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.converting.processor.logic.TitleExtractor;
|
||||
import nu.marginalia.converting.processor.summary.SummaryExtractor;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
@@ -93,6 +92,8 @@ public class WikiSpecialization extends DefaultSpecialization {
|
||||
return true;
|
||||
}
|
||||
|
||||
public void amendWords(Document doc, DocumentKeywordsBuilder words) {
|
||||
@Override
|
||||
public double lengthModifier() {
|
||||
return 2.5;
|
||||
}
|
||||
}
|
||||
|
@@ -0,0 +1,27 @@
|
||||
package nu.marginalia.converting.processor.plugin.specialization;
|
||||
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
class CppreferenceSpecializationTest {
|
||||
CppreferenceSpecialization specialization = new CppreferenceSpecialization(null, null);
|
||||
|
||||
@Test
|
||||
public void testTitleMagic() {
|
||||
|
||||
List<String> ret;
|
||||
|
||||
ret = specialization.extractExtraTokens("std::multimap<Key, T, Compare, Allocator>::crend - cppreference.com");
|
||||
Assertions.assertTrue(ret.contains("std::multimap::crend"));
|
||||
Assertions.assertTrue(ret.contains("multimap::crend"));
|
||||
Assertions.assertTrue(ret.contains("std::multimap"));
|
||||
Assertions.assertTrue(ret.contains("crend"));
|
||||
|
||||
ret = specialization.extractExtraTokens("std::coroutine_handle<Promise>::operator(), std::coroutine_handle<Promise>::resume - cppreference.com");
|
||||
Assertions.assertTrue(ret.contains("std::coroutine_handle::operator()"));
|
||||
Assertions.assertTrue(ret.contains("std::coroutine_handle::resume"));
|
||||
}
|
||||
|
||||
}
|
@@ -11,7 +11,7 @@ import nu.marginalia.api.svc.RateLimiterService;
|
||||
import nu.marginalia.api.svc.ResponseCache;
|
||||
import nu.marginalia.model.gson.GsonFactory;
|
||||
import nu.marginalia.service.server.BaseServiceParams;
|
||||
import nu.marginalia.service.server.Service;
|
||||
import nu.marginalia.service.server.SparkService;
|
||||
import nu.marginalia.service.server.mq.MqRequest;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@@ -21,7 +21,7 @@ import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.Spark;
|
||||
|
||||
public class ApiService extends Service {
|
||||
public class ApiService extends SparkService {
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final Gson gson = GsonFactory.get();
|
||||
@@ -69,7 +69,7 @@ public class ApiService extends Service {
|
||||
this.searchOperator = searchOperator;
|
||||
|
||||
Spark.get("/api/", (rq, rsp) -> {
|
||||
rsp.redirect("https://memex.marginalia.nu/projects/edge/api.gmi");
|
||||
rsp.redirect("https://about.marginalia-search.com/article/api/");
|
||||
return "";
|
||||
});
|
||||
|
||||
|
@@ -9,7 +9,7 @@ import nu.marginalia.renderer.MustacheRenderer;
|
||||
import nu.marginalia.renderer.RendererFactory;
|
||||
import nu.marginalia.screenshot.ScreenshotService;
|
||||
import nu.marginalia.service.server.BaseServiceParams;
|
||||
import nu.marginalia.service.server.Service;
|
||||
import nu.marginalia.service.server.SparkService;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
@@ -18,7 +18,7 @@ import spark.Spark;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
public class DatingService extends Service {
|
||||
public class DatingService extends SparkService {
|
||||
private final DomainBlacklist blacklist;
|
||||
private final DbBrowseDomainsSimilarCosine browseSimilarCosine;
|
||||
private final DbBrowseDomainsRandom browseRandom;
|
||||
|
@@ -5,7 +5,7 @@ import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.renderer.MustacheRenderer;
|
||||
import nu.marginalia.renderer.RendererFactory;
|
||||
import nu.marginalia.service.server.BaseServiceParams;
|
||||
import nu.marginalia.service.server.Service;
|
||||
import nu.marginalia.service.server.SparkService;
|
||||
import nu.marginalia.service.server.StaticResources;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import spark.Request;
|
||||
@@ -15,7 +15,7 @@ import spark.Spark;
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
|
||||
public class ExplorerService extends Service {
|
||||
public class ExplorerService extends SparkService {
|
||||
|
||||
private final MustacheRenderer<Object> renderer;
|
||||
private final HikariDataSource dataSource;
|
||||
|
94
code/services-application/search-service-legacy/build.gradle
Normal file
94
code/services-application/search-service-legacy/build.gradle
Normal file
@@ -0,0 +1,94 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
id 'io.freefair.sass-base' version '8.4'
|
||||
id 'io.freefair.sass-java' version '8.4'
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
|
||||
id 'com.google.cloud.tools.jib' version '3.4.3'
|
||||
}
|
||||
|
||||
application {
|
||||
mainClass = 'nu.marginalia.search.SearchMain'
|
||||
applicationName = 'search-service-legacy'
|
||||
}
|
||||
|
||||
tasks.distZip.enabled = false
|
||||
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||
}
|
||||
}
|
||||
sass {
|
||||
sourceMapEnabled = true
|
||||
sourceMapEmbed = true
|
||||
outputStyle = EXPANDED
|
||||
}
|
||||
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
apply from: "$rootProject.projectDir/docker.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation project(':code:common:db')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:index:query')
|
||||
|
||||
implementation project(':code:libraries:easy-lsh')
|
||||
implementation project(':code:libraries:language-processing')
|
||||
implementation project(':code:libraries:braille-block-punch-cards')
|
||||
implementation project(':code:libraries:term-frequency-dict')
|
||||
|
||||
implementation project(':code:functions:live-capture:api')
|
||||
implementation project(':code:functions:math:api')
|
||||
implementation project(':code:functions:domain-info:api')
|
||||
implementation project(':code:functions:search-query:api')
|
||||
|
||||
|
||||
implementation project(':code:index:api')
|
||||
implementation project(':code:common:renderer')
|
||||
|
||||
implementation project(':code:features-search:screenshots')
|
||||
implementation project(':code:features-search:random-websites')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
implementation libs.roaringbitmap
|
||||
implementation libs.prometheus
|
||||
implementation libs.notnull
|
||||
implementation libs.guava
|
||||
implementation dependencies.create(libs.guice.get()) {
|
||||
exclude group: 'com.google.guava'
|
||||
}
|
||||
implementation libs.handlebars
|
||||
implementation dependencies.create(libs.spark.get()) {
|
||||
exclude group: 'org.eclipse.jetty'
|
||||
}
|
||||
implementation libs.bundles.jetty
|
||||
implementation libs.opencsv
|
||||
implementation libs.trove
|
||||
implementation libs.fastutil
|
||||
implementation libs.bundles.gson
|
||||
implementation libs.bundles.mariadb
|
||||
implementation libs.bundles.nlp
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
|
||||
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
|
||||
testImplementation libs.commons.codec
|
||||
testImplementation 'org.testcontainers:mariadb:1.17.4'
|
||||
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
|
||||
testImplementation project(':code:libraries:test-helpers')
|
||||
}
|
||||
|
||||
// Registers a separate Test task, 'paperDoll', that runs only the JUnit tests
// tagged "paperdoll".
tasks.register('paperDoll', Test) {
|
||||
useJUnitPlatform {
|
||||
includeTags "paperdoll"
|
||||
}
|
||||
// Sets the runPaperDoll system property for the test JVM and turns on preview language features
jvmArgs = [ '-DrunPaperDoll=true', '--enable-preview' ]
|
||||
}
|
@@ -0,0 +1,47 @@
|
||||
package nu.marginalia.search;
|
||||
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Injector;
|
||||
import nu.marginalia.service.MainClass;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import nu.marginalia.service.module.ServiceDiscoveryModule;
|
||||
import nu.marginalia.service.ServiceId;
|
||||
import nu.marginalia.service.module.ServiceConfigurationModule;
|
||||
import nu.marginalia.service.module.DatabaseModule;
|
||||
import nu.marginalia.service.server.Initialization;
|
||||
import spark.Spark;
|
||||
|
||||
public class SearchMain extends MainClass {
|
||||
private final SearchService service;
|
||||
|
||||
@Inject
|
||||
public SearchMain(SearchService service) {
|
||||
this.service = service;
|
||||
}
|
||||
|
||||
public static void main(String... args) {
|
||||
|
||||
init(ServiceId.Search, args);
|
||||
|
||||
Spark.staticFileLocation("/static/search/");
|
||||
|
||||
Injector injector = Guice.createInjector(
|
||||
new SearchModule(),
|
||||
new ServiceConfigurationModule(ServiceId.Search),
|
||||
new ServiceDiscoveryModule(),
|
||||
new DatabaseModule(false)
|
||||
);
|
||||
|
||||
|
||||
// Orchestrate the boot order for the services
|
||||
var registry = injector.getInstance(ServiceRegistryIf.class);
|
||||
var configuration = injector.getInstance(ServiceConfiguration.class);
|
||||
orchestrateBoot(registry, configuration);
|
||||
|
||||
injector.getInstance(SearchMain.class);
|
||||
injector.getInstance(Initialization.class).setReady();
|
||||
|
||||
}
|
||||
}
|
@@ -0,0 +1,20 @@
|
||||
package nu.marginalia.search;
|
||||
|
||||
import com.google.inject.AbstractModule;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.WebsiteUrl;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.renderer.config.HandlebarsConfigurator;
|
||||
|
||||
public class SearchModule extends AbstractModule {
|
||||
|
||||
public void configure() {
|
||||
bind(HandlebarsConfigurator.class).to(SearchHandlebarsConfigurator.class);
|
||||
|
||||
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
|
||||
|
||||
bind(WebsiteUrl.class).toInstance(new WebsiteUrl(
|
||||
System.getProperty("search.legacyWebsiteUrl", "https://old-search.marginalia.nu/")));
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,266 @@
|
||||
package nu.marginalia.search;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.WebsiteUrl;
|
||||
import nu.marginalia.api.math.MathClient;
|
||||
import nu.marginalia.api.searchquery.QueryClient;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryResponse;
|
||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||
import nu.marginalia.bbpc.BrailleBlockPunchCards;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
import nu.marginalia.search.command.SearchParameters;
|
||||
import nu.marginalia.search.model.ClusteredUrlDetails;
|
||||
import nu.marginalia.search.model.DecoratedSearchResults;
|
||||
import nu.marginalia.search.model.SearchFilters;
|
||||
import nu.marginalia.search.model.UrlDetails;
|
||||
import nu.marginalia.search.results.UrlDeduplicator;
|
||||
import nu.marginalia.search.svc.SearchQueryCountService;
|
||||
import nu.marginalia.search.svc.SearchUnitConversionService;
|
||||
import org.apache.logging.log4j.util.Strings;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.slf4j.Marker;
|
||||
import org.slf4j.MarkerFactory;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.Future;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
@Singleton
|
||||
public class SearchOperator {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(SearchOperator.class);
|
||||
|
||||
// Marker for filtering out sensitive content from the persistent logs
|
||||
private final Marker queryMarker = MarkerFactory.getMarker("QUERY");
|
||||
|
||||
private final MathClient mathClient;
|
||||
private final DbDomainQueries domainQueries;
|
||||
private final QueryClient queryClient;
|
||||
private final SearchQueryParamFactory paramFactory;
|
||||
private final WebsiteUrl websiteUrl;
|
||||
private final SearchUnitConversionService searchUnitConversionService;
|
||||
private final SearchQueryCountService searchVisitorCount;
|
||||
|
||||
|
||||
@Inject
|
||||
public SearchOperator(MathClient mathClient,
|
||||
DbDomainQueries domainQueries,
|
||||
QueryClient queryClient,
|
||||
SearchQueryParamFactory paramFactory,
|
||||
WebsiteUrl websiteUrl,
|
||||
SearchUnitConversionService searchUnitConversionService,
|
||||
SearchQueryCountService searchVisitorCount
|
||||
)
|
||||
{
|
||||
|
||||
this.mathClient = mathClient;
|
||||
this.domainQueries = domainQueries;
|
||||
this.queryClient = queryClient;
|
||||
this.paramFactory = paramFactory;
|
||||
this.websiteUrl = websiteUrl;
|
||||
this.searchUnitConversionService = searchUnitConversionService;
|
||||
this.searchVisitorCount = searchVisitorCount;
|
||||
}
|
||||
|
||||
public List<UrlDetails> doSiteSearch(String domain,
|
||||
int domainId,
|
||||
int count) {
|
||||
|
||||
var queryParams = paramFactory.forSiteSearch(domain, domainId, count);
|
||||
var queryResponse = queryClient.search(queryParams);
|
||||
|
||||
return getResultsFromQuery(queryResponse);
|
||||
}
|
||||
|
||||
public List<UrlDetails> doBacklinkSearch(String domain) {
|
||||
|
||||
var queryParams = paramFactory.forBacklinkSearch(domain);
|
||||
var queryResponse = queryClient.search(queryParams);
|
||||
|
||||
return getResultsFromQuery(queryResponse);
|
||||
}
|
||||
|
||||
public List<UrlDetails> doLinkSearch(String source, String dest) {
|
||||
var queryParams = paramFactory.forLinkSearch(source, dest);
|
||||
var queryResponse = queryClient.search(queryParams);
|
||||
|
||||
return getResultsFromQuery(queryResponse);
|
||||
}
|
||||
|
||||
public DecoratedSearchResults doSearch(SearchParameters userParams) throws InterruptedException {
|
||||
// The full user-facing search query does additional work to try to evaluate the query
|
||||
// e.g. as a unit conversion query. This is done in parallel with the regular search.
|
||||
|
||||
Future<String> eval = searchUnitConversionService.tryEval(userParams.query());
|
||||
|
||||
// Perform the regular search
|
||||
|
||||
var queryParams = paramFactory.forRegularSearch(userParams);
|
||||
QueryResponse queryResponse = queryClient.search(queryParams);
|
||||
var queryResults = getResultsFromQuery(queryResponse);
|
||||
|
||||
// Cluster the results based on the query response
|
||||
List<ClusteredUrlDetails> clusteredResults = SearchResultClusterer
|
||||
.selectStrategy(queryResponse)
|
||||
.clusterResults(queryResults, 25);
|
||||
|
||||
// Log the query and results
|
||||
|
||||
logger.info(queryMarker, "Human terms: {}", Strings.join(queryResponse.searchTermsHuman(), ','));
|
||||
logger.info(queryMarker, "Search Result Count: {}", queryResults.size());
|
||||
|
||||
// Get the evaluation result and other data to return to the user
|
||||
String evalResult = getFutureOrDefault(eval, "");
|
||||
|
||||
String focusDomain = queryResponse.domain();
|
||||
int focusDomainId = focusDomain == null
|
||||
? -1
|
||||
: domainQueries.tryGetDomainId(new EdgeDomain(focusDomain)).orElse(-1);
|
||||
|
||||
List<String> problems = getProblems(evalResult, queryResults, queryResponse);
|
||||
|
||||
List<DecoratedSearchResults.Page> resultPages = IntStream.rangeClosed(1, queryResponse.totalPages())
|
||||
.mapToObj(number -> new DecoratedSearchResults.Page(
|
||||
number,
|
||||
number == userParams.page(),
|
||||
userParams.withPage(number).renderUrl(websiteUrl)
|
||||
))
|
||||
.toList();
|
||||
|
||||
// Return the results to the user
|
||||
return DecoratedSearchResults.builder()
|
||||
.params(userParams)
|
||||
.problems(problems)
|
||||
.evalResult(evalResult)
|
||||
.results(clusteredResults)
|
||||
.filters(new SearchFilters(websiteUrl, userParams))
|
||||
.focusDomain(focusDomain)
|
||||
.focusDomainId(focusDomainId)
|
||||
.resultPages(resultPages)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
public List<UrlDetails> getResultsFromQuery(QueryResponse queryResponse) {
|
||||
final QueryLimits limits = queryResponse.specs().queryLimits;
|
||||
final UrlDeduplicator deduplicator = new UrlDeduplicator(limits.resultsByDomain());
|
||||
|
||||
// Update the query count (this is what you see on the front page)
|
||||
searchVisitorCount.registerQuery();
|
||||
|
||||
return queryResponse.results().stream()
|
||||
.filter(deduplicator::shouldRetain)
|
||||
.limit(limits.resultsTotal())
|
||||
.map(SearchOperator::createDetails)
|
||||
.toList();
|
||||
}
|
||||
|
||||
private static UrlDetails createDetails(DecoratedSearchResultItem item) {
|
||||
return new UrlDetails(
|
||||
item.documentId(),
|
||||
item.domainId(),
|
||||
cleanUrl(item.url),
|
||||
item.title,
|
||||
item.description,
|
||||
item.format,
|
||||
item.features,
|
||||
DomainIndexingState.ACTIVE,
|
||||
item.rankingScore, // termScore
|
||||
item.resultsFromDomain,
|
||||
BrailleBlockPunchCards.printBits(item.bestPositions, 64),
|
||||
Long.bitCount(item.bestPositions),
|
||||
item.rawIndexResult,
|
||||
item.rawIndexResult.keywordScores
|
||||
);
|
||||
}
|
||||
|
||||
/** Replace nuisance domains with replacements where available */
|
||||
private static EdgeUrl cleanUrl(EdgeUrl url) {
|
||||
String topdomain = url.domain.topDomain;
|
||||
String subdomain = url.domain.subDomain;
|
||||
String path = url.path;
|
||||
|
||||
if (topdomain.equals("fandom.com")) {
|
||||
int wikiIndex = path.indexOf("/wiki/");
|
||||
if (wikiIndex >= 0) {
|
||||
return new EdgeUrl("https", new EdgeDomain("breezewiki.com"), null, "/" + subdomain + path.substring(wikiIndex), null);
|
||||
}
|
||||
}
|
||||
else if (topdomain.equals("medium.com")) {
|
||||
if (!subdomain.isBlank()) {
|
||||
return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, path, null);
|
||||
}
|
||||
else {
|
||||
String article = path.substring(path.indexOf("/", 1));
|
||||
return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, article, null);
|
||||
}
|
||||
|
||||
}
|
||||
return url;
|
||||
}
|
||||
|
||||
private List<String> getProblems(String evalResult, List<UrlDetails> queryResults, QueryResponse response) throws InterruptedException {
|
||||
|
||||
// We don't debug the query if it's a site search
|
||||
if (response.domain() == null)
|
||||
return List.of();
|
||||
|
||||
final List<String> problems = new ArrayList<>(response.problems());
|
||||
|
||||
if (queryResults.size() <= 5 && null == evalResult) {
|
||||
problems.add("Try rephrasing the query, changing the word order or using synonyms to get different results.");
|
||||
|
||||
// Try to spell check the search terms
|
||||
var suggestions = getFutureOrDefault(
|
||||
mathClient.spellCheck(response.searchTermsHuman()),
|
||||
Map.of()
|
||||
);
|
||||
|
||||
suggestions.forEach((term, suggestion) -> {
|
||||
if (suggestion.size() > 1) {
|
||||
String suggestionsStr = "\"%s\" could be spelled %s".formatted(term, suggestion.stream().map(s -> "\"" + s + "\"").collect(Collectors.joining(", ")));
|
||||
problems.add(suggestionsStr);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
Set<String> representativeKeywords = response.getAllKeywords();
|
||||
if (representativeKeywords.size() > 1 && (representativeKeywords.contains("definition") || representativeKeywords.contains("define") || representativeKeywords.contains("meaning")))
|
||||
{
|
||||
problems.add("Tip: Try using a query that looks like <tt>define:word</tt> if you want a dictionary definition");
|
||||
}
|
||||
|
||||
return problems;
|
||||
}
|
||||
|
||||
private <T> T getFutureOrDefault(@Nullable Future<T> fut, T defaultValue) {
|
||||
return getFutureOrDefault(fut, Duration.ofMillis(50), defaultValue);
|
||||
}
|
||||
|
||||
private <T> T getFutureOrDefault(@Nullable Future<T> fut, Duration timeout, T defaultValue) {
|
||||
if (fut == null || fut.isCancelled()) {
|
||||
return defaultValue;
|
||||
}
|
||||
try {
|
||||
return fut.get(timeout.toMillis(), TimeUnit.MILLISECONDS);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.warn("Error fetching eval result", ex);
|
||||
return defaultValue;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,104 @@
|
||||
package nu.marginalia.search;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.search.command.SearchParameters;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class SearchQueryParamFactory {
|
||||
|
||||
public QueryParams forRegularSearch(SearchParameters userParams) {
|
||||
SearchQuery prototype = new SearchQuery();
|
||||
var profile = userParams.profile();
|
||||
|
||||
profile.addTacitTerms(prototype);
|
||||
userParams.js().addTacitTerms(prototype);
|
||||
userParams.adtech().addTacitTerms(prototype);
|
||||
|
||||
return new QueryParams(
|
||||
userParams.query(),
|
||||
null,
|
||||
prototype.searchTermsInclude,
|
||||
prototype.searchTermsExclude,
|
||||
prototype.searchTermsPriority,
|
||||
prototype.searchTermsAdvice,
|
||||
profile.getQualityLimit(),
|
||||
profile.getYearLimit(),
|
||||
profile.getSizeLimit(),
|
||||
SpecificationLimit.none(),
|
||||
List.of(),
|
||||
new QueryLimits(5, 100, 200, 8192),
|
||||
profile.searchSetIdentifier.name(),
|
||||
userParams.strategy(),
|
||||
userParams.temporalBias(),
|
||||
userParams.page()
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
public QueryParams forSiteSearch(String domain, int domainId, int count) {
|
||||
return new QueryParams("site:"+domain,
|
||||
null,
|
||||
List.of(),
|
||||
List.of(),
|
||||
List.of(),
|
||||
List.of(),
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
List.of(domainId),
|
||||
new QueryLimits(count, count, 100, 512),
|
||||
SearchSetIdentifier.NONE.name(),
|
||||
QueryStrategy.AUTO,
|
||||
ResultRankingParameters.TemporalBias.NONE,
|
||||
1
|
||||
);
|
||||
}
|
||||
|
||||
public QueryParams forBacklinkSearch(String domain) {
|
||||
return new QueryParams("links:"+domain,
|
||||
null,
|
||||
List.of(),
|
||||
List.of(),
|
||||
List.of(),
|
||||
List.of(),
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
List.of(),
|
||||
new QueryLimits(100, 100, 100, 512),
|
||||
SearchSetIdentifier.NONE.name(),
|
||||
QueryStrategy.AUTO,
|
||||
ResultRankingParameters.TemporalBias.NONE,
|
||||
1
|
||||
);
|
||||
}
|
||||
|
||||
public QueryParams forLinkSearch(String sourceDomain, String destDomain) {
|
||||
return new QueryParams("site:" + sourceDomain + " links:" + destDomain,
|
||||
null,
|
||||
List.of(),
|
||||
List.of(),
|
||||
List.of(),
|
||||
List.of(),
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
List.of(),
|
||||
new QueryLimits(100, 100, 100, 512),
|
||||
SearchSetIdentifier.NONE.name(),
|
||||
QueryStrategy.AUTO,
|
||||
ResultRankingParameters.TemporalBias.NONE,
|
||||
1
|
||||
);
|
||||
}
|
||||
}
|
@@ -0,0 +1,53 @@
|
||||
package nu.marginalia.search;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.query.QueryResponse;
|
||||
import nu.marginalia.search.model.ClusteredUrlDetails;
|
||||
import nu.marginalia.search.model.UrlDetails;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/** Functions for clustering search results */
|
||||
public class SearchResultClusterer {
|
||||
private SearchResultClusterer() {}
|
||||
|
||||
public interface SearchResultClusterStrategy {
|
||||
List<ClusteredUrlDetails> clusterResults(List<UrlDetails> results, int total);
|
||||
}
|
||||
|
||||
public static SearchResultClusterStrategy selectStrategy(QueryResponse response) {
|
||||
if (response.domain() != null && !response.domain().isBlank())
|
||||
return SearchResultClusterer::noOp;
|
||||
|
||||
return SearchResultClusterer::byDomain;
|
||||
}
|
||||
|
||||
/** No clustering, just return the results as is */
|
||||
private static List<ClusteredUrlDetails> noOp(List<UrlDetails> results, int total) {
|
||||
if (results.isEmpty())
|
||||
return List.of();
|
||||
|
||||
return results.stream()
|
||||
.map(ClusteredUrlDetails::new)
|
||||
.toList();
|
||||
}
|
||||
|
||||
/** Cluster the results by domain, and return the top "total" clusters
|
||||
* sorted by the relevance of the best result
|
||||
*/
|
||||
private static List<ClusteredUrlDetails> byDomain(List<UrlDetails> results, int total) {
|
||||
if (results.isEmpty())
|
||||
return List.of();
|
||||
|
||||
return results.stream()
|
||||
.collect(
|
||||
Collectors.groupingBy(details -> details.domainId)
|
||||
)
|
||||
.values().stream()
|
||||
.map(ClusteredUrlDetails::new)
|
||||
.sorted()
|
||||
.limit(total)
|
||||
.toList();
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,128 @@
|
||||
package nu.marginalia.search;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import io.prometheus.client.Counter;
|
||||
import io.prometheus.client.Histogram;
|
||||
import nu.marginalia.WebsiteUrl;
|
||||
import nu.marginalia.search.svc.*;
|
||||
import nu.marginalia.service.server.BaseServiceParams;
|
||||
import nu.marginalia.service.server.SparkService;
|
||||
import nu.marginalia.service.server.StaticResources;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.Route;
|
||||
import spark.Spark;
|
||||
|
||||
import java.net.URLEncoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
public class SearchService extends SparkService {
|
||||
|
||||
private final WebsiteUrl websiteUrl;
|
||||
private final StaticResources staticResources;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(SearchService.class);
|
||||
private static final Histogram wmsa_search_service_request_time = Histogram.build()
|
||||
.name("wmsa_search_service_request_time")
|
||||
.linearBuckets(0.05, 0.05, 15)
|
||||
.labelNames("matchedPath", "method")
|
||||
.help("Search service request time (seconds)")
|
||||
.register();
|
||||
private static final Counter wmsa_search_service_error_count = Counter.build()
|
||||
.name("wmsa_search_service_error_count")
|
||||
.labelNames("matchedPath", "method")
|
||||
.help("Search service error count")
|
||||
.register();
|
||||
|
||||
@Inject
|
||||
public SearchService(BaseServiceParams params,
|
||||
WebsiteUrl websiteUrl,
|
||||
StaticResources staticResources,
|
||||
SearchFrontPageService frontPageService,
|
||||
SearchErrorPageService errorPageService,
|
||||
SearchAddToCrawlQueueService addToCrawlQueueService,
|
||||
SearchSiteInfoService siteInfoService,
|
||||
SearchCrosstalkService crosstalkService,
|
||||
SearchQueryService searchQueryService)
|
||||
throws Exception
|
||||
{
|
||||
super(params);
|
||||
|
||||
this.websiteUrl = websiteUrl;
|
||||
this.staticResources = staticResources;
|
||||
|
||||
Spark.staticFiles.expireTime(600);
|
||||
|
||||
SearchServiceMetrics.get("/search", searchQueryService::pathSearch);
|
||||
|
||||
SearchServiceMetrics.get("/", frontPageService::render);
|
||||
SearchServiceMetrics.get("/news.xml", frontPageService::renderNewsFeed);
|
||||
SearchServiceMetrics.get("/:resource", this::serveStatic);
|
||||
|
||||
SearchServiceMetrics.post("/site/suggest/", addToCrawlQueueService::suggestCrawling);
|
||||
|
||||
SearchServiceMetrics.get("/site-search/:site/*", this::siteSearchRedir);
|
||||
|
||||
SearchServiceMetrics.get("/site/:site", siteInfoService::handle);
|
||||
SearchServiceMetrics.post("/site/:site", siteInfoService::handlePost);
|
||||
|
||||
SearchServiceMetrics.get("/crosstalk/", crosstalkService::handle);
|
||||
|
||||
Spark.exception(Exception.class, (e,p,q) -> {
|
||||
logger.error("Error during processing", e);
|
||||
wmsa_search_service_error_count.labels(p.pathInfo(), p.requestMethod()).inc();
|
||||
errorPageService.serveError(p, q);
|
||||
});
|
||||
|
||||
Spark.awaitInitialization();
|
||||
}
|
||||
|
||||
|
||||
|
||||
/** Wraps a route with a timer and a counter */
|
||||
private static class SearchServiceMetrics implements Route {
|
||||
private final Route delegatedRoute;
|
||||
|
||||
static void get(String path, Route route) {
|
||||
Spark.get(path, new SearchServiceMetrics(route));
|
||||
}
|
||||
static void post(String path, Route route) {
|
||||
Spark.post(path, new SearchServiceMetrics(route));
|
||||
}
|
||||
|
||||
private SearchServiceMetrics(Route delegatedRoute) {
|
||||
this.delegatedRoute = delegatedRoute;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object handle(Request request, Response response) throws Exception {
|
||||
return wmsa_search_service_request_time
|
||||
.labels(request.matchedPath(), request.requestMethod())
|
||||
.time(() -> delegatedRoute.handle(request, response));
|
||||
}
|
||||
}
|
||||
|
||||
private Object serveStatic(Request request, Response response) {
|
||||
String resource = request.params("resource");
|
||||
staticResources.serveStatic("search", resource, request, response);
|
||||
return "";
|
||||
}
|
||||
|
||||
private Object siteSearchRedir(Request request, Response response) {
|
||||
final String site = request.params("site");
|
||||
final String searchTerms;
|
||||
|
||||
if (request.splat().length == 0) searchTerms = "";
|
||||
else searchTerms = request.splat()[0];
|
||||
|
||||
final String query = URLEncoder.encode(String.format("%s site:%s", searchTerms, site), StandardCharsets.UTF_8).trim();
|
||||
final String profile = request.queryParamOrDefault("profile", "yolo");
|
||||
|
||||
response.redirect(websiteUrl.withPath("search?query="+query+"&profile="+profile));
|
||||
|
||||
return "";
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,43 @@
|
||||
package nu.marginalia.search.command;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.search.command.commands.*;
|
||||
import spark.Response;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class CommandEvaluator {
|
||||
|
||||
private final List<SearchCommandInterface> specialCommands = new ArrayList<>();
|
||||
private final SearchCommand defaultCommand;
|
||||
|
||||
@Inject
|
||||
public CommandEvaluator(
|
||||
BrowseCommand browse,
|
||||
ConvertCommand convert,
|
||||
DefinitionCommand define,
|
||||
BangCommand bang,
|
||||
SiteRedirectCommand siteRedirect,
|
||||
SearchCommand search
|
||||
) {
|
||||
specialCommands.add(browse);
|
||||
specialCommands.add(convert);
|
||||
specialCommands.add(define);
|
||||
specialCommands.add(bang);
|
||||
specialCommands.add(siteRedirect);
|
||||
|
||||
defaultCommand = search;
|
||||
}
|
||||
|
||||
public Object eval(Response response, SearchParameters parameters) {
|
||||
for (var cmd : specialCommands) {
|
||||
var maybe = cmd.process(response, parameters);
|
||||
if (maybe.isPresent())
|
||||
return maybe.get();
|
||||
}
|
||||
|
||||
return defaultCommand.process(response, parameters).orElse("");
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,29 @@
|
||||
package nu.marginalia.search.command;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.util.Arrays;
|
||||
|
||||
public enum SearchAdtechParameter {
|
||||
DEFAULT("default"),
|
||||
REDUCE("reduce", "special:ads", "special:affiliate");
|
||||
|
||||
public final String value;
|
||||
public final String[] implictExcludeSearchTerms;
|
||||
|
||||
SearchAdtechParameter(String value, String... implictExcludeSearchTerms) {
|
||||
this.value = value;
|
||||
this.implictExcludeSearchTerms = implictExcludeSearchTerms;
|
||||
}
|
||||
|
||||
public static SearchAdtechParameter parse(@Nullable String value) {
|
||||
if (REDUCE.value.equals(value)) return REDUCE;
|
||||
|
||||
return DEFAULT;
|
||||
}
|
||||
|
||||
public void addTacitTerms(SearchQuery subquery) {
|
||||
subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms));
|
||||
}
|
||||
}
|
@@ -0,0 +1,10 @@
|
||||
package nu.marginalia.search.command;
|
||||
|
||||
|
||||
import spark.Response;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
public interface SearchCommandInterface {
|
||||
Optional<Object> process(Response response, SearchParameters parameters);
|
||||
}
|
@@ -0,0 +1,31 @@
|
||||
package nu.marginalia.search.command;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.util.Arrays;
|
||||
|
||||
public enum SearchJsParameter {
|
||||
DEFAULT("default"),
|
||||
DENY_JS("no-js", "js:true"),
|
||||
REQUIRE_JS("yes-js", "js:false");
|
||||
|
||||
public final String value;
|
||||
public final String[] implictExcludeSearchTerms;
|
||||
|
||||
SearchJsParameter(String value, String... implictExcludeSearchTerms) {
|
||||
this.value = value;
|
||||
this.implictExcludeSearchTerms = implictExcludeSearchTerms;
|
||||
}
|
||||
|
||||
public static SearchJsParameter parse(@Nullable String value) {
|
||||
if (DENY_JS.value.equals(value)) return DENY_JS;
|
||||
if (REQUIRE_JS.value.equals(value)) return REQUIRE_JS;
|
||||
|
||||
return DEFAULT;
|
||||
}
|
||||
|
||||
public void addTacitTerms(SearchQuery subquery) {
|
||||
subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms));
|
||||
}
|
||||
}
|
@@ -0,0 +1,106 @@
|
||||
package nu.marginalia.search.command;
|
||||
|
||||
import nu.marginalia.WebsiteUrl;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.search.model.SearchProfile;
|
||||
import spark.Request;
|
||||
|
||||
import java.net.URLEncoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Objects;
|
||||
|
||||
import static nu.marginalia.search.command.SearchRecentParameter.RECENT;
|
||||
|
||||
public record SearchParameters(String query,
|
||||
SearchProfile profile,
|
||||
SearchJsParameter js,
|
||||
SearchRecentParameter recent,
|
||||
SearchTitleParameter searchTitle,
|
||||
SearchAdtechParameter adtech,
|
||||
boolean newFilter,
|
||||
int page
|
||||
) {
|
||||
|
||||
public SearchParameters(String queryString, Request request) {
|
||||
this(
|
||||
queryString,
|
||||
SearchProfile.getSearchProfile(request.queryParams("profile")),
|
||||
SearchJsParameter.parse(request.queryParams("js")),
|
||||
SearchRecentParameter.parse(request.queryParams("recent")),
|
||||
SearchTitleParameter.parse(request.queryParams("searchTitle")),
|
||||
SearchAdtechParameter.parse(request.queryParams("adtech")),
|
||||
"true".equals(request.queryParams("newfilter")),
|
||||
Integer.parseInt(Objects.requireNonNullElse(request.queryParams("page"), "1"))
|
||||
);
|
||||
}
|
||||
|
||||
public String profileStr() {
|
||||
return profile.filterId;
|
||||
}
|
||||
|
||||
public SearchParameters withProfile(SearchProfile profile) {
|
||||
return new SearchParameters(query, profile, js, recent, searchTitle, adtech, true, page);
|
||||
}
|
||||
|
||||
public SearchParameters withJs(SearchJsParameter js) {
|
||||
return new SearchParameters(query, profile, js, recent, searchTitle, adtech, true, page);
|
||||
}
|
||||
public SearchParameters withAdtech(SearchAdtechParameter adtech) {
|
||||
return new SearchParameters(query, profile, js, recent, searchTitle, adtech, true, page);
|
||||
}
|
||||
|
||||
public SearchParameters withRecent(SearchRecentParameter recent) {
|
||||
return new SearchParameters(query, profile, js, recent, searchTitle, adtech, true, page);
|
||||
}
|
||||
|
||||
public SearchParameters withTitle(SearchTitleParameter title) {
|
||||
return new SearchParameters(query, profile, js, recent, title, adtech, true, page);
|
||||
}
|
||||
|
||||
public SearchParameters withPage(int page) {
|
||||
return new SearchParameters(query, profile, js, recent, searchTitle, adtech, false, page);
|
||||
}
|
||||
|
||||
public String renderUrl(WebsiteUrl baseUrl) {
|
||||
String path = String.format("/search?query=%s&profile=%s&js=%s&adtech=%s&recent=%s&searchTitle=%s&newfilter=%s&page=%d",
|
||||
URLEncoder.encode(query, StandardCharsets.UTF_8),
|
||||
URLEncoder.encode(profile.filterId, StandardCharsets.UTF_8),
|
||||
URLEncoder.encode(js.value, StandardCharsets.UTF_8),
|
||||
URLEncoder.encode(adtech.value, StandardCharsets.UTF_8),
|
||||
URLEncoder.encode(recent.value, StandardCharsets.UTF_8),
|
||||
URLEncoder.encode(searchTitle.value, StandardCharsets.UTF_8),
|
||||
Boolean.valueOf(newFilter).toString(),
|
||||
page
|
||||
);
|
||||
|
||||
return baseUrl.withPath(path);
|
||||
}
|
||||
|
||||
public ResultRankingParameters.TemporalBias temporalBias() {
|
||||
if (recent == RECENT) {
|
||||
return ResultRankingParameters.TemporalBias.RECENT;
|
||||
}
|
||||
else if (profile == SearchProfile.VINTAGE) {
|
||||
return ResultRankingParameters.TemporalBias.OLD;
|
||||
}
|
||||
|
||||
return ResultRankingParameters.TemporalBias.NONE;
|
||||
}
|
||||
|
||||
public QueryStrategy strategy() {
|
||||
if (searchTitle == SearchTitleParameter.TITLE) {
|
||||
return QueryStrategy.REQUIRE_FIELD_TITLE;
|
||||
}
|
||||
|
||||
return QueryStrategy.AUTO;
|
||||
}
|
||||
|
||||
public SpecificationLimit yearLimit() {
|
||||
if (recent == RECENT)
|
||||
return SpecificationLimit.greaterThan(2018);
|
||||
|
||||
return profile.getYearLimit();
|
||||
}
|
||||
}
|
@@ -0,0 +1,21 @@
|
||||
package nu.marginalia.search.command;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
|
||||
public enum SearchRecentParameter {
|
||||
DEFAULT("default"),
|
||||
RECENT("recent");
|
||||
|
||||
public final String value;
|
||||
|
||||
SearchRecentParameter(String value) {
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
public static SearchRecentParameter parse(@Nullable String value) {
|
||||
if (RECENT.value.equals(value)) return RECENT;
|
||||
|
||||
return DEFAULT;
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,21 @@
|
||||
package nu.marginalia.search.command;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
|
||||
public enum SearchTitleParameter {
|
||||
DEFAULT("default"),
|
||||
TITLE("title");
|
||||
|
||||
public final String value;
|
||||
|
||||
SearchTitleParameter(String value) {
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
public static SearchTitleParameter parse(@Nullable String value) {
|
||||
if (TITLE.value.equals(value)) return TITLE;
|
||||
|
||||
return DEFAULT;
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,104 @@
|
||||
package nu.marginalia.search.command.commands;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.search.command.SearchCommandInterface;
|
||||
import nu.marginalia.search.command.SearchParameters;
|
||||
import nu.marginalia.search.exceptions.RedirectException;
|
||||
import spark.Response;
|
||||
|
||||
import java.net.URLEncoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
public class BangCommand implements SearchCommandInterface {
|
||||
private final Map<String, String> bangsToPattern = new HashMap<>();
|
||||
|
||||
@Inject
|
||||
public BangCommand()
|
||||
{
|
||||
bangsToPattern.put("!g", "https://www.google.com/search?q=%s");
|
||||
bangsToPattern.put("!ddg", "https://duckduckgo.com/?q=%s");
|
||||
bangsToPattern.put("!w", "https://search.marginalia.nu/search?query=%s+site:en.wikipedia.org&profile=wiki");
|
||||
}
|
||||
|
||||
@Override
|
||||
public Optional<Object> process(Response response, SearchParameters parameters) {
|
||||
|
||||
for (var entry : bangsToPattern.entrySet()) {
|
||||
String bangPattern = entry.getKey();
|
||||
String redirectPattern = entry.getValue();
|
||||
|
||||
var match = matchBangPattern(parameters.query(), bangPattern);
|
||||
|
||||
if (match.isPresent()) {
|
||||
var url = String.format(redirectPattern, URLEncoder.encode(match.get(), StandardCharsets.UTF_8));
|
||||
throw new RedirectException(url);
|
||||
}
|
||||
}
|
||||
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
/** If the query contains the bang pattern bangKey, return the query with the bang pattern removed. */
|
||||
Optional<String> matchBangPattern(String query, String bangKey) {
|
||||
var bm = new BangMatcher(query);
|
||||
|
||||
while (bm.findNext(bangKey)) {
|
||||
|
||||
if (!bm.isRelativeSpaceOrInvalid(-1))
|
||||
continue;
|
||||
if (!bm.isRelativeSpaceOrInvalid(bangKey.length()))
|
||||
continue;
|
||||
|
||||
String prefix = bm.prefix().trim();
|
||||
String suffix = bm.suffix(bangKey.length()).trim();
|
||||
|
||||
String ret = (prefix + " " + suffix).trim();
|
||||
|
||||
return Optional.of(ret)
|
||||
.filter(s -> !s.isBlank());
|
||||
}
|
||||
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
private static class BangMatcher {
|
||||
private final String str;
|
||||
private int pos;
|
||||
|
||||
public String prefix() {
|
||||
return str.substring(0, pos);
|
||||
}
|
||||
|
||||
public String suffix(int offset) {
|
||||
if (pos+offset < str.length())
|
||||
return str.substring(pos + offset);
|
||||
return "";
|
||||
}
|
||||
|
||||
public BangMatcher(String str) {
|
||||
this.str = str;
|
||||
this.pos = -1;
|
||||
}
|
||||
|
||||
public boolean findNext(String pattern) {
|
||||
if (pos + 1 >= str.length())
|
||||
return false;
|
||||
|
||||
return (pos = str.indexOf(pattern, pos + 1)) >= 0;
|
||||
}
|
||||
|
||||
public boolean isRelativeSpaceOrInvalid(int offset) {
|
||||
if (offset + pos < 0)
|
||||
return true;
|
||||
if (offset + pos >= str.length())
|
||||
return true;
|
||||
|
||||
return Character.isSpaceChar(str.charAt(offset + pos));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,36 @@
|
||||
package nu.marginalia.search.command.commands;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.renderer.MustacheRenderer;
|
||||
import nu.marginalia.renderer.RendererFactory;
|
||||
import nu.marginalia.search.command.SearchCommandInterface;
|
||||
import nu.marginalia.search.command.SearchParameters;
|
||||
import nu.marginalia.search.svc.SearchUnitConversionService;
|
||||
import spark.Response;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
public class ConvertCommand implements SearchCommandInterface {
|
||||
private final SearchUnitConversionService searchUnitConversionService;
|
||||
private final MustacheRenderer<Map<String, String>> conversionRenderer;
|
||||
|
||||
@Inject
|
||||
public ConvertCommand(SearchUnitConversionService searchUnitConversionService, RendererFactory rendererFactory) throws IOException {
|
||||
this.searchUnitConversionService = searchUnitConversionService;
|
||||
|
||||
conversionRenderer = rendererFactory.renderer("search/conversion-results");
|
||||
}
|
||||
|
||||
@Override
|
||||
public Optional<Object> process(Response response, SearchParameters parameters) {
|
||||
var conversion = searchUnitConversionService.tryConversion(parameters.query());
|
||||
return conversion.map(s -> conversionRenderer.render(Map.of(
|
||||
"query", parameters.query(),
|
||||
"result", s,
|
||||
"profile", parameters.profileStr())
|
||||
));
|
||||
|
||||
}
|
||||
}
|
@@ -0,0 +1,70 @@
|
||||
|
||||
package nu.marginalia.search.command.commands;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.api.math.MathClient;
|
||||
import nu.marginalia.api.math.model.DictionaryResponse;
|
||||
import nu.marginalia.renderer.MustacheRenderer;
|
||||
import nu.marginalia.search.command.SearchCommandInterface;
|
||||
import nu.marginalia.search.command.SearchParameters;
|
||||
import nu.marginalia.renderer.RendererFactory;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Response;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class DefinitionCommand implements SearchCommandInterface {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private final MustacheRenderer<DictionaryResponse> dictionaryRenderer;
|
||||
private final MathClient mathClient;
|
||||
|
||||
|
||||
private final Predicate<String> queryPatternPredicate = Pattern.compile("^define:[A-Za-z\\s-0-9]+$").asPredicate();
|
||||
|
||||
@Inject
|
||||
public DefinitionCommand(RendererFactory rendererFactory, MathClient mathClient)
|
||||
throws IOException
|
||||
{
|
||||
|
||||
dictionaryRenderer = rendererFactory.renderer("search/dictionary-results");
|
||||
this.mathClient = mathClient;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Optional<Object> process(Response response, SearchParameters parameters) {
|
||||
if (!queryPatternPredicate.test(parameters.query())) {
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
var results = lookupDefinition(parameters.query());
|
||||
|
||||
return Optional.of(dictionaryRenderer.render(results,
|
||||
Map.of("query", parameters.query(),
|
||||
"profile", parameters.profileStr())
|
||||
));
|
||||
}
|
||||
|
||||
|
||||
private DictionaryResponse lookupDefinition(String humanQuery) {
|
||||
String definePrefix = "define:";
|
||||
String word = humanQuery.substring(definePrefix.length()).toLowerCase();
|
||||
|
||||
try {
|
||||
return mathClient
|
||||
.dictionaryLookup(word)
|
||||
.get(250, TimeUnit.MILLISECONDS);
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Failed to lookup definition for word: " + word, e);
|
||||
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,39 @@
|
||||
package nu.marginalia.search.command.commands;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.renderer.MustacheRenderer;
|
||||
import nu.marginalia.renderer.RendererFactory;
|
||||
import nu.marginalia.search.SearchOperator;
|
||||
import nu.marginalia.search.command.SearchCommandInterface;
|
||||
import nu.marginalia.search.command.SearchParameters;
|
||||
import nu.marginalia.search.model.DecoratedSearchResults;
|
||||
import spark.Response;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Optional;
|
||||
|
||||
public class SearchCommand implements SearchCommandInterface {
|
||||
private final SearchOperator searchOperator;
|
||||
private final MustacheRenderer<DecoratedSearchResults> searchResultsRenderer;
|
||||
|
||||
|
||||
@Inject
|
||||
public SearchCommand(SearchOperator searchOperator,
|
||||
RendererFactory rendererFactory) throws IOException {
|
||||
this.searchOperator = searchOperator;
|
||||
|
||||
searchResultsRenderer = rendererFactory.renderer("search/search-results");
|
||||
}
|
||||
|
||||
@Override
|
||||
public Optional<Object> process(Response response, SearchParameters parameters) {
|
||||
try {
|
||||
DecoratedSearchResults results = searchOperator.doSearch(parameters);
|
||||
return Optional.of(searchResultsRenderer.render(results));
|
||||
}
|
||||
catch (InterruptedException ex) {
|
||||
Thread.currentThread().interrupt();
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,50 @@
|
||||
package nu.marginalia.search.command.commands;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.search.command.SearchCommandInterface;
|
||||
import nu.marginalia.search.command.SearchParameters;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Response;
|
||||
|
||||
import java.util.Optional;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class SiteRedirectCommand implements SearchCommandInterface {
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private final Predicate<String> queryPatternPredicate = Pattern.compile("^(site|links):[.A-Za-z\\-0-9]+$").asPredicate();
|
||||
|
||||
@Inject
|
||||
public SiteRedirectCommand() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public Optional<Object> process(Response response, SearchParameters parameters) {
|
||||
if (!queryPatternPredicate.test(parameters.query())) {
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
int idx = parameters.query().indexOf(':');
|
||||
String prefix = parameters.query().substring(0, idx);
|
||||
String domain = parameters.query().substring(idx + 1).toLowerCase();
|
||||
|
||||
// Use an HTML redirect here, so we can use relative URLs
|
||||
String view = switch (prefix) {
|
||||
case "links" -> "links";
|
||||
default -> "info";
|
||||
};
|
||||
|
||||
return Optional.of("""
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<meta charset="UTF-8">
|
||||
<title>Redirecting...</title>
|
||||
<meta http-equiv="refresh" content="0; url=/site/%s?view=%s">
|
||||
""".formatted(domain, view)
|
||||
);
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,66 @@
|
||||
package nu.marginalia.search.db;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
|
||||
import java.sql.ResultSet;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
public class DbNearDomainsQuery {
|
||||
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
@Inject
|
||||
public DbNearDomainsQuery(HikariDataSource dataSource) {
|
||||
this.dataSource = dataSource;
|
||||
}
|
||||
|
||||
public List<Integer> getRelatedDomains(String term, Consumer<String> onProblem) {
|
||||
List<Integer> ret = new ArrayList<>();
|
||||
try (var conn = dataSource.getConnection();
|
||||
|
||||
var selfStmt = conn.prepareStatement("""
|
||||
SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?
|
||||
""");
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT NEIGHBOR_ID, ND.INDEXED, ND.STATE FROM EC_DOMAIN_NEIGHBORS_2
|
||||
INNER JOIN EC_DOMAIN ND ON ND.ID=NEIGHBOR_ID
|
||||
WHERE DOMAIN_ID=?
|
||||
""")) {
|
||||
ResultSet rsp;
|
||||
selfStmt.setString(1, term);
|
||||
rsp = selfStmt.executeQuery();
|
||||
int domainId = -1;
|
||||
if (rsp.next()) {
|
||||
domainId = rsp.getInt(1);
|
||||
ret.add(domainId);
|
||||
}
|
||||
|
||||
stmt.setInt(1, domainId);
|
||||
rsp = stmt.executeQuery();
|
||||
|
||||
while (rsp.next()) {
|
||||
int id = rsp.getInt(1);
|
||||
int indexed = rsp.getInt(2);
|
||||
String state = rsp.getString(3);
|
||||
|
||||
if (indexed > 0 && ("ACTIVE".equalsIgnoreCase(state) || "SOCIAL_MEDIA".equalsIgnoreCase(state) || "SPECIAL".equalsIgnoreCase(state))) {
|
||||
ret.add(id);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
if (ret.isEmpty()) {
|
||||
onProblem.accept("Could not find domains adjacent " + term);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,102 @@
|
||||
package nu.marginalia.search.model;
|
||||
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/** A class to hold a list of UrlDetails, grouped by domain, where the first one is the main result
|
||||
* and the rest are additional results, for summary display. */
|
||||
public class ClusteredUrlDetails implements Comparable<ClusteredUrlDetails> {
|
||||
|
||||
@NotNull
|
||||
public final UrlDetails first;
|
||||
|
||||
@NotNull
|
||||
public final List<UrlDetails> rest;
|
||||
|
||||
/** Create a new ClusteredUrlDetails from a collection of UrlDetails,
|
||||
* with the best result as "first", and the others, in descending order
|
||||
* of quality as the "rest"...
|
||||
*
|
||||
* @param details A collection of UrlDetails, which must not be empty.
|
||||
*/
|
||||
public ClusteredUrlDetails(Collection<UrlDetails> details) {
|
||||
var items = new ArrayList<>(details);
|
||||
|
||||
items.sort(Comparator.naturalOrder());
|
||||
|
||||
if (items.isEmpty())
|
||||
throw new IllegalArgumentException("Empty list of details");
|
||||
|
||||
this.first = items.removeFirst();
|
||||
this.rest = items;
|
||||
|
||||
double bestScore = first.termScore;
|
||||
double scoreLimit = Math.min(4.0, bestScore * 1.25);
|
||||
|
||||
this.rest.removeIf(urlDetail -> {
|
||||
if (urlDetail.termScore > scoreLimit)
|
||||
return false;
|
||||
|
||||
for (var keywordScore : urlDetail.resultItem.keywordScores) {
|
||||
if (keywordScore.isKeywordSpecial())
|
||||
continue;
|
||||
if (keywordScore.hasTermFlag(WordFlags.Title))
|
||||
return false;
|
||||
if (keywordScore.hasTermFlag(WordFlags.ExternalLink))
|
||||
return false;
|
||||
if (keywordScore.hasTermFlag(WordFlags.UrlDomain))
|
||||
return false;
|
||||
if (keywordScore.hasTermFlag(WordFlags.UrlPath))
|
||||
return false;
|
||||
if (keywordScore.hasTermFlag(WordFlags.Subjects))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
|
||||
public ClusteredUrlDetails(@NotNull UrlDetails onlyFirst) {
|
||||
this.first = onlyFirst;
|
||||
this.rest = Collections.emptyList();
|
||||
}
|
||||
|
||||
// For renderer use, do not remove
|
||||
public @NotNull UrlDetails getFirst() {
|
||||
return first;
|
||||
}
|
||||
|
||||
// For renderer use, do not remove
|
||||
public @NotNull List<UrlDetails> getRest() {
|
||||
return rest;
|
||||
}
|
||||
|
||||
|
||||
public EdgeDomain getDomain() {
|
||||
return first.url.getDomain();
|
||||
}
|
||||
|
||||
public boolean hasMultiple() {
|
||||
return !rest.isEmpty();
|
||||
}
|
||||
|
||||
/** Returns the total number of results from the same domain,
|
||||
* including such results that are not included here. */
|
||||
public int totalCount() {
|
||||
return first.resultsFromSameDomain;
|
||||
}
|
||||
|
||||
public int remainingCount() {
|
||||
return totalCount() - 1 - rest.size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(@NotNull ClusteredUrlDetails o) {
|
||||
return Objects.compare(first, o.first, UrlDetails::compareTo);
|
||||
}
|
||||
}
|
@@ -0,0 +1,186 @@
|
||||
package nu.marginalia.search.model;
|
||||
|
||||
import nu.marginalia.search.command.SearchParameters;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* A class to hold details about the search results,
|
||||
* as used by the handlebars templating engine to render
|
||||
* the search results page.
|
||||
*/
|
||||
public class DecoratedSearchResults {
|
||||
private final SearchParameters params;
|
||||
private final List<String> problems;
|
||||
private final String evalResult;
|
||||
|
||||
public DecoratedSearchResults(SearchParameters params,
|
||||
List<String> problems,
|
||||
String evalResult,
|
||||
List<ClusteredUrlDetails> results,
|
||||
String focusDomain,
|
||||
int focusDomainId,
|
||||
SearchFilters filters,
|
||||
List<Page> resultPages) {
|
||||
this.params = params;
|
||||
this.problems = problems;
|
||||
this.evalResult = evalResult;
|
||||
this.results = results;
|
||||
this.focusDomain = focusDomain;
|
||||
this.focusDomainId = focusDomainId;
|
||||
this.filters = filters;
|
||||
this.resultPages = resultPages;
|
||||
}
|
||||
|
||||
public final List<ClusteredUrlDetails> results;
|
||||
|
||||
public static DecoratedSearchResultsBuilder builder() {
|
||||
return new DecoratedSearchResultsBuilder();
|
||||
}
|
||||
|
||||
public SearchParameters getParams() {
|
||||
return params;
|
||||
}
|
||||
|
||||
public List<String> getProblems() {
|
||||
return problems;
|
||||
}
|
||||
|
||||
public String getEvalResult() {
|
||||
return evalResult;
|
||||
}
|
||||
|
||||
public List<ClusteredUrlDetails> getResults() {
|
||||
return results;
|
||||
}
|
||||
|
||||
public String getFocusDomain() {
|
||||
return focusDomain;
|
||||
}
|
||||
|
||||
public int getFocusDomainId() {
|
||||
return focusDomainId;
|
||||
}
|
||||
|
||||
public SearchFilters getFilters() {
|
||||
return filters;
|
||||
}
|
||||
|
||||
public List<Page> getResultPages() {
|
||||
return resultPages;
|
||||
}
|
||||
|
||||
private final String focusDomain;
|
||||
private final int focusDomainId;
|
||||
private final SearchFilters filters;
|
||||
|
||||
private final List<Page> resultPages;
|
||||
|
||||
public boolean isMultipage() {
|
||||
return resultPages.size() > 1;
|
||||
}
|
||||
|
||||
public record Page(int number, boolean current, String href) {
|
||||
}
|
||||
|
||||
// These are used by the search form, they look unused in the IDE but are used by the mustache template,
|
||||
// DO NOT REMOVE THEM
|
||||
public int getResultCount() {
|
||||
return results.size();
|
||||
}
|
||||
|
||||
public String getQuery() {
|
||||
return params.query();
|
||||
}
|
||||
|
||||
public String getProfile() {
|
||||
return params.profile().filterId;
|
||||
}
|
||||
|
||||
public String getJs() {
|
||||
return params.js().value;
|
||||
}
|
||||
|
||||
public String getAdtech() {
|
||||
return params.adtech().value;
|
||||
}
|
||||
|
||||
public String getRecent() {
|
||||
return params.recent().value;
|
||||
}
|
||||
|
||||
public String getSearchTitle() {
|
||||
return params.searchTitle().value;
|
||||
}
|
||||
|
||||
public int page() {
|
||||
return params.page();
|
||||
}
|
||||
|
||||
public Boolean isNewFilter() {
|
||||
return params.newFilter();
|
||||
}
|
||||
|
||||
|
||||
public static class DecoratedSearchResultsBuilder {
|
||||
private SearchParameters params;
|
||||
private List<String> problems;
|
||||
private String evalResult;
|
||||
private List<ClusteredUrlDetails> results;
|
||||
private String focusDomain;
|
||||
private int focusDomainId;
|
||||
private SearchFilters filters;
|
||||
private List<Page> resultPages;
|
||||
|
||||
DecoratedSearchResultsBuilder() {
|
||||
}
|
||||
|
||||
public DecoratedSearchResultsBuilder params(SearchParameters params) {
|
||||
this.params = params;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DecoratedSearchResultsBuilder problems(List<String> problems) {
|
||||
this.problems = problems;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DecoratedSearchResultsBuilder evalResult(String evalResult) {
|
||||
this.evalResult = evalResult;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DecoratedSearchResultsBuilder results(List<ClusteredUrlDetails> results) {
|
||||
this.results = results;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DecoratedSearchResultsBuilder focusDomain(String focusDomain) {
|
||||
this.focusDomain = focusDomain;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DecoratedSearchResultsBuilder focusDomainId(int focusDomainId) {
|
||||
this.focusDomainId = focusDomainId;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DecoratedSearchResultsBuilder filters(SearchFilters filters) {
|
||||
this.filters = filters;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DecoratedSearchResultsBuilder resultPages(List<Page> resultPages) {
|
||||
this.resultPages = resultPages;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DecoratedSearchResults build() {
|
||||
return new DecoratedSearchResults(this.params, this.problems, this.evalResult, this.results, this.focusDomain, this.focusDomainId, this.filters, this.resultPages);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "DecoratedSearchResults.DecoratedSearchResultsBuilder(params=" + this.params + ", problems=" + this.problems + ", evalResult=" + this.evalResult + ", results=" + this.results + ", focusDomain=" + this.focusDomain + ", focusDomainId=" + this.focusDomainId + ", filters=" + this.filters + ", resultPages=" + this.resultPages + ")";
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,223 @@
|
||||
package nu.marginalia.search.model;
|
||||
|
||||
import nu.marginalia.WebsiteUrl;
|
||||
import nu.marginalia.search.command.*;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/** Models the search filters displayed next to the search results */
|
||||
public class SearchFilters {
|
||||
private final WebsiteUrl url;
|
||||
|
||||
public final String currentFilter;
|
||||
|
||||
// These are necessary for the renderer to access the data
|
||||
public final RemoveJsOption removeJsOption;
|
||||
public final ReduceAdtechOption reduceAdtechOption;
|
||||
public final ShowRecentOption showRecentOption;
|
||||
public final SearchTitleOption searchTitleOption;
|
||||
|
||||
public final List<List<Filter>> filterGroups;
|
||||
|
||||
// Getters are for the renderer to access the data
|
||||
|
||||
|
||||
public String getCurrentFilter() {
|
||||
return currentFilter;
|
||||
}
|
||||
|
||||
public RemoveJsOption getRemoveJsOption() {
|
||||
return removeJsOption;
|
||||
}
|
||||
|
||||
public ReduceAdtechOption getReduceAdtechOption() {
|
||||
return reduceAdtechOption;
|
||||
}
|
||||
|
||||
public ShowRecentOption getShowRecentOption() {
|
||||
return showRecentOption;
|
||||
}
|
||||
|
||||
public SearchTitleOption getSearchTitleOption() {
|
||||
return searchTitleOption;
|
||||
}
|
||||
|
||||
public List<List<Filter>> getFilterGroups() {
|
||||
return filterGroups;
|
||||
}
|
||||
|
||||
public SearchFilters(WebsiteUrl url, SearchParameters parameters) {
|
||||
this.url = url;
|
||||
|
||||
removeJsOption = new RemoveJsOption(parameters);
|
||||
reduceAdtechOption = new ReduceAdtechOption(parameters);
|
||||
showRecentOption = new ShowRecentOption(parameters);
|
||||
searchTitleOption = new SearchTitleOption(parameters);
|
||||
|
||||
|
||||
currentFilter = parameters.profile().filterId;
|
||||
|
||||
filterGroups = List.of(
|
||||
List.of(
|
||||
new Filter("No Filter", SearchProfile.NO_FILTER, parameters),
|
||||
// new Filter("Popular", SearchProfile.POPULAR, parameters),
|
||||
new Filter("Small Web", SearchProfile.SMALLWEB, parameters),
|
||||
new Filter("Blogosphere", SearchProfile.BLOGOSPHERE, parameters),
|
||||
new Filter("Academia", SearchProfile.ACADEMIA, parameters)
|
||||
),
|
||||
List.of(
|
||||
new Filter("Vintage", SearchProfile.VINTAGE, parameters),
|
||||
new Filter("Plain Text", SearchProfile.PLAIN_TEXT, parameters),
|
||||
new Filter("~tilde", SearchProfile.TILDE, parameters)
|
||||
),
|
||||
List.of(
|
||||
new Filter("Wiki", SearchProfile.WIKI, parameters),
|
||||
new Filter("Forum", SearchProfile.FORUM, parameters),
|
||||
new Filter("Docs", SearchProfile.DOCS, parameters),
|
||||
new Filter("Recipes", SearchProfile.FOOD, parameters)
|
||||
)
|
||||
);
|
||||
|
||||
|
||||
}
|
||||
|
||||
public class RemoveJsOption {
|
||||
private final SearchJsParameter value;
|
||||
|
||||
public final String url;
|
||||
public String getUrl() {
|
||||
return url;
|
||||
}
|
||||
|
||||
public boolean isSet() {
|
||||
return value.equals(SearchJsParameter.DENY_JS);
|
||||
}
|
||||
|
||||
public String name() {
|
||||
return "Remove Javascript";
|
||||
}
|
||||
|
||||
public RemoveJsOption(SearchParameters parameters) {
|
||||
this.value = parameters.js();
|
||||
|
||||
var toggledValue = switch (parameters.js()) {
|
||||
case DENY_JS -> SearchJsParameter.DEFAULT;
|
||||
default -> SearchJsParameter.DENY_JS;
|
||||
};
|
||||
|
||||
this.url = parameters.withJs(toggledValue).renderUrl(SearchFilters.this.url);
|
||||
}
|
||||
}
|
||||
|
||||
public class ReduceAdtechOption {
|
||||
private final SearchAdtechParameter value;
|
||||
|
||||
public final String url;
|
||||
public String getUrl() {
|
||||
return url;
|
||||
}
|
||||
|
||||
public boolean isSet() {
|
||||
return value.equals(SearchAdtechParameter.REDUCE);
|
||||
}
|
||||
|
||||
public String name() {
|
||||
return "Reduce Adtech";
|
||||
}
|
||||
|
||||
public ReduceAdtechOption(SearchParameters parameters) {
|
||||
this.value = parameters.adtech();
|
||||
|
||||
var toggledValue = switch (parameters.adtech()) {
|
||||
case REDUCE -> SearchAdtechParameter.DEFAULT;
|
||||
default -> SearchAdtechParameter.REDUCE;
|
||||
};
|
||||
|
||||
this.url = parameters.withAdtech(toggledValue).renderUrl(SearchFilters.this.url);
|
||||
}
|
||||
}
|
||||
|
||||
public class ShowRecentOption {
|
||||
private final SearchRecentParameter value;
|
||||
|
||||
public final String url;
|
||||
public String getUrl() {
|
||||
return url;
|
||||
}
|
||||
|
||||
public boolean isSet() {
|
||||
return value.equals(SearchRecentParameter.RECENT);
|
||||
}
|
||||
|
||||
public String name() {
|
||||
return "Recent Results";
|
||||
}
|
||||
|
||||
public ShowRecentOption(SearchParameters parameters) {
|
||||
this.value = parameters.recent();
|
||||
|
||||
var toggledValue = switch (parameters.recent()) {
|
||||
case RECENT -> SearchRecentParameter.DEFAULT;
|
||||
default -> SearchRecentParameter.RECENT;
|
||||
};
|
||||
|
||||
this.url = parameters.withRecent(toggledValue).renderUrl(SearchFilters.this.url);
|
||||
}
|
||||
}
|
||||
|
||||
public class SearchTitleOption {
|
||||
private final SearchTitleParameter value;
|
||||
|
||||
public final String url;
|
||||
public String getUrl() {
|
||||
return url;
|
||||
}
|
||||
|
||||
public boolean isSet() {
|
||||
return value.equals(SearchTitleParameter.TITLE);
|
||||
}
|
||||
|
||||
public String name() {
|
||||
return "Search In Title";
|
||||
}
|
||||
|
||||
public SearchTitleOption(SearchParameters parameters) {
|
||||
this.value = parameters.searchTitle();
|
||||
|
||||
var toggledValue = switch (parameters.searchTitle()) {
|
||||
case TITLE -> SearchTitleParameter.DEFAULT;
|
||||
default -> SearchTitleParameter.TITLE;
|
||||
};
|
||||
|
||||
this.url = parameters.withTitle(toggledValue).renderUrl(SearchFilters.this.url);
|
||||
}
|
||||
}
|
||||
|
||||
public class Filter {
|
||||
public final SearchProfile profile;
|
||||
|
||||
public final String displayName;
|
||||
public final boolean current;
|
||||
public final String url;
|
||||
|
||||
public Filter(String displayName, SearchProfile profile, SearchParameters parameters) {
|
||||
this.displayName = displayName;
|
||||
this.profile = profile;
|
||||
this.current = profile.equals(parameters.profile());
|
||||
|
||||
this.url = parameters.withProfile(profile).renderUrl(SearchFilters.this.url);
|
||||
}
|
||||
|
||||
public String getDisplayName() {
|
||||
return displayName;
|
||||
}
|
||||
|
||||
public boolean isCurrent() {
|
||||
return current;
|
||||
}
|
||||
|
||||
public String getUrl() {
|
||||
return url;
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,105 @@
|
||||
package nu.marginalia.search.model;
|
||||
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
public enum SearchProfile {
|
||||
POPULAR("default", SearchSetIdentifier.POPULAR),
|
||||
SMALLWEB("modern", SearchSetIdentifier.SMALLWEB),
|
||||
BLOGOSPHERE("blogosphere", SearchSetIdentifier.BLOGS),
|
||||
NO_FILTER("corpo", SearchSetIdentifier.NONE),
|
||||
VINTAGE("vintage", SearchSetIdentifier.NONE),
|
||||
TILDE("tilde", SearchSetIdentifier.NONE),
|
||||
CORPO_CLEAN("corpo-clean", SearchSetIdentifier.NONE),
|
||||
ACADEMIA("academia", SearchSetIdentifier.NONE),
|
||||
PLAIN_TEXT("plain-text", SearchSetIdentifier.NONE),
|
||||
FOOD("food", SearchSetIdentifier.POPULAR),
|
||||
FORUM("forum", SearchSetIdentifier.NONE),
|
||||
WIKI("wiki", SearchSetIdentifier.NONE),
|
||||
DOCS("docs", SearchSetIdentifier.NONE),
|
||||
;
|
||||
|
||||
|
||||
public final String filterId;
|
||||
public final SearchSetIdentifier searchSetIdentifier;
|
||||
|
||||
SearchProfile(String filterId, SearchSetIdentifier searchSetIdentifier) {
|
||||
this.filterId = filterId;
|
||||
this.searchSetIdentifier = searchSetIdentifier;
|
||||
}
|
||||
|
||||
private final static SearchProfile[] values = values();
|
||||
public static SearchProfile getSearchProfile(String param) {
|
||||
if (null == param) {
|
||||
return NO_FILTER;
|
||||
}
|
||||
|
||||
for (var profile : values) {
|
||||
if (Objects.equals(profile.filterId, param)) {
|
||||
return profile;
|
||||
}
|
||||
}
|
||||
|
||||
return NO_FILTER;
|
||||
}
|
||||
|
||||
public void addTacitTerms(SearchQuery subquery) {
|
||||
if (this == ACADEMIA) {
|
||||
subquery.searchTermsAdvice.add("special:academia");
|
||||
}
|
||||
if (this == VINTAGE) {
|
||||
subquery.searchTermsPriority.add("format:html123");
|
||||
subquery.searchTermsPriority.add("js:false");
|
||||
}
|
||||
if (this == TILDE) {
|
||||
subquery.searchTermsAdvice.add("special:tilde");
|
||||
}
|
||||
if (this == PLAIN_TEXT) {
|
||||
subquery.searchTermsAdvice.add("format:plain");
|
||||
}
|
||||
if (this == WIKI) {
|
||||
subquery.searchTermsAdvice.add("generator:wiki");
|
||||
}
|
||||
if (this == FORUM) {
|
||||
subquery.searchTermsAdvice.add("generator:forum");
|
||||
}
|
||||
if (this == DOCS) {
|
||||
subquery.searchTermsAdvice.add("generator:docs");
|
||||
}
|
||||
if (this == FOOD) {
|
||||
subquery.searchTermsAdvice.add(HtmlFeature.CATEGORY_FOOD.getKeyword());
|
||||
subquery.searchTermsExclude.add("special:ads");
|
||||
}
|
||||
}
|
||||
|
||||
public SpecificationLimit getYearLimit() {
|
||||
if (this == SMALLWEB) {
|
||||
return SpecificationLimit.greaterThan(2015);
|
||||
}
|
||||
if (this == VINTAGE) {
|
||||
return SpecificationLimit.lessThan(2003);
|
||||
}
|
||||
else return SpecificationLimit.none();
|
||||
}
|
||||
|
||||
public SpecificationLimit getSizeLimit() {
|
||||
if (this == SMALLWEB) {
|
||||
return SpecificationLimit.lessThan(500);
|
||||
}
|
||||
else return SpecificationLimit.none();
|
||||
}
|
||||
|
||||
|
||||
public SpecificationLimit getQualityLimit() {
|
||||
if (this == SMALLWEB) {
|
||||
return SpecificationLimit.lessThan(5);
|
||||
}
|
||||
else return SpecificationLimit.none();
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -0,0 +1,293 @@
|
||||
package nu.marginalia.search.model;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* A class to hold details about a single search result.
|
||||
*/
|
||||
public class UrlDetails implements Comparable<UrlDetails> {
|
||||
public long id;
|
||||
public int domainId;
|
||||
|
||||
public EdgeUrl url;
|
||||
public String title;
|
||||
public String description;
|
||||
|
||||
public String format;
|
||||
public int features;
|
||||
|
||||
public DomainIndexingState domainState;
|
||||
|
||||
public double termScore;
|
||||
|
||||
public int resultsFromSameDomain;
|
||||
|
||||
public String positions;
|
||||
public int positionsCount;
|
||||
public SearchResultItem resultItem;
|
||||
public List<SearchResultKeywordScore> keywordScores;
|
||||
|
||||
public UrlDetails(long id, int domainId, EdgeUrl url, String title, String description, String format, int features, DomainIndexingState domainState, double termScore, int resultsFromSameDomain, String positions, int positionsCount, SearchResultItem resultItem, List<SearchResultKeywordScore> keywordScores) {
|
||||
this.id = id;
|
||||
this.domainId = domainId;
|
||||
this.url = url;
|
||||
this.title = title;
|
||||
this.description = description;
|
||||
this.format = format;
|
||||
this.features = features;
|
||||
this.domainState = domainState;
|
||||
this.termScore = termScore;
|
||||
this.resultsFromSameDomain = resultsFromSameDomain;
|
||||
this.positions = positions;
|
||||
this.positionsCount = positionsCount;
|
||||
this.resultItem = resultItem;
|
||||
this.keywordScores = keywordScores;
|
||||
}
|
||||
|
||||
public UrlDetails() {
|
||||
}
|
||||
|
||||
public boolean hasMoreResults() {
|
||||
return resultsFromSameDomain > 1;
|
||||
}
|
||||
|
||||
public String getFormat() {
|
||||
if (null == format) {
|
||||
return "?";
|
||||
}
|
||||
switch (format) {
|
||||
case "HTML123":
|
||||
return "HTML 1-3";
|
||||
case "HTML4":
|
||||
return "HTML 4";
|
||||
case "XHTML":
|
||||
return "XHTML";
|
||||
case "HTML5":
|
||||
return "HTML 5";
|
||||
case "PLAIN":
|
||||
return "Plain Text";
|
||||
default:
|
||||
return "?";
|
||||
}
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return Long.hashCode(id);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(UrlDetails other) {
|
||||
int result = Double.compare(getTermScore(), other.getTermScore());
|
||||
if (result == 0) result = Long.compare(getId(), other.getId());
|
||||
return result;
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
if (other == null) {
|
||||
return false;
|
||||
}
|
||||
if (other == this) {
|
||||
return true;
|
||||
}
|
||||
if (other instanceof UrlDetails) {
|
||||
return ((UrlDetails) other).id == id;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
if (title == null || title.isBlank()) {
|
||||
return url.toString();
|
||||
}
|
||||
return title;
|
||||
}
|
||||
|
||||
public boolean isPlainText() {
|
||||
return "PLAIN".equals(format);
|
||||
}
|
||||
|
||||
public int getProblemCount() {
|
||||
int mask = HtmlFeature.JS.getFeatureBit()
|
||||
| HtmlFeature.COOKIES.getFeatureBit()
|
||||
| HtmlFeature.TRACKING.getFeatureBit()
|
||||
| HtmlFeature.AFFILIATE_LINK.getFeatureBit()
|
||||
| HtmlFeature.TRACKING_ADTECH.getFeatureBit()
|
||||
| HtmlFeature.ADVERTISEMENT.getFeatureBit();
|
||||
|
||||
return Integer.bitCount(features & mask);
|
||||
}
|
||||
|
||||
public List<UrlProblem> getProblems() {
|
||||
List<UrlProblem> problems = new ArrayList<>();
|
||||
|
||||
if (isScripts()) {
|
||||
problems.add(new UrlProblem("Js", "The page uses Javascript"));
|
||||
}
|
||||
if (isCookies()) {
|
||||
problems.add(new UrlProblem("Co", "The page uses Cookies"));
|
||||
}
|
||||
if (isTracking()) {
|
||||
problems.add(new UrlProblem("Tr", "The page uses Tracking/Analytics"));
|
||||
}
|
||||
if (isAffiliate()) {
|
||||
problems.add(new UrlProblem("Af", "The page may use Affiliate Linking"));
|
||||
}
|
||||
if (isAds()) {
|
||||
problems.add(new UrlProblem("Ad", "The page uses Ads/Adtech Tracking"));
|
||||
}
|
||||
return problems;
|
||||
|
||||
}
|
||||
|
||||
public boolean isScripts() {
|
||||
return HtmlFeature.hasFeature(features, HtmlFeature.JS);
|
||||
}
|
||||
|
||||
public boolean isTracking() {
|
||||
return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING);
|
||||
}
|
||||
|
||||
public boolean isAffiliate() {
|
||||
return HtmlFeature.hasFeature(features, HtmlFeature.AFFILIATE_LINK);
|
||||
}
|
||||
|
||||
public boolean isMedia() {
|
||||
return HtmlFeature.hasFeature(features, HtmlFeature.MEDIA);
|
||||
}
|
||||
|
||||
public boolean isCookies() {
|
||||
return HtmlFeature.hasFeature(features, HtmlFeature.COOKIES);
|
||||
}
|
||||
|
||||
public boolean isAds() {
|
||||
return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING_ADTECH);
|
||||
}
|
||||
|
||||
public int getMatchRank() {
|
||||
if (termScore <= 1) return 1;
|
||||
if (termScore <= 2) return 2;
|
||||
if (termScore <= 3) return 3;
|
||||
if (termScore <= 5) return 5;
|
||||
|
||||
return 10;
|
||||
}
|
||||
|
||||
public long getId() {
|
||||
return this.id;
|
||||
}
|
||||
|
||||
public int getDomainId() {
|
||||
return this.domainId;
|
||||
}
|
||||
|
||||
public EdgeUrl getUrl() {
|
||||
return this.url;
|
||||
}
|
||||
|
||||
public String getDescription() {
|
||||
return this.description;
|
||||
}
|
||||
|
||||
public int getFeatures() {
|
||||
return this.features;
|
||||
}
|
||||
|
||||
public DomainIndexingState getDomainState() {
|
||||
return this.domainState;
|
||||
}
|
||||
|
||||
public double getTermScore() {
|
||||
return this.termScore;
|
||||
}
|
||||
|
||||
public int getResultsFromSameDomain() {
|
||||
return this.resultsFromSameDomain;
|
||||
}
|
||||
|
||||
public String getPositions() {
|
||||
return this.positions;
|
||||
}
|
||||
|
||||
public int getPositionsCount() {
|
||||
return this.positionsCount;
|
||||
}
|
||||
|
||||
public SearchResultItem getResultItem() {
|
||||
return this.resultItem;
|
||||
}
|
||||
|
||||
public List<SearchResultKeywordScore> getKeywordScores() {
|
||||
return this.keywordScores;
|
||||
}
|
||||
|
||||
public UrlDetails withId(long id) {
|
||||
return this.id == id ? this : new UrlDetails(id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
|
||||
}
|
||||
|
||||
public UrlDetails withDomainId(int domainId) {
|
||||
return this.domainId == domainId ? this : new UrlDetails(this.id, domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
|
||||
}
|
||||
|
||||
public UrlDetails withUrl(EdgeUrl url) {
|
||||
return this.url == url ? this : new UrlDetails(this.id, this.domainId, url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
|
||||
}
|
||||
|
||||
public UrlDetails withTitle(String title) {
|
||||
return this.title == title ? this : new UrlDetails(this.id, this.domainId, this.url, title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
|
||||
}
|
||||
|
||||
public UrlDetails withDescription(String description) {
|
||||
return this.description == description ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
|
||||
}
|
||||
|
||||
public UrlDetails withFormat(String format) {
|
||||
return this.format == format ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
|
||||
}
|
||||
|
||||
public UrlDetails withFeatures(int features) {
|
||||
return this.features == features ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
|
||||
}
|
||||
|
||||
public UrlDetails withDomainState(DomainIndexingState domainState) {
|
||||
return this.domainState == domainState ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
|
||||
}
|
||||
|
||||
public UrlDetails withTermScore(double termScore) {
|
||||
return this.termScore == termScore ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
|
||||
}
|
||||
|
||||
public UrlDetails withResultsFromSameDomain(int resultsFromSameDomain) {
|
||||
return this.resultsFromSameDomain == resultsFromSameDomain ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
|
||||
}
|
||||
|
||||
public UrlDetails withPositions(String positions) {
|
||||
return this.positions == positions ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, positions, this.positionsCount, this.resultItem, this.keywordScores);
|
||||
}
|
||||
|
||||
public UrlDetails withPositionsCount(int positionsCount) {
|
||||
return this.positionsCount == positionsCount ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, positionsCount, this.resultItem, this.keywordScores);
|
||||
}
|
||||
|
||||
public UrlDetails withResultItem(SearchResultItem resultItem) {
|
||||
return this.resultItem == resultItem ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, resultItem, this.keywordScores);
|
||||
}
|
||||
|
||||
public UrlDetails withKeywordScores(List<SearchResultKeywordScore> keywordScores) {
|
||||
return this.keywordScores == keywordScores ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, keywordScores);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "UrlDetails(id=" + this.getId() + ", domainId=" + this.getDomainId() + ", url=" + this.getUrl() + ", title=" + this.getTitle() + ", description=" + this.getDescription() + ", format=" + this.getFormat() + ", features=" + this.getFeatures() + ", domainState=" + this.getDomainState() + ", termScore=" + this.getTermScore() + ", resultsFromSameDomain=" + this.getResultsFromSameDomain() + ", positions=" + this.getPositions() + ", positionsCount=" + this.getPositionsCount() + ", resultItem=" + this.getResultItem() + ", keywordScores=" + this.getKeywordScores() + ")";
|
||||
}
|
||||
|
||||
public static record UrlProblem(String name, String description) {
|
||||
|
||||
}
|
||||
}
|
@@ -0,0 +1,27 @@
|
||||
package nu.marginalia.search.results;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.browse.model.BrowseResult;
|
||||
import nu.marginalia.screenshot.ScreenshotService;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
@Singleton
|
||||
public class BrowseResultCleaner {
|
||||
private final ScreenshotService screenshotService;
|
||||
|
||||
@Inject
|
||||
public BrowseResultCleaner(ScreenshotService screenshotService) {
|
||||
this.screenshotService = screenshotService;
|
||||
}
|
||||
|
||||
public Predicate<BrowseResult> shouldRemoveResultPredicateBr() {
|
||||
Set<String> domainHashes = new HashSet<>(100);
|
||||
|
||||
return (res) -> !screenshotService.hasScreenshot(res.domainId())
|
||||
|| !domainHashes.add(res.domainHash());
|
||||
}
|
||||
}
|
@@ -0,0 +1,69 @@
|
||||
package nu.marginalia.search.results;
|
||||
|
||||
import gnu.trove.list.TLongList;
|
||||
import gnu.trove.list.array.TLongArrayList;
|
||||
import gnu.trove.map.hash.TObjectIntHashMap;
|
||||
import gnu.trove.set.hash.TIntHashSet;
|
||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||
import nu.marginalia.lsh.EasyLSH;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
public class UrlDeduplicator {
|
||||
private final int LSH_SIMILARITY_THRESHOLD = 2;
|
||||
private static final Logger logger = LoggerFactory.getLogger(UrlDeduplicator.class);
|
||||
|
||||
private final TIntHashSet seenSuperficialhashes = new TIntHashSet(200);
|
||||
private final TLongList seehLSHList = new TLongArrayList(200);
|
||||
private final TObjectIntHashMap<String> keyCount = new TObjectIntHashMap<>(200, 0.75f, 0);
|
||||
|
||||
private final int resultsPerKey;
|
||||
public UrlDeduplicator(int resultsPerKey) {
|
||||
this.resultsPerKey = resultsPerKey;
|
||||
}
|
||||
|
||||
public boolean shouldRemove(DecoratedSearchResultItem details) {
|
||||
if (!deduplicateOnSuperficialHash(details))
|
||||
return true;
|
||||
if (!deduplicateOnLSH(details))
|
||||
return true;
|
||||
if (!limitResultsPerDomain(details))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean shouldRetain(DecoratedSearchResultItem details) {
|
||||
return !shouldRemove(details);
|
||||
}
|
||||
|
||||
private boolean deduplicateOnSuperficialHash(DecoratedSearchResultItem details) {
|
||||
return seenSuperficialhashes.add(Objects.hash(details.url.path, details.title));
|
||||
}
|
||||
|
||||
private boolean deduplicateOnLSH(DecoratedSearchResultItem details) {
|
||||
long thisHash = details.dataHash;
|
||||
|
||||
if (0 == thisHash)
|
||||
return true;
|
||||
|
||||
if (seehLSHList.forEach(otherHash -> EasyLSH.hammingDistance(thisHash, otherHash) >= LSH_SIMILARITY_THRESHOLD))
|
||||
{
|
||||
seehLSHList.add(thisHash);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
|
||||
}
|
||||
|
||||
private boolean limitResultsPerDomain(DecoratedSearchResultItem details) {
|
||||
final var domain = details.getUrl().getDomain();
|
||||
final String key = domain.getDomainKey();
|
||||
|
||||
return keyCount.adjustOrPutValue(key, 1, 1) <= resultsPerKey;
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,69 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.WebsiteUrl;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.Spark;
|
||||
|
||||
import java.sql.SQLException;
|
||||
|
||||
public class SearchAddToCrawlQueueService {
|
||||
|
||||
private final DbDomainQueries domainQueries;
|
||||
private final WebsiteUrl websiteUrl;
|
||||
private final HikariDataSource dataSource;
|
||||
private final Logger logger = LoggerFactory.getLogger(SearchAddToCrawlQueueService.class);
|
||||
|
||||
@Inject
|
||||
public SearchAddToCrawlQueueService(DbDomainQueries domainQueries,
|
||||
WebsiteUrl websiteUrl,
|
||||
HikariDataSource dataSource) {
|
||||
this.domainQueries = domainQueries;
|
||||
this.websiteUrl = websiteUrl;
|
||||
this.dataSource = dataSource;
|
||||
}
|
||||
|
||||
public Object suggestCrawling(Request request, Response response) throws SQLException {
|
||||
logger.info("{}", request.queryParams());
|
||||
int id = Integer.parseInt(request.queryParams("id"));
|
||||
boolean nomisclick = "on".equals(request.queryParams("nomisclick"));
|
||||
|
||||
String domainName = getDomainName(id);
|
||||
|
||||
if (nomisclick) {
|
||||
logger.info("Adding {} to crawl queue", domainName);
|
||||
addToCrawlQueue(id);
|
||||
}
|
||||
else {
|
||||
logger.info("Nomisclick not set, not adding {} to crawl queue", domainName);
|
||||
}
|
||||
|
||||
response.redirect(websiteUrl.withPath("/site/" + domainName));
|
||||
|
||||
return "";
|
||||
}
|
||||
|
||||
private void addToCrawlQueue(int id) throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
INSERT IGNORE INTO CRAWL_QUEUE(DOMAIN_NAME, SOURCE)
|
||||
SELECT DOMAIN_NAME, "user" FROM EC_DOMAIN WHERE ID=?
|
||||
""")) {
|
||||
stmt.setInt(1, id);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
private String getDomainName(int id) {
|
||||
var domain = domainQueries.getDomain(id);
|
||||
if (domain.isEmpty())
|
||||
Spark.halt(404);
|
||||
return domain.get().toString();
|
||||
}
|
||||
}
|
||||
|
@@ -0,0 +1,87 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.api.domains.DomainInfoClient;
|
||||
import nu.marginalia.api.domains.model.SimilarDomain;
|
||||
import nu.marginalia.browse.DbBrowseDomainsRandom;
|
||||
import nu.marginalia.browse.model.BrowseResult;
|
||||
import nu.marginalia.browse.model.BrowseResultSet;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.db.DomainBlacklist;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.search.results.BrowseResultCleaner;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
|
||||
import static java.util.Collections.shuffle;
|
||||
|
||||
public class SearchBrowseService {
|
||||
private final DbBrowseDomainsRandom randomDomains;
|
||||
private final DbDomainQueries domainQueries;
|
||||
private final DomainBlacklist blacklist;
|
||||
private final DomainInfoClient domainInfoClient;
|
||||
private final BrowseResultCleaner browseResultCleaner;
|
||||
|
||||
@Inject
|
||||
public SearchBrowseService(DbBrowseDomainsRandom randomDomains,
|
||||
DbDomainQueries domainQueries,
|
||||
DomainBlacklist blacklist,
|
||||
DomainInfoClient domainInfoClient,
|
||||
BrowseResultCleaner browseResultCleaner)
|
||||
{
|
||||
this.randomDomains = randomDomains;
|
||||
this.domainQueries = domainQueries;
|
||||
this.blacklist = blacklist;
|
||||
this.domainInfoClient = domainInfoClient;
|
||||
this.browseResultCleaner = browseResultCleaner;
|
||||
}
|
||||
|
||||
public BrowseResultSet getRandomEntries(int set) {
|
||||
List<BrowseResult> results = randomDomains.getRandomDomains(25, blacklist, set);
|
||||
|
||||
results.removeIf(browseResultCleaner.shouldRemoveResultPredicateBr());
|
||||
|
||||
return new BrowseResultSet(results);
|
||||
}
|
||||
|
||||
public BrowseResultSet getRelatedEntries(String domainName) throws ExecutionException, InterruptedException, TimeoutException {
|
||||
var domain = domainQueries.getDomainId(new EdgeDomain(domainName));
|
||||
|
||||
var neighbors = domainInfoClient.similarDomains(domain, 50)
|
||||
.get(100, TimeUnit.MILLISECONDS);
|
||||
|
||||
neighbors.removeIf(sd -> !sd.screenshot());
|
||||
|
||||
// If the results are very few, supplement with the alternative shitty algorithm
|
||||
if (neighbors.size() < 25) {
|
||||
Set<SimilarDomain> allNeighbors = new HashSet<>(neighbors);
|
||||
allNeighbors.addAll(domainInfoClient
|
||||
.linkedDomains(domain, 50)
|
||||
.get(100, TimeUnit.MILLISECONDS)
|
||||
);
|
||||
|
||||
neighbors.clear();
|
||||
neighbors.addAll(allNeighbors);
|
||||
neighbors.removeIf(sd -> !sd.screenshot());
|
||||
}
|
||||
|
||||
List<BrowseResult> results = new ArrayList<>(neighbors.size());
|
||||
for (SimilarDomain sd : neighbors) {
|
||||
var resultDomain = domainQueries.getDomain(sd.domainId());
|
||||
if (resultDomain.isEmpty())
|
||||
continue;
|
||||
|
||||
results.add(new BrowseResult(resultDomain.get().toRootUrlHttp(), sd.domainId(), 0, sd.screenshot()));
|
||||
}
|
||||
// shuffle the items for a less repetitive experience
|
||||
shuffle(neighbors);
|
||||
|
||||
return new BrowseResultSet(results, domainName);
|
||||
}
|
||||
}
|
@@ -0,0 +1,69 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.renderer.MustacheRenderer;
|
||||
import nu.marginalia.renderer.RendererFactory;
|
||||
import nu.marginalia.search.SearchOperator;
|
||||
import nu.marginalia.search.model.UrlDetails;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.util.List;
|
||||
|
||||
public class SearchCrosstalkService {
|
||||
private static final Logger logger = LoggerFactory.getLogger(SearchCrosstalkService.class);
|
||||
private final SearchOperator searchOperator;
|
||||
private final MustacheRenderer<CrosstalkResult> renderer;
|
||||
|
||||
@Inject
|
||||
public SearchCrosstalkService(SearchOperator searchOperator,
|
||||
RendererFactory rendererFactory) throws IOException
|
||||
{
|
||||
this.searchOperator = searchOperator;
|
||||
this.renderer = rendererFactory.renderer("search/site-info/site-crosstalk");
|
||||
}
|
||||
|
||||
public Object handle(Request request, Response response) throws SQLException {
|
||||
String domains = request.queryParams("domains");
|
||||
String[] parts = StringUtils.split(domains, ',');
|
||||
|
||||
if (parts.length != 2) {
|
||||
throw new IllegalArgumentException("Expected exactly two domains");
|
||||
}
|
||||
|
||||
response.type("text/html");
|
||||
|
||||
for (int i = 0; i < parts.length; i++) {
|
||||
parts[i] = parts[i].trim();
|
||||
}
|
||||
|
||||
var resAtoB = searchOperator.doLinkSearch(parts[0], parts[1]);
|
||||
var resBtoA = searchOperator.doLinkSearch(parts[1], parts[0]);
|
||||
|
||||
var model = new CrosstalkResult(parts[0], parts[1], resAtoB, resBtoA);
|
||||
|
||||
return renderer.render(model);
|
||||
}
|
||||
|
||||
|
||||
|
||||
private record CrosstalkResult(String domainA,
|
||||
String domainB,
|
||||
List<UrlDetails> forward,
|
||||
List<UrlDetails> backward)
|
||||
{
|
||||
|
||||
public boolean isFocusDomain() {
|
||||
return true; // Hack to get the search result templates behave well
|
||||
}
|
||||
public boolean hasBoth() {
|
||||
return !forward.isEmpty() && !backward.isEmpty();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
@@ -0,0 +1,47 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.index.api.IndexMqClient;
|
||||
import nu.marginalia.renderer.MustacheRenderer;
|
||||
import nu.marginalia.renderer.RendererFactory;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
|
||||
public class SearchErrorPageService {
|
||||
private final IndexMqClient indexMqClient;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final MustacheRenderer<Object> renderer;
|
||||
|
||||
@Inject
|
||||
public SearchErrorPageService(IndexMqClient indexMqClient,
|
||||
RendererFactory rendererFactory) throws IOException {
|
||||
|
||||
renderer = rendererFactory.renderer("search/error-page-search");
|
||||
|
||||
this.indexMqClient = indexMqClient;
|
||||
}
|
||||
|
||||
public void serveError(Request request, Response rsp) {
|
||||
rsp.body(renderError(request, "Internal error",
|
||||
"""
|
||||
An error occurred when communicating with the search engine index.
|
||||
<p>
|
||||
This is hopefully a temporary state of affairs. It may be due to
|
||||
an upgrade. The index typically takes a about two or three minutes
|
||||
to reload from a cold restart. Thanks for your patience.
|
||||
"""));
|
||||
}
|
||||
|
||||
private String renderError(Request request, String title, String message) {
|
||||
return renderer.render(Map.of("title", title, "message", message,
|
||||
"profile", request.queryParamOrDefault("profile", ""),
|
||||
"js", request.queryParamOrDefault("js", ""),
|
||||
"query", request.queryParamOrDefault("query", "")
|
||||
));
|
||||
}
|
||||
}
|
@@ -0,0 +1,85 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/** Service for handling flagging sites. This code has an admin-facing correspondent in
|
||||
* DomainComplaintService in control-service
|
||||
*/
|
||||
public class SearchFlagSiteService {
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
private final CategoryItem unknownCategory = new CategoryItem("unknown", "Unknown");
|
||||
|
||||
private final List<CategoryItem> categories =
|
||||
List.of(
|
||||
new CategoryItem("spam", "Spam"),
|
||||
new CategoryItem("freebooting", "Reposting Stolen Content"),
|
||||
new CategoryItem("broken", "Broken Website"),
|
||||
new CategoryItem("shock", "Shocking/Offensive"),
|
||||
new CategoryItem("blacklist", "Review Blacklisting"),
|
||||
new CategoryItem("no-random", "Remove from Random Exploration")
|
||||
);
|
||||
|
||||
private final Map<String, CategoryItem> categoryItemMap =
|
||||
categories.stream().collect(Collectors.toMap(CategoryItem::categoryName, Function.identity()));
|
||||
@Inject
|
||||
public SearchFlagSiteService(HikariDataSource dataSource) {
|
||||
this.dataSource = dataSource;
|
||||
}
|
||||
|
||||
public List<CategoryItem> getCategories() {
|
||||
return categories;
|
||||
}
|
||||
|
||||
public List<FlagSiteComplaintModel> getExistingComplaints(int id) throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var complaintsStmt = conn.prepareStatement("""
|
||||
SELECT CATEGORY, FILE_DATE, REVIEWED, DECISION
|
||||
FROM DOMAIN_COMPLAINT
|
||||
WHERE DOMAIN_ID=?
|
||||
"""))
|
||||
{
|
||||
List<FlagSiteComplaintModel> complaints = new ArrayList<>();
|
||||
|
||||
complaintsStmt.setInt(1, id);
|
||||
ResultSet rs = complaintsStmt.executeQuery();
|
||||
|
||||
while (rs.next()) {
|
||||
complaints.add(new FlagSiteComplaintModel(
|
||||
categoryItemMap.getOrDefault(rs.getString(1), unknownCategory).categoryDesc,
|
||||
rs.getString(2),
|
||||
rs.getBoolean(3),
|
||||
rs.getString(4)));
|
||||
}
|
||||
|
||||
return complaints;
|
||||
}
|
||||
}
|
||||
|
||||
public void insertComplaint(FlagSiteFormData formData) throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement(
|
||||
"""
|
||||
INSERT INTO DOMAIN_COMPLAINT(DOMAIN_ID, CATEGORY, DESCRIPTION, SAMPLE) VALUES (?, ?, ?, ?)
|
||||
""")) {
|
||||
stmt.setInt(1, formData.domainId);
|
||||
stmt.setString(2, formData.category);
|
||||
stmt.setString(3, formData.description);
|
||||
stmt.setString(4, formData.sampleQuery);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
public record CategoryItem(String categoryName, String categoryDesc) {}
|
||||
public record FlagSiteComplaintModel(String category, String submitTime, boolean isReviewed, String decision) {}
|
||||
public record FlagSiteFormData(int domainId, String category, String description, String sampleQuery) {}
|
||||
}
|
@@ -0,0 +1,117 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.renderer.MustacheRenderer;
|
||||
import nu.marginalia.renderer.RendererFactory;
|
||||
import nu.marginalia.search.svc.SearchQueryCountService;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.time.LocalDate;
|
||||
import java.time.ZoneId;
|
||||
import java.time.ZonedDateTime;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/** Renders the front page (index) */
|
||||
@Singleton
|
||||
public class SearchFrontPageService {
|
||||
|
||||
private final MustacheRenderer<IndexModel> template;
|
||||
private final HikariDataSource dataSource;
|
||||
private final SearchQueryCountService searchVisitorCount;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@Inject
|
||||
public SearchFrontPageService(RendererFactory rendererFactory,
|
||||
HikariDataSource dataSource,
|
||||
SearchQueryCountService searchVisitorCount
|
||||
) throws IOException {
|
||||
this.template = rendererFactory.renderer("search/index/index");
|
||||
this.dataSource = dataSource;
|
||||
this.searchVisitorCount = searchVisitorCount;
|
||||
}
|
||||
|
||||
public String render(Request request, Response response) {
|
||||
response.header("Cache-control", "public,max-age=3600");
|
||||
|
||||
return template.render(new IndexModel(
|
||||
getNewsItems(),
|
||||
searchVisitorCount.getQueriesPerMinute()
|
||||
));
|
||||
}
|
||||
|
||||
|
||||
private List<NewsItem> getNewsItems() {
|
||||
List<NewsItem> items = new ArrayList<>();
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT TITLE, LINK, SOURCE, LIST_DATE FROM SEARCH_NEWS_FEED ORDER BY LIST_DATE DESC
|
||||
""")) {
|
||||
|
||||
var rep = stmt.executeQuery();
|
||||
|
||||
while (rep.next()) {
|
||||
items.add(new NewsItem(
|
||||
rep.getString(1),
|
||||
rep.getString(2),
|
||||
rep.getString(3),
|
||||
rep.getDate(4).toLocalDate()));
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.warn("Failed to fetch news items", ex);
|
||||
}
|
||||
|
||||
return items;
|
||||
}
|
||||
|
||||
public Object renderNewsFeed(Request request, Response response) {
|
||||
List<NewsItem> newsItems = getNewsItems();
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
sb.append("""
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<rss version="2.0">
|
||||
<channel>
|
||||
<title>Marginalia Search News and Mentions</title>
|
||||
<link>https://search.marginalia.nu/</link>
|
||||
<description>News and Mentions of Marginalia Search</description>
|
||||
<language>en-us</language>
|
||||
<ttl>60</ttl>
|
||||
""");
|
||||
|
||||
sb.append("<lastBuildDate>").append(ZonedDateTime.now().format(DateTimeFormatter.RFC_1123_DATE_TIME)).append("</lastBuildDate>\n");
|
||||
sb.append("<pubDate>").append(ZonedDateTime.now().format(DateTimeFormatter.RFC_1123_DATE_TIME)).append("</pubDate>\n");
|
||||
sb.append("<ttl>60</ttl>\n");
|
||||
for (var item : newsItems) {
|
||||
sb.append("<item>\n");
|
||||
sb.append("<title>").append(item.title()).append("</title>\n");
|
||||
sb.append("<link>").append(item.url()).append("</link>\n");
|
||||
if (item.source != null) {
|
||||
sb.append("<author>").append(item.source()).append("</author>\n");
|
||||
}
|
||||
sb.append("<pubDate>").append(item.date().atStartOfDay().atZone(ZoneId.systemDefault()).format(DateTimeFormatter.RFC_1123_DATE_TIME)).append("</pubDate>\n");
|
||||
sb.append("</item>\n");
|
||||
}
|
||||
sb.append("</channel>\n");
|
||||
sb.append("</rss>\n");
|
||||
|
||||
response.type("application/rss+xml");
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/** Model pairing a list of news items with a searches-per-minute figure (presumably rendered on the index page; confirm against the template). */
private record IndexModel(List<NewsItem> news, int searchPerMinute) { }
|
||||
/** A single news entry: its title, link URL, source (may be null, see the null check in renderNewsFeed) and listing date. */
private record NewsItem(String title, String url, String source, LocalDate date) {}
|
||||
}
|
@@ -0,0 +1,48 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.inject.Singleton;
|
||||
import java.time.temporal.ChronoUnit;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
/** Keeps per-minute statistics of queries */
|
||||
@Singleton
|
||||
public class SearchQueryCountService {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final AtomicInteger lastMinuteQueries = new AtomicInteger();
|
||||
|
||||
private final TimeUnit minute = TimeUnit.of(ChronoUnit.MINUTES);
|
||||
private volatile int queriesPerMinute;
|
||||
|
||||
public SearchQueryCountService() {
|
||||
Thread updateThread = new Thread(this::updateQueriesPerMinute,
|
||||
"SearchVisitorCountService::updateQueriesPerMinute");
|
||||
updateThread.setDaemon(true);
|
||||
updateThread.start();
|
||||
}
|
||||
|
||||
/** Retreive the number of queries performed the minute before this one */
|
||||
public int getQueriesPerMinute() {
|
||||
return queriesPerMinute;
|
||||
}
|
||||
|
||||
/** Update query statistics for presentation */
|
||||
public void registerQuery() {
|
||||
lastMinuteQueries.incrementAndGet();
|
||||
}
|
||||
|
||||
private void updateQueriesPerMinute() {
|
||||
try {
|
||||
for (;;) {
|
||||
queriesPerMinute = lastMinuteQueries.getAndSet(0);
|
||||
minute.sleep(1);
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
logger.warn("Query counter thread was interrupted");
|
||||
}
|
||||
|
||||
}
|
||||
}
|
@@ -0,0 +1,62 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.WebsiteUrl;
|
||||
import nu.marginalia.search.command.CommandEvaluator;
|
||||
import nu.marginalia.search.command.SearchParameters;
|
||||
import nu.marginalia.search.exceptions.RedirectException;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
|
||||
public class SearchQueryService {
|
||||
|
||||
private final WebsiteUrl websiteUrl;
|
||||
private final SearchErrorPageService errorPageService;
|
||||
private final CommandEvaluator searchCommandEvaulator;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@Inject
|
||||
public SearchQueryService(
|
||||
WebsiteUrl websiteUrl,
|
||||
SearchErrorPageService errorPageService,
|
||||
CommandEvaluator searchCommandEvaulator) {
|
||||
this.websiteUrl = websiteUrl;
|
||||
this.errorPageService = errorPageService;
|
||||
this.searchCommandEvaulator = searchCommandEvaulator;
|
||||
}
|
||||
|
||||
public Object pathSearch(Request request, Response response) {
|
||||
try {
|
||||
return searchCommandEvaulator.eval(response, parseParameters(request));
|
||||
}
|
||||
catch (RedirectException ex) {
|
||||
response.redirect(ex.newUrl);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Error", ex);
|
||||
errorPageService.serveError(request, response);
|
||||
}
|
||||
|
||||
return "";
|
||||
}
|
||||
|
||||
private SearchParameters parseParameters(Request request) {
|
||||
try {
|
||||
final String queryParam = request.queryParams("query");
|
||||
|
||||
if (null == queryParam || queryParam.isBlank()) {
|
||||
throw new RedirectException(websiteUrl.url());
|
||||
}
|
||||
|
||||
return new SearchParameters(queryParam.trim(), request);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
// Bots keep sending bad requests, suppress the error otherwise it will
|
||||
// fill up the logs.
|
||||
|
||||
throw new RedirectException(websiteUrl.url());
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,416 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.api.domains.DomainInfoClient;
|
||||
import nu.marginalia.api.domains.model.DomainInformation;
|
||||
import nu.marginalia.api.domains.model.SimilarDomain;
|
||||
import nu.marginalia.api.feeds.FeedsClient;
|
||||
import nu.marginalia.api.feeds.RpcFeed;
|
||||
import nu.marginalia.api.feeds.RpcFeedItem;
|
||||
import nu.marginalia.api.livecapture.LiveCaptureClient;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.renderer.MustacheRenderer;
|
||||
import nu.marginalia.renderer.RendererFactory;
|
||||
import nu.marginalia.screenshot.ScreenshotService;
|
||||
import nu.marginalia.search.SearchOperator;
|
||||
import nu.marginalia.search.model.UrlDetails;
|
||||
import nu.marginalia.search.svc.SearchFlagSiteService.FlagSiteFormData;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.concurrent.Future;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.function.Supplier;
|
||||
|
||||
public class SearchSiteInfoService {
|
||||
private static final Logger logger = LoggerFactory.getLogger(SearchSiteInfoService.class);
|
||||
|
||||
private final SearchOperator searchOperator;
|
||||
private final DomainInfoClient domainInfoClient;
|
||||
private final SearchFlagSiteService flagSiteService;
|
||||
private final DbDomainQueries domainQueries;
|
||||
private final MustacheRenderer<Object> renderer;
|
||||
private final FeedsClient feedsClient;
|
||||
private final LiveCaptureClient liveCaptureClient;
|
||||
private final ScreenshotService screenshotService;
|
||||
|
||||
@Inject
|
||||
public SearchSiteInfoService(SearchOperator searchOperator,
|
||||
DomainInfoClient domainInfoClient,
|
||||
RendererFactory rendererFactory,
|
||||
SearchFlagSiteService flagSiteService,
|
||||
DbDomainQueries domainQueries,
|
||||
FeedsClient feedsClient,
|
||||
LiveCaptureClient liveCaptureClient,
|
||||
ScreenshotService screenshotService) throws IOException
|
||||
{
|
||||
this.searchOperator = searchOperator;
|
||||
this.domainInfoClient = domainInfoClient;
|
||||
this.flagSiteService = flagSiteService;
|
||||
this.domainQueries = domainQueries;
|
||||
|
||||
this.renderer = rendererFactory.renderer("search/site-info/site-info");
|
||||
|
||||
this.feedsClient = feedsClient;
|
||||
this.liveCaptureClient = liveCaptureClient;
|
||||
this.screenshotService = screenshotService;
|
||||
}
|
||||
|
||||
public Object handle(Request request, Response response) throws SQLException {
|
||||
String domainName = request.params("site");
|
||||
String view = request.queryParamOrDefault("view", "info");
|
||||
|
||||
if (null == domainName || domainName.isBlank()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
var model = switch (view) {
|
||||
case "links" -> listLinks(domainName);
|
||||
case "docs" -> listDocs(domainName);
|
||||
case "info" -> listInfo(domainName);
|
||||
case "report" -> reportSite(domainName);
|
||||
default -> listInfo(domainName);
|
||||
};
|
||||
|
||||
return renderer.render(model);
|
||||
}
|
||||
|
||||
public Object handlePost(Request request, Response response) throws SQLException {
|
||||
String domainName = request.params("site");
|
||||
String view = request.queryParamOrDefault("view", "info");
|
||||
|
||||
if (null == domainName || domainName.isBlank()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (!view.equals("report"))
|
||||
return null;
|
||||
|
||||
final int domainId = domainQueries.getDomainId(new EdgeDomain(domainName));
|
||||
|
||||
FlagSiteFormData formData = new FlagSiteFormData(
|
||||
domainId,
|
||||
request.queryParams("category"),
|
||||
request.queryParams("description"),
|
||||
request.queryParams("sampleQuery")
|
||||
);
|
||||
flagSiteService.insertComplaint(formData);
|
||||
|
||||
var complaints = flagSiteService.getExistingComplaints(domainId);
|
||||
|
||||
var model = new ReportDomain(domainName, domainId, complaints, List.of(), true);
|
||||
|
||||
return renderer.render(model);
|
||||
}
|
||||
|
||||
private Object reportSite(String domainName) throws SQLException {
|
||||
int domainId = domainQueries.getDomainId(new EdgeDomain(domainName));
|
||||
var existingComplaints = flagSiteService.getExistingComplaints(domainId);
|
||||
|
||||
return new ReportDomain(domainName,
|
||||
domainId,
|
||||
existingComplaints,
|
||||
flagSiteService.getCategories(),
|
||||
false);
|
||||
}
|
||||
|
||||
|
||||
private Backlinks listLinks(String domainName) {
|
||||
return new Backlinks(domainName,
|
||||
domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1),
|
||||
searchOperator.doBacklinkSearch(domainName));
|
||||
}
|
||||
|
||||
private SiteInfoWithContext listInfo(String domainName) {
|
||||
|
||||
final int domainId = domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1);
|
||||
|
||||
final Future<DomainInformation> domainInfoFuture;
|
||||
final Future<List<SimilarDomain>> similarSetFuture;
|
||||
final Future<List<SimilarDomain>> linkingDomainsFuture;
|
||||
final CompletableFuture<RpcFeed> feedItemsFuture;
|
||||
String url = "https://" + domainName + "/";
|
||||
|
||||
boolean hasScreenshot = screenshotService.hasScreenshot(domainId);
|
||||
|
||||
|
||||
if (domainId < 0) {
|
||||
domainInfoFuture = CompletableFuture.failedFuture(new Exception("Unknown Domain ID"));
|
||||
similarSetFuture = CompletableFuture.failedFuture(new Exception("Unknown Domain ID"));
|
||||
linkingDomainsFuture = CompletableFuture.failedFuture(new Exception("Unknown Domain ID"));
|
||||
feedItemsFuture = CompletableFuture.failedFuture(new Exception("Unknown Domain ID"));
|
||||
}
|
||||
else if (!domainInfoClient.isAccepting()) {
|
||||
domainInfoFuture = CompletableFuture.failedFuture(new Exception("Assistant Service Unavailable"));
|
||||
similarSetFuture = CompletableFuture.failedFuture(new Exception("Assistant Service Unavailable"));
|
||||
linkingDomainsFuture = CompletableFuture.failedFuture(new Exception("Assistant Service Unavailable"));
|
||||
feedItemsFuture = CompletableFuture.failedFuture(new Exception("Assistant Service Unavailable"));
|
||||
}
|
||||
else {
|
||||
domainInfoFuture = domainInfoClient.domainInformation(domainId);
|
||||
similarSetFuture = domainInfoClient.similarDomains(domainId, 25);
|
||||
linkingDomainsFuture = domainInfoClient.linkedDomains(domainId, 25);
|
||||
feedItemsFuture = feedsClient.getFeed(domainId);
|
||||
}
|
||||
|
||||
List<UrlDetails> sampleResults = searchOperator.doSiteSearch(domainName, domainId,5);
|
||||
if (!sampleResults.isEmpty()) {
|
||||
url = sampleResults.getFirst().url.withPathAndParam("/", null).toString();
|
||||
}
|
||||
|
||||
var result = new SiteInfoWithContext(domainName,
|
||||
domainId,
|
||||
url,
|
||||
hasScreenshot,
|
||||
waitForFuture(domainInfoFuture, () -> createDummySiteInfo(domainName)),
|
||||
waitForFuture(similarSetFuture, List::of),
|
||||
waitForFuture(linkingDomainsFuture, List::of),
|
||||
waitForFuture(feedItemsFuture.thenApply(FeedItems::new), () -> FeedItems.dummyValue(domainName)),
|
||||
sampleResults
|
||||
);
|
||||
|
||||
requestMissingScreenshots(result);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/** Request missing screenshots for the given site info */
|
||||
private void requestMissingScreenshots(SiteInfoWithContext result) {
|
||||
|
||||
// Always request the main site screenshot, even if we already have it
|
||||
// as this will make the live-capture do a staleness check and update
|
||||
// as needed.
|
||||
liveCaptureClient.requestScreengrab(result.domainId());
|
||||
|
||||
int requests = 1;
|
||||
|
||||
// Request screenshots for similar and linking domains only if they are absent
|
||||
// also throttle the requests to at most 5 per view.
|
||||
|
||||
if (result.similar() != null) {
|
||||
for (var similar : result.similar()) {
|
||||
if (similar.screenshot()) {
|
||||
continue;
|
||||
}
|
||||
if (++requests > 5) {
|
||||
break;
|
||||
}
|
||||
|
||||
liveCaptureClient.requestScreengrab(similar.domainId());
|
||||
}
|
||||
}
|
||||
|
||||
if (result.linking() != null) {
|
||||
for (var linking : result.linking()) {
|
||||
if (linking.screenshot()) {
|
||||
continue;
|
||||
}
|
||||
if (++requests > 5) {
|
||||
break;
|
||||
}
|
||||
|
||||
liveCaptureClient.requestScreengrab(linking.domainId());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private <T> T waitForFuture(Future<T> future, Supplier<T> fallback) {
|
||||
try {
|
||||
return future.get(250, TimeUnit.MILLISECONDS);
|
||||
} catch (Exception e) {
|
||||
logger.info("Failed to get domain data: {}", e.getMessage());
|
||||
return fallback.get();
|
||||
}
|
||||
}
|
||||
|
||||
private DomainInformation createDummySiteInfo(String domainName) {
|
||||
return DomainInformation.builder()
|
||||
.domain(new EdgeDomain(domainName))
|
||||
.suggestForCrawling(true)
|
||||
.unknownDomain(true)
|
||||
.build();
|
||||
}
|
||||
|
||||
private Docs listDocs(String domainName) {
|
||||
int domainId = domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1);
|
||||
return new Docs(domainName,
|
||||
domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1),
|
||||
searchOperator.doSiteSearch(domainName, domainId, 100));
|
||||
}
|
||||
|
||||
public record Docs(Map<String, Boolean> view,
|
||||
String domain,
|
||||
long domainId,
|
||||
List<UrlDetails> results) {
|
||||
public Docs(String domain, long domainId, List<UrlDetails> results) {
|
||||
this(Map.of("docs", true), domain, domainId, results);
|
||||
}
|
||||
|
||||
public String focusDomain() { return domain; }
|
||||
|
||||
public String query() { return "site:" + domain; }
|
||||
|
||||
public boolean isKnown() {
|
||||
return domainId > 0;
|
||||
}
|
||||
}
|
||||
|
||||
public record Backlinks(Map<String, Boolean> view, String domain, long domainId, List<UrlDetails> results) {
|
||||
public Backlinks(String domain, long domainId, List<UrlDetails> results) {
|
||||
this(Map.of("links", true), domain, domainId, results);
|
||||
}
|
||||
|
||||
public String query() { return "links:" + domain; }
|
||||
|
||||
public boolean isKnown() {
|
||||
return domainId > 0;
|
||||
}
|
||||
}
|
||||
|
||||
public record SiteInfoWithContext(Map<String, Boolean> view,
|
||||
Map<String, Boolean> domainState,
|
||||
String domain,
|
||||
int domainId,
|
||||
String siteUrl,
|
||||
boolean hasScreenshot,
|
||||
DomainInformation domainInformation,
|
||||
List<SimilarDomain> similar,
|
||||
List<SimilarDomain> linking,
|
||||
FeedItems feed,
|
||||
List<UrlDetails> samples
|
||||
) {
|
||||
public SiteInfoWithContext(String domain,
|
||||
int domainId,
|
||||
String siteUrl,
|
||||
boolean hasScreenshot,
|
||||
DomainInformation domainInformation,
|
||||
List<SimilarDomain> similar,
|
||||
List<SimilarDomain> linking,
|
||||
FeedItems feedInfo,
|
||||
List<UrlDetails> samples
|
||||
)
|
||||
{
|
||||
this(Map.of("info", true),
|
||||
Map.of(domainInfoState(domainInformation), true),
|
||||
domain,
|
||||
domainId,
|
||||
siteUrl,
|
||||
hasScreenshot,
|
||||
domainInformation,
|
||||
similar,
|
||||
linking,
|
||||
feedInfo,
|
||||
samples);
|
||||
}
|
||||
|
||||
public String getLayout() {
|
||||
// My CSS is too weak to handle this in CSS alone, so I guess we're doing layout in Java...
|
||||
if (similar != null && similar.size() < 25) {
|
||||
return "lopsided";
|
||||
}
|
||||
else if (feed != null && !feed.items().isEmpty()) {
|
||||
return "lopsided";
|
||||
}
|
||||
else if (samples != null && !samples.isEmpty()) {
|
||||
return "lopsided";
|
||||
}
|
||||
else {
|
||||
return "balanced";
|
||||
}
|
||||
}
|
||||
|
||||
public String query() { return "site:" + domain; }
|
||||
|
||||
private static String domainInfoState(DomainInformation info) {
|
||||
if (info.isBlacklisted()) {
|
||||
return "blacklisted";
|
||||
}
|
||||
if (!info.isUnknownDomain() && info.isSuggestForCrawling()) {
|
||||
return "suggestForCrawling";
|
||||
}
|
||||
if (info.isInCrawlQueue()) {
|
||||
return "inCrawlQueue";
|
||||
}
|
||||
if (info.isUnknownDomain()) {
|
||||
return "unknownDomain";
|
||||
}
|
||||
else {
|
||||
return "indexed";
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isKnown() {
|
||||
return domainId > 0;
|
||||
}
|
||||
}
|
||||
|
||||
public record FeedItem(String title, String date, String description, String url) {
|
||||
|
||||
public FeedItem(RpcFeedItem rpcFeedItem) {
|
||||
this(rpcFeedItem.getTitle(),
|
||||
rpcFeedItem.getDate(),
|
||||
rpcFeedItem.getDescription(),
|
||||
rpcFeedItem.getUrl());
|
||||
}
|
||||
|
||||
public String pubDay() { // Extract the date from an ISO style date string
|
||||
if (date.length() > 10) {
|
||||
return date.substring(0, 10);
|
||||
}
|
||||
return date;
|
||||
}
|
||||
|
||||
public String descriptionSafe() {
|
||||
return description
|
||||
.replace("<", "<")
|
||||
.replace(">", ">");
|
||||
}
|
||||
}
|
||||
|
||||
public record FeedItems(String domain, String feedUrl, String updated, List<FeedItem> items) {
|
||||
|
||||
public static FeedItems dummyValue(String domain) {
|
||||
return new FeedItems(domain, "", "", List.of());
|
||||
}
|
||||
|
||||
public FeedItems(RpcFeed rpcFeedItems) {
|
||||
this(rpcFeedItems.getDomain(),
|
||||
rpcFeedItems.getFeedUrl(),
|
||||
rpcFeedItems.getUpdated(),
|
||||
rpcFeedItems.getItemsList().stream().map(FeedItem::new).toList());
|
||||
}
|
||||
}
|
||||
|
||||
public record ReportDomain(
|
||||
Map<String, Boolean> view,
|
||||
String domain,
|
||||
int domainId,
|
||||
List<SearchFlagSiteService.FlagSiteComplaintModel> complaints,
|
||||
List<SearchFlagSiteService.CategoryItem> category,
|
||||
boolean submitted)
|
||||
{
|
||||
public ReportDomain(String domain,
|
||||
int domainId,
|
||||
List<SearchFlagSiteService.FlagSiteComplaintModel> complaints,
|
||||
List<SearchFlagSiteService.CategoryItem> category,
|
||||
boolean submitted) {
|
||||
this(Map.of("report", true), domain, domainId, complaints, category, submitted);
|
||||
}
|
||||
|
||||
public String query() { return "site:" + domain; }
|
||||
|
||||
public boolean isKnown() {
|
||||
return domainId > 0;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,73 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import nu.marginalia.api.math.MathClient;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.annotation.CheckForNull;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.concurrent.Future;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@Singleton
|
||||
public class SearchUnitConversionService {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final Pattern conversionPattern = Pattern.compile("((\\d+|\\s+|[.()\\-^+%*/]|log[^a-z]|log2[^a-z]|sqrt[^a-z]|log10|cos[^a-z]|sin[^a-z]|tan[^a-z]|log2|pi[^a-z]|e[^a-z]|2pi[^a-z])+)\\s*([a-zA-Z][a-zA-Z^.0-9]*\\s?[a-zA-Z^.0-9]*)\\s+in\\s+([a-zA-Z^.0-9]+\\s?[a-zA-Z^.0-9]*)");
|
||||
private final Predicate<String> evalPredicate = Pattern.compile("(\\d+|\\s+|[.()\\-^+%*/]|log|log2|sqrt|log10|cos|sin|tan|pi|e|2pi)+").asMatchPredicate();
|
||||
|
||||
private final MathClient mathClient;
|
||||
|
||||
@Inject
|
||||
public SearchUnitConversionService(MathClient mathClient) {
|
||||
this.mathClient = mathClient;
|
||||
}
|
||||
|
||||
public Optional<String> tryConversion(String query) {
|
||||
var matcher = conversionPattern.matcher(query);
|
||||
if (!matcher.matches())
|
||||
return Optional.empty();
|
||||
|
||||
String value = matcher.group(1);
|
||||
String from = matcher.group(3);
|
||||
String to = matcher.group(4);
|
||||
|
||||
logger.info("{} -> '{}' '{}' '{}'", query, value, from, to);
|
||||
|
||||
try {
|
||||
var resultFuture = mathClient.unitConversion(value, from, to);
|
||||
return Optional.of(
|
||||
resultFuture.get(250, TimeUnit.MILLISECONDS)
|
||||
);
|
||||
} catch (ExecutionException e) {
|
||||
logger.error("Error in unit conversion", e);
|
||||
} catch (InterruptedException e) {
|
||||
logger.error("Interrupted while waiting for unit conversion", e);
|
||||
} catch (TimeoutException e) {
|
||||
// Ignore
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
|
||||
public @CheckForNull Future<String> tryEval(String query) {
|
||||
if (!evalPredicate.test(query)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
var expr = query.toLowerCase().trim();
|
||||
|
||||
if (expr.chars().allMatch(Character::isDigit)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
logger.info("eval({})", expr);
|
||||
|
||||
return mathClient.evalMath(expr);
|
||||
}
|
||||
}
|
@@ -0,0 +1,3 @@
|
||||
# Search Service
|
||||
|
||||
This is the old search service that serves search traffic with the old GUI.
|
Before Width: | Height: | Size: 1.2 KiB After Width: | Height: | Size: 1.2 KiB |
@@ -0,0 +1,15 @@
|
||||
<OpenSearchDescription xmlns="http://a9.com/-/spec/opensearch/1.1/"
|
||||
xmlns:moz="http://www.mozilla.org/2006/browser/search/">
|
||||
<!-- Thanks everyone who has told me about this :) -->
|
||||
|
||||
<!-- By the way, check out https://api.marginalia.nu/ if you wish to automate this,
|
||||
if you try to use the endpoint below you'll probably run into trouble with cloudflare :-/
|
||||
-->
|
||||
<ShortName>Marginalia</ShortName>
|
||||
<Description>Search Marginalia</Description>
|
||||
<InputEncoding>UTF-8</InputEncoding>
|
||||
<Image width="16" height="16" type="image/x-icon">https://old-search.marginalia.nu/favicon.ico</Image>
|
||||
<Url type="text/html" method="get"
|
||||
template="https://old-search.marginalia.nu/search?query={searchTerms}&amp;ref=opensearch"/>
|
||||
<moz:SearchForm>https://old-search.marginalia.nu/</moz:SearchForm>
|
||||
</OpenSearchDescription>
|
Before Width: | Height: | Size: 891 B After Width: | Height: | Size: 891 B |
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user