mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
75 Commits
deploy-027
...
deploy-029
Author | SHA1 | Date | |
---|---|---|---|
|
120209e138 | ||
|
a771a5b6ce | ||
|
dac5b54128 | ||
|
6cfb143c15 | ||
|
23c818281b | ||
|
8aad253cf6 | ||
|
556d7af9dc | ||
|
b7a5219ed3 | ||
|
a23ec521fe | ||
|
fff3babc6d | ||
|
b2bfb8217c | ||
|
3b2ac414dc | ||
|
0ba6515a01 | ||
|
16c6b0f151 | ||
|
e998692900 | ||
|
eeb1695a87 | ||
|
a0ab910940 | ||
|
b9f31048d7 | ||
|
12c304289a | ||
|
6ee01dabea | ||
|
1b80e282a7 | ||
|
a65d18f1d1 | ||
|
90a1ff220b | ||
|
d6c7092335 | ||
|
b716333856 | ||
|
b504b8482c | ||
|
80da1e9ad1 | ||
|
d3f744a441 | ||
|
60fb539875 | ||
|
7f5094fedf | ||
|
45066636a5 | ||
|
e2d6898c51 | ||
|
58ef767b94 | ||
|
f9f268c67a | ||
|
f44c2bdee9 | ||
|
6fdf477c18 | ||
|
6b6e455e3f | ||
|
a3a126540c | ||
|
842b19da40 | ||
|
2a30e93bf0 | ||
|
3d998f12c0 | ||
|
cbccc2ac23 | ||
|
2cfc23f9b7 | ||
|
88fe394cdb | ||
|
f30fcebd4f | ||
|
5d885927b4 | ||
|
7622c8358e | ||
|
69ed9aef47 | ||
|
4c78c223da | ||
|
71b9935dd6 | ||
|
ad38f2fd83 | ||
|
9c47388846 | ||
|
d9ab10e33f | ||
|
e13ea7f42b | ||
|
f38daeb036 | ||
|
6e214293e5 | ||
|
52582a6d7d | ||
|
ec0e39ad32 | ||
|
6a15aee4b0 | ||
|
bd5111e8a2 | ||
|
1ecbeb0272 | ||
|
b91354925d | ||
|
3f85c9c154 | ||
|
89e03d6914 | ||
|
14e0bc9f26 | ||
|
7065b46c6f | ||
|
0372190c90 | ||
|
ceaf32fb90 | ||
|
b57db01415 | ||
|
ce7d522608 | ||
|
18649b6ee9 | ||
|
f6417aef1a | ||
|
2aa7e376b0 | ||
|
f33bc44860 | ||
|
a2826efd44 |
@@ -48,10 +48,6 @@ filter for any API consumer.
|
||||
|
||||
I've talked to the stract dev and he does not think it's a good idea to mimic their optics language, which is quite ad-hoc, but instead to work together to find some new common description language for this.
|
||||
|
||||
## Show favicons next to search results
|
||||
|
||||
This is expected from search engines. Basic proof of concept sketch of fetching this data has been done, but the feature is some way from being reality.
|
||||
|
||||
## Specialized crawler for GitHub
|
||||
|
||||
One of the search engine's biggest limitations right now is that it does not index GitHub at all. A specialized crawler that fetches at least the readme.md would go a long way toward providing search capabilities in this domain.
|
||||
@@ -66,6 +62,10 @@ The documents database probably should have some sort of flag indicating it's a
|
||||
PDF parsing is known to be a bit of a security liability so some thought needs to be put in
|
||||
that direction as well.
|
||||
|
||||
## Show favicons next to search results (COMPLETED 2025-03)
|
||||
|
||||
This is expected from search engines. Basic proof of concept sketch of fetching this data has been done, but the feature is some way from being reality.
|
||||
|
||||
## Web Design Overhaul (COMPLETED 2025-01)
|
||||
|
||||
The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
|
||||
|
@@ -5,13 +5,15 @@ import java.util.Collection;
|
||||
public enum HtmlFeature {
|
||||
// Note, the first 32 of these features are bit encoded in the database
|
||||
// so be sure to keep anything that's potentially important toward the top
|
||||
// of the list
|
||||
// of the list; but adding new values will shift the encoded values and break
|
||||
// binary compatibility! Scroll down for a marker where you should add new values
|
||||
// if they need to be accessible from IndexResultScoreCalculator!
|
||||
|
||||
MEDIA( "special:media"),
|
||||
JS("special:scripts"),
|
||||
AFFILIATE_LINK( "special:affiliate"),
|
||||
TRACKING("special:tracking"),
|
||||
TRACKING_ADTECH("special:ads"), // We'll call this ads for now
|
||||
TRACKING_ADTECH("special:adtech"),
|
||||
|
||||
KEBAB_CASE_URL("special:kcurl"), // https://www.example.com/urls-that-look-like-this/
|
||||
LONG_URL("special:longurl"),
|
||||
@@ -30,6 +32,15 @@ public enum HtmlFeature {
|
||||
|
||||
PDF("format:pdf"),
|
||||
|
||||
POPOVER("special:popover"),
|
||||
CONSENT("special:consent"),
|
||||
SHORT_DOCUMENT("special:shorty"),
|
||||
THIRD_PARTY_REQUESTS("special:3pr"),
|
||||
|
||||
// Here! It is generally safe to add additional values here without
|
||||
// disrupting the encoded values used by the DocumentValuator
|
||||
// class in the index!
|
||||
|
||||
/** For fingerprinting and ranking */
|
||||
OPENGRAPH("special:opengraph"),
|
||||
OPENGRAPH_IMAGE("special:opengraph:image"),
|
||||
@@ -67,6 +78,7 @@ public enum HtmlFeature {
|
||||
|
||||
S3_FEATURE("special:s3"),
|
||||
|
||||
MISSING_DOM_SAMPLE("special:nosample"),
|
||||
UNKNOWN("special:uncategorized");
|
||||
|
||||
|
||||
@@ -93,6 +105,8 @@ public enum HtmlFeature {
|
||||
}
|
||||
|
||||
public int getFeatureBit() {
|
||||
if (getClass().desiredAssertionStatus() && ordinal() >= 32)
|
||||
throw new IllegalStateException("Attempting to extract feature bit of " + name() + ", with ordinal " + ordinal());
|
||||
return (1<< ordinal());
|
||||
}
|
||||
}
|
||||
|
@@ -13,6 +13,7 @@ import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import nu.marginalia.util.NamedExecutorFactory;
|
||||
|
||||
import java.util.concurrent.Executor;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.function.Function;
|
||||
|
||||
@Singleton
|
||||
@@ -20,10 +21,15 @@ public class GrpcChannelPoolFactory {
|
||||
|
||||
private final NodeConfigurationWatcher nodeConfigurationWatcher;
|
||||
private final ServiceRegistryIf serviceRegistryIf;
|
||||
private static final Executor executor = NamedExecutorFactory.createFixed("gRPC-Channel-Pool",
|
||||
Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
|
||||
private static final Executor offloadExecutor = NamedExecutorFactory.createFixed("gRPC-Offload-Pool",
|
||||
Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
|
||||
|
||||
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
|
||||
|
||||
private static final Executor executor = useLoom
|
||||
? Executors.newVirtualThreadPerTaskExecutor()
|
||||
: NamedExecutorFactory.createFixed("gRPC-Channel-Pool", Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
|
||||
private static final Executor offloadExecutor = useLoom
|
||||
? Executors.newVirtualThreadPerTaskExecutor()
|
||||
: NamedExecutorFactory.createFixed("gRPC-Offload-Pool", Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
|
||||
|
||||
@Inject
|
||||
public GrpcChannelPoolFactory(NodeConfigurationWatcher nodeConfigurationWatcher,
|
||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.service.client;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import io.grpc.ManagedChannel;
|
||||
import io.grpc.StatusRuntimeException;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.discovery.monitor.ServiceChangeMonitor;
|
||||
import nu.marginalia.service.discovery.property.PartitionTraits;
|
||||
@@ -206,6 +207,11 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
|
||||
}
|
||||
|
||||
for (var e : exceptions) {
|
||||
if (e instanceof StatusRuntimeException se) {
|
||||
throw se; // Re-throw SRE as-is
|
||||
}
|
||||
|
||||
// If there are other exceptions, log them
|
||||
logger.error(grpcMarker, "Failed to call service {}", serviceKey, e);
|
||||
}
|
||||
|
||||
|
@@ -1,9 +1,9 @@
|
||||
package nu.marginalia.service.server;
|
||||
|
||||
import io.grpc.Server;
|
||||
import io.grpc.netty.shaded.io.grpc.netty.NettyServerBuilder;
|
||||
import io.grpc.netty.shaded.io.netty.channel.nio.NioEventLoopGroup;
|
||||
import io.grpc.netty.shaded.io.netty.channel.socket.nio.NioServerSocketChannel;
|
||||
import io.grpc.netty.NettyServerBuilder;
|
||||
import io.netty.channel.nio.NioEventLoopGroup;
|
||||
import io.netty.channel.socket.nio.NioServerSocketChannel;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
@@ -13,9 +13,14 @@ import nu.marginalia.util.NamedExecutorFactory;
|
||||
import java.io.IOException;
|
||||
import java.net.InetSocketAddress;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
|
||||
public class GrpcServer {
|
||||
private final Server server;
|
||||
|
||||
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
|
||||
|
||||
public GrpcServer(ServiceConfiguration config,
|
||||
ServiceRegistryIf serviceRegistry,
|
||||
ServicePartition partition,
|
||||
@@ -26,13 +31,19 @@ public class GrpcServer {
|
||||
int nThreads = Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 16);
|
||||
|
||||
// Start the gRPC server
|
||||
|
||||
ExecutorService workExecutor = useLoom ?
|
||||
Executors.newVirtualThreadPerTaskExecutor() :
|
||||
NamedExecutorFactory.createFixed("nettyExecutor", nThreads);
|
||||
|
||||
var grpcServerBuilder = NettyServerBuilder.forAddress(new InetSocketAddress(config.bindAddress(), port))
|
||||
.executor(NamedExecutorFactory.createFixed("nettyExecutor", nThreads))
|
||||
.executor(workExecutor)
|
||||
.workerEventLoopGroup(new NioEventLoopGroup(nThreads, NamedExecutorFactory.createFixed("Worker-ELG", nThreads)))
|
||||
.bossEventLoopGroup(new NioEventLoopGroup(nThreads, NamedExecutorFactory.createFixed("Boss-ELG", nThreads)))
|
||||
.channelType(NioServerSocketChannel.class);
|
||||
|
||||
for (var grpcService : grpcServices) {
|
||||
|
||||
if (!grpcService.shouldRegisterService()) {
|
||||
continue;
|
||||
}
|
||||
|
@@ -125,8 +125,7 @@ public class JoobyService {
|
||||
// Set a cap on the number of worker threads, as Jooby's default value does not seem to consider
|
||||
// multi-tenant servers with high thread counts, and spins up an exorbitant number of threads in that
|
||||
// scenario
|
||||
options.setWorkerThreads(Math.min(128, options.getWorkerThreads()));
|
||||
|
||||
options.setWorkerThreads(Math.min(16, options.getWorkerThreads()));
|
||||
|
||||
jooby.setServerOptions(options);
|
||||
|
||||
|
@@ -1,6 +1,7 @@
|
||||
package nu.marginalia.execution;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import io.grpc.Status;
|
||||
import io.grpc.stub.StreamObserver;
|
||||
import nu.marginalia.actor.ExecutorActor;
|
||||
import nu.marginalia.actor.ExecutorActorControlService;
|
||||
@@ -36,7 +37,7 @@ public class ExecutorCrawlGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -52,7 +53,7 @@ public class ExecutorCrawlGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -66,7 +67,7 @@ public class ExecutorCrawlGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -80,7 +81,7 @@ public class ExecutorCrawlGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -98,7 +99,7 @@ public class ExecutorCrawlGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.execution;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import io.grpc.Status;
|
||||
import io.grpc.stub.StreamObserver;
|
||||
import nu.marginalia.actor.ExecutorActor;
|
||||
import nu.marginalia.actor.ExecutorActorControlService;
|
||||
@@ -38,7 +39,7 @@ public class ExecutorExportGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -57,7 +58,7 @@ public class ExecutorExportGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -73,7 +74,7 @@ public class ExecutorExportGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -87,7 +88,7 @@ public class ExecutorExportGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -99,7 +100,7 @@ public class ExecutorExportGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -114,14 +115,14 @@ public class ExecutorExportGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void exportAllAtags(Empty request, StreamObserver<Empty> responseObserver) {
|
||||
if (serviceConfiguration.node() != 1) {
|
||||
responseObserver.onError(new IllegalArgumentException("Export all atags is only available on node 1"));
|
||||
responseObserver.onError(Status.UNAVAILABLE.withDescription("Export all atags is only available on node 1").asRuntimeException());
|
||||
}
|
||||
try {
|
||||
actorControlService.startFrom(ExecutorActor.PREC_EXPORT_ALL,
|
||||
@@ -131,7 +132,7 @@ public class ExecutorExportGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -145,7 +146,7 @@ public class ExecutorExportGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -159,7 +160,7 @@ public class ExecutorExportGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -1,6 +1,7 @@
|
||||
package nu.marginalia.execution;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import io.grpc.Status;
|
||||
import io.grpc.stub.StreamObserver;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.actor.ActorApi;
|
||||
@@ -58,7 +59,7 @@ public class ExecutorGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -70,7 +71,7 @@ public class ExecutorGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -82,7 +83,7 @@ public class ExecutorGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -96,7 +97,7 @@ public class ExecutorGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -112,7 +113,7 @@ public class ExecutorGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -128,7 +129,7 @@ public class ExecutorGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -203,7 +204,7 @@ public class ExecutorGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -229,7 +230,7 @@ public class ExecutorGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -276,7 +277,7 @@ public class ExecutorGrpcService
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Failed to update nsfw filters", e);
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -1,6 +1,7 @@
|
||||
package nu.marginalia.execution;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import io.grpc.Status;
|
||||
import io.grpc.stub.StreamObserver;
|
||||
import nu.marginalia.actor.ExecutorActor;
|
||||
import nu.marginalia.actor.ExecutorActorControlService;
|
||||
@@ -33,7 +34,7 @@ public class ExecutorSideloadGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -48,7 +49,7 @@ public class ExecutorSideloadGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -63,7 +64,7 @@ public class ExecutorSideloadGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -78,7 +79,7 @@ public class ExecutorSideloadGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -93,7 +94,7 @@ public class ExecutorSideloadGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -2,6 +2,8 @@ package nu.marginalia.api.domains;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.api.domains.model.DomainInformation;
|
||||
import nu.marginalia.api.domains.model.SimilarDomain;
|
||||
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||
import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
@@ -10,16 +12,19 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.concurrent.*;
|
||||
|
||||
import nu.marginalia.api.domains.model.*;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.Future;
|
||||
|
||||
@Singleton
|
||||
public class DomainInfoClient {
|
||||
private static final Logger logger = LoggerFactory.getLogger(DomainInfoClient.class);
|
||||
|
||||
private final GrpcSingleNodeChannelPool<DomainInfoAPIGrpc.DomainInfoAPIBlockingStub> channelPool;
|
||||
private final ExecutorService executor = Executors.newWorkStealingPool(8);
|
||||
|
||||
|
||||
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
|
||||
private static final ExecutorService executor = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newWorkStealingPool(8);
|
||||
|
||||
@Inject
|
||||
public DomainInfoClient(GrpcChannelPoolFactory factory) {
|
||||
|
@@ -0,0 +1,114 @@
|
||||
package nu.marginalia.api.domsample;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import io.grpc.Status;
|
||||
import io.grpc.StatusRuntimeException;
|
||||
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||
import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
|
||||
@Singleton
|
||||
public class DomSampleClient {
|
||||
private final GrpcSingleNodeChannelPool<DomSampleApiGrpc.DomSampleApiBlockingStub> channelPool;
|
||||
private static final Logger logger = LoggerFactory.getLogger(DomSampleClient.class);
|
||||
|
||||
@Inject
|
||||
public DomSampleClient(GrpcChannelPoolFactory factory) {
|
||||
|
||||
// The client is only interested in the primary node
|
||||
var key = ServiceKey.forGrpcApi(DomSampleApiGrpc.class, ServicePartition.any());
|
||||
this.channelPool = factory.createSingle(key, DomSampleApiGrpc::newBlockingStub);
|
||||
}
|
||||
|
||||
public Optional<RpcDomainSample> getSample(String domainName) {
|
||||
try {
|
||||
var val = channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::getSample)
|
||||
.run(RpcDomainName.newBuilder().setDomainName(domainName).build());
|
||||
|
||||
return Optional.of(val);
|
||||
}
|
||||
catch (StatusRuntimeException sre) {
|
||||
if (sre.getStatus() != Status.NOT_FOUND) {
|
||||
logger.error("Failed to fetch DOM sample", sre);
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
public Optional<RpcDomainSampleRequests> getSampleRequests(String domainName) {
|
||||
try {
|
||||
var val = channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::getSampleRequests)
|
||||
.run(RpcDomainName.newBuilder().setDomainName(domainName).build());
|
||||
|
||||
return Optional.of(val);
|
||||
}
|
||||
catch (StatusRuntimeException sre) {
|
||||
if (sre.getStatus() != Status.NOT_FOUND) {
|
||||
logger.error("Failed to fetch DOM sample", sre);
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasSample(String domainName) {
|
||||
try {
|
||||
return channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::hasSample)
|
||||
.run(RpcDomainName.newBuilder().setDomainName(domainName).build())
|
||||
.getAnswer();
|
||||
}
|
||||
catch (StatusRuntimeException sre) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public CompletableFuture<Boolean> hasSample(String domainName, ExecutorService executor) {
|
||||
try {
|
||||
return channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::hasSample)
|
||||
.async(executor)
|
||||
.run(RpcDomainName.newBuilder().setDomainName(domainName).build())
|
||||
.thenApply(RpcBooleanRsp::getAnswer);
|
||||
}
|
||||
catch (StatusRuntimeException sre) {
|
||||
return CompletableFuture.completedFuture(false);
|
||||
}
|
||||
}
|
||||
|
||||
public CompletableFuture<RpcDomainSample> getSampleAsync(String domainName, ExecutorService executorService) {
|
||||
return channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::getSample)
|
||||
.async(executorService)
|
||||
.run(RpcDomainName.newBuilder().setDomainName(domainName).build());
|
||||
}
|
||||
|
||||
public List<RpcDomainSample> getAllSamples(String domainName) {
|
||||
try {
|
||||
Iterator<RpcDomainSample> val = channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::getAllSamples)
|
||||
.run(RpcDomainName.newBuilder().setDomainName(domainName).build());
|
||||
|
||||
List<RpcDomainSample> ret = new ArrayList<>();
|
||||
val.forEachRemaining(ret::add);
|
||||
return ret;
|
||||
}
|
||||
catch (StatusRuntimeException sre) {
|
||||
logger.error("Failed to fetch DOM sample");
|
||||
return List.of();
|
||||
}
|
||||
}
|
||||
|
||||
public boolean waitReady(Duration duration) throws InterruptedException {
|
||||
return channelPool.awaitChannel(duration);
|
||||
}
|
||||
|
||||
|
||||
}
|
@@ -24,7 +24,9 @@ import java.util.function.BiConsumer;
|
||||
|
||||
@Singleton
|
||||
public class FeedsClient {
|
||||
private final ExecutorService executorService = Executors.newCachedThreadPool();
|
||||
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
|
||||
private static final ExecutorService executorService = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newCachedThreadPool();
|
||||
|
||||
private final GrpcSingleNodeChannelPool<FeedApiGrpc.FeedApiBlockingStub> channelPool;
|
||||
private final MqOutbox updateFeedsOutbox;
|
||||
|
||||
|
@@ -0,0 +1,47 @@
|
||||
syntax="proto3";
|
||||
package nu.marginalia.api.domsample;
|
||||
|
||||
option java_package="nu.marginalia.api.domsample";
|
||||
option java_multiple_files=true;
|
||||
|
||||
|
||||
// API for querying stored DOM samples by domain name.
service DomSampleApi {
  // Returns a single sample for the domain.
  rpc getSample(RpcDomainName) returns (RpcDomainSample) {}

  // Returns the outgoing-request data of a sample for the domain.
  rpc getSampleRequests(RpcDomainName) returns (RpcDomainSampleRequests) {}

  // Answers whether any sample exists for the domain.
  rpc hasSample(RpcDomainName) returns (RpcBooleanRsp) {}

  // Streams every sample for the domain.
  rpc getAllSamples(RpcDomainName) returns (stream RpcDomainSample) {}
}
|
||||
|
||||
// Request message identifying a domain by name.
message RpcDomainName {
  string domainName = 1;
}
|
||||
|
||||
// Response message wrapping a single yes/no answer.
message RpcBooleanRsp {
  bool answer = 1;
}
|
||||
|
||||
// The outgoing requests of a sample, without the HTML payload.
// Field numbers 1, 2 and 5 are the same as the corresponding
// fields of RpcDomainSample; 3 and 4 are not used here.
message RpcDomainSampleRequests {
  string domainName = 1;
  string url = 2;
  repeated RpcOutgoingRequest outgoingRequests = 5;
}
|
||||
|
||||
// A full DOM sample for a domain.
message RpcDomainSample {
  string domainName = 1;
  string url = 2;

  // Zstd-compressed HTML of the sample
  bytes htmlSampleZstd = 3;

  bool accepted_popover = 4;

  repeated RpcOutgoingRequest outgoingRequests = 5;
}
|
||||
|
||||
// One outgoing request recorded as part of a sample.
message RpcOutgoingRequest {
  RequestMethod method = 1;
  int64 timestamp = 2;
  string url = 3;

  // HTTP method of the request; anything that is not GET or POST is OTHER.
  enum RequestMethod {
    GET = 0;
    POST = 1;
    OTHER = 2;
  }
}
|
@@ -31,6 +31,7 @@ dependencies {
|
||||
implementation libs.jsoup
|
||||
implementation libs.opencsv
|
||||
implementation libs.slop
|
||||
implementation libs.zstd
|
||||
implementation libs.sqlite
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.commons.lang3
|
||||
|
@@ -0,0 +1,176 @@
|
||||
package nu.marginalia.domsample;
|
||||
|
||||
import com.github.luben.zstd.Zstd;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.protobuf.ByteString;
|
||||
import io.grpc.Status;
|
||||
import io.grpc.stub.StreamObserver;
|
||||
import nu.marginalia.api.domsample.*;
|
||||
import nu.marginalia.domsample.db.DomSampleDb;
|
||||
import nu.marginalia.service.server.DiscoverableService;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.List;
|
||||
|
||||
public class DomSampleGrpcService
|
||||
extends DomSampleApiGrpc.DomSampleApiImplBase
|
||||
implements DiscoverableService
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(DomSampleGrpcService.class);
|
||||
|
||||
private final DomSampleDb domSampleDb;
|
||||
|
||||
/**
 * Creates the service on top of the given sample database.
 *
 * @param domSampleDb database the RPC handlers read samples from
 */
@Inject
public DomSampleGrpcService(DomSampleDb domSampleDb) {
    this.domSampleDb = domSampleDb;
}
|
||||
|
||||
@Override
|
||||
public void getSample(RpcDomainName request, StreamObserver<RpcDomainSample> responseObserver) {
|
||||
String domainName = request.getDomainName();
|
||||
if (domainName.isBlank()) {
|
||||
responseObserver.onError(Status.INVALID_ARGUMENT
|
||||
.withDescription("Invalid domain name")
|
||||
.asRuntimeException());
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
|
||||
if (dbRecords.isEmpty()) {
|
||||
responseObserver.onError(Status.NOT_FOUND.withDescription("No sample found").asRuntimeException());
|
||||
return;
|
||||
}
|
||||
|
||||
// Grab the first sample
|
||||
RpcDomainSample.Builder response = convertFullSample(dbRecords.getFirst());
|
||||
|
||||
responseObserver.onNext(response.build());
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Error in getSample()", e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void getSampleRequests(RpcDomainName request, StreamObserver<RpcDomainSampleRequests> responseObserver) {
|
||||
String domainName = request.getDomainName();
|
||||
if (domainName.isBlank()) {
|
||||
responseObserver.onError(Status.INVALID_ARGUMENT
|
||||
.withDescription("Invalid domain name")
|
||||
.asRuntimeException());
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
|
||||
if (dbRecords.isEmpty()) {
|
||||
responseObserver.onError(Status.NOT_FOUND.withDescription("No sample found").asRuntimeException());
|
||||
return;
|
||||
}
|
||||
|
||||
// Grab the first sample
|
||||
RpcDomainSampleRequests.Builder response = convertRequestData(dbRecords.getFirst());
|
||||
|
||||
responseObserver.onNext(response.build());
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Error in getSample()", e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void hasSample(RpcDomainName request, StreamObserver<RpcBooleanRsp> responseObserver) {
|
||||
String domainName = request.getDomainName();
|
||||
if (domainName.isBlank()) {
|
||||
responseObserver.onError(Status.INVALID_ARGUMENT
|
||||
.withDescription("Invalid domain name")
|
||||
.asRuntimeException());
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
responseObserver.onNext(RpcBooleanRsp.newBuilder()
|
||||
.setAnswer(domSampleDb.hasSample(domainName)).build());
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void getAllSamples(RpcDomainName request, StreamObserver<RpcDomainSample> responseObserver) {
|
||||
String domainName = request.getDomainName();
|
||||
if (domainName.isBlank()) {
|
||||
responseObserver.onError(Status.INVALID_ARGUMENT
|
||||
.withDescription("Invalid domain name")
|
||||
.asRuntimeException());
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
|
||||
|
||||
for (var record : dbRecords) {
|
||||
responseObserver.onNext(convertFullSample(record).build());
|
||||
}
|
||||
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Error in getSample()", e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
private RpcDomainSample.Builder convertFullSample(DomSampleDb.Sample dbSample) {
|
||||
|
||||
ByteString htmlZstd = ByteString.copyFrom(Zstd.compress(dbSample.sample().getBytes(StandardCharsets.UTF_8)));
|
||||
|
||||
var sampleBuilder = RpcDomainSample.newBuilder()
|
||||
.setDomainName(dbSample.domain())
|
||||
.setAcceptedPopover(dbSample.acceptedPopover())
|
||||
.setHtmlSampleZstd(htmlZstd);
|
||||
|
||||
for (var req : dbSample.parseRequests()) {
|
||||
sampleBuilder.addOutgoingRequestsBuilder()
|
||||
.setUrl(req.uri().toString())
|
||||
.setMethod(switch (req.method().toUpperCase())
|
||||
{
|
||||
case "GET" -> RpcOutgoingRequest.RequestMethod.GET;
|
||||
case "POST" -> RpcOutgoingRequest.RequestMethod.POST;
|
||||
default -> RpcOutgoingRequest.RequestMethod.OTHER;
|
||||
})
|
||||
.setTimestamp(req.timestamp());
|
||||
}
|
||||
|
||||
return sampleBuilder;
|
||||
}
|
||||
|
||||
private RpcDomainSampleRequests.Builder convertRequestData(DomSampleDb.Sample dbSample) {
|
||||
|
||||
var sampleBuilder = RpcDomainSampleRequests.newBuilder()
|
||||
.setDomainName(dbSample.domain());
|
||||
|
||||
for (var req : dbSample.parseRequests()) {
|
||||
sampleBuilder.addOutgoingRequestsBuilder()
|
||||
.setUrl(req.uri().toString())
|
||||
.setMethod(switch (req.method().toUpperCase())
|
||||
{
|
||||
case "GET" -> RpcOutgoingRequest.RequestMethod.GET;
|
||||
case "POST" -> RpcOutgoingRequest.RequestMethod.POST;
|
||||
default -> RpcOutgoingRequest.RequestMethod.OTHER;
|
||||
})
|
||||
.setTimestamp(req.timestamp());
|
||||
}
|
||||
|
||||
return sampleBuilder;
|
||||
}
|
||||
}
|
@@ -1,17 +1,28 @@
|
||||
package nu.marginalia.domsample.db;
|
||||
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.net.URI;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.Connection;
|
||||
import java.sql.DriverManager;
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
public class DomSampleDb implements AutoCloseable {
|
||||
private static final String dbFileName = "dom-sample.db";
|
||||
private final Connection connection;
|
||||
private static final Logger logger = LoggerFactory.getLogger(DomSampleDb.class);
|
||||
|
||||
public DomSampleDb() throws SQLException{
|
||||
this(WmsaHome.getDataPath().resolve(dbFileName));
|
||||
@@ -88,7 +99,71 @@ public class DomSampleDb implements AutoCloseable {
|
||||
}
|
||||
|
||||
|
||||
public record Sample(String url, String domain, String sample, String requests, boolean acceptedPopover) {}
|
||||
public record Sample(String url, String domain, String sample, String requests, boolean acceptedPopover) {
|
||||
|
||||
public List<SampleRequest> parseRequests() {
|
||||
List<SampleRequest> requests = new ArrayList<>();
|
||||
|
||||
// Request format is METHOD\tTIMESTAMP\tURI\n
|
||||
|
||||
for (var line : StringUtils.split(this.requests, '\n')) {
|
||||
String[] parts = StringUtils.split(line, "\t", 3);
|
||||
if (parts.length != 3) continue;
|
||||
|
||||
try {
|
||||
String method = parts[0];
|
||||
long ts = Long.parseLong(parts[1]);
|
||||
String linkUrl = parts[2];
|
||||
|
||||
URI uri = parseURI(linkUrl);
|
||||
|
||||
requests.add(new SampleRequest(method, ts, uri));
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.warn("Failed to parse requests", e);
|
||||
}
|
||||
}
|
||||
|
||||
return requests;
|
||||
}
|
||||
|
||||
|
||||
private static URI parseURI(String uri) throws URISyntaxException {
|
||||
try {
|
||||
return new URI(uri);
|
||||
}
|
||||
catch (URISyntaxException ex) {
|
||||
return new EdgeUrl(uri).asURI();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * One outgoing request parsed from a sample's request log.
 *
 * @param method    the request method as recorded
 * @param timestamp the recorded timestamp value
 * @param uri       the request target
 */
public record SampleRequest(
        String method,
        long timestamp,
        URI uri
) {
}
|
||||
|
||||
/**
|
||||
* @param consumer - consume the sample, return true to continue consumption
|
||||
* @throws SQLException
|
||||
*/
|
||||
public void forEachSample(Predicate<Sample> consumer) throws SQLException {
|
||||
try (var stmt = connection.prepareStatement("""
|
||||
SELECT url, domain, sample, requests, accepted_popover
|
||||
FROM samples
|
||||
"""))
|
||||
{
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
var sample = new Sample(
|
||||
rs.getString("url"),
|
||||
rs.getString("domain"),
|
||||
rs.getString("sample"),
|
||||
rs.getString("requests"),
|
||||
rs.getBoolean("accepted_popover")
|
||||
);
|
||||
|
||||
if (!consumer.test(sample)) break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public List<Sample> getSamples(String domain) throws SQLException {
|
||||
List<Sample> samples = new ArrayList<>();
|
||||
@@ -116,6 +191,21 @@ public class DomSampleDb implements AutoCloseable {
|
||||
return samples;
|
||||
}
|
||||
|
||||
|
||||
public boolean hasSample(String domain) throws SQLException {
|
||||
|
||||
try (var stmt = connection.prepareStatement("""
|
||||
SELECT 1
|
||||
FROM samples
|
||||
WHERE domain = ?
|
||||
"""))
|
||||
{
|
||||
stmt.setString(1, domain);
|
||||
var rs = stmt.executeQuery();
|
||||
return rs.next();
|
||||
}
|
||||
}
|
||||
|
||||
public void saveSample(String domain, String url, String rawContent) throws SQLException {
|
||||
var doc = Jsoup.parse(rawContent);
|
||||
|
||||
|
@@ -1,6 +1,7 @@
|
||||
package nu.marginalia.rss.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import io.grpc.Status;
|
||||
import io.grpc.stub.StreamObserver;
|
||||
import nu.marginalia.api.feeds.*;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
@@ -69,7 +70,7 @@ public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements Dis
|
||||
@Override
|
||||
public void getFeedDataHash(Empty request, StreamObserver<RpcFeedDataHash> responseObserver) {
|
||||
if (!feedDb.isEnabled()) {
|
||||
responseObserver.onError(new IllegalStateException("Feed database is disabled on this node"));
|
||||
responseObserver.onError(Status.INTERNAL.withDescription("Feed database is disabled on this node").asRuntimeException());
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -80,7 +81,7 @@ public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements Dis
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Error getting feed data hash", e);
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -101,7 +102,7 @@ public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements Dis
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Error getting updated links", e);
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -109,13 +110,13 @@ public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements Dis
|
||||
public void getFeed(RpcDomainId request,
|
||||
StreamObserver<RpcFeed> responseObserver) {
|
||||
if (!feedDb.isEnabled()) {
|
||||
responseObserver.onError(new IllegalStateException("Feed database is disabled on this node"));
|
||||
responseObserver.onError(Status.INTERNAL.withDescription("Feed database is disabled on this node").asRuntimeException());
|
||||
return;
|
||||
}
|
||||
|
||||
Optional<EdgeDomain> domainName = domainQueries.getDomain(request.getDomainId());
|
||||
if (domainName.isEmpty()) {
|
||||
responseObserver.onError(new IllegalArgumentException("Domain not found"));
|
||||
responseObserver.onError(Status.NOT_FOUND.withDescription("Domain not found").asRuntimeException());
|
||||
return;
|
||||
}
|
||||
|
||||
|
@@ -26,7 +26,9 @@ public class MathClient {
|
||||
private static final Logger logger = LoggerFactory.getLogger(MathClient.class);
|
||||
|
||||
private final GrpcSingleNodeChannelPool<MathApiGrpc.MathApiBlockingStub> channelPool;
|
||||
private final ExecutorService executor = Executors.newWorkStealingPool(8);
|
||||
|
||||
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
|
||||
private static final ExecutorService executor = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newWorkStealingPool(8);
|
||||
|
||||
@Inject
|
||||
public MathClient(GrpcChannelPoolFactory factory) {
|
||||
|
@@ -3,6 +3,7 @@ package nu.marginalia.functions.searchquery;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import io.grpc.Status;
|
||||
import io.grpc.stub.StreamObserver;
|
||||
import io.prometheus.client.Histogram;
|
||||
import nu.marginalia.api.searchquery.*;
|
||||
@@ -93,7 +94,7 @@ public class QueryGRPCService
|
||||
});
|
||||
} catch (Exception e) {
|
||||
logger.error("Exception", e);
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -38,7 +38,9 @@ public class IndexClient {
|
||||
.help("Count of results filtered by NSFW tier")
|
||||
.register();
|
||||
|
||||
private static final ExecutorService executor = Executors.newCachedThreadPool();
|
||||
|
||||
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
|
||||
private static final ExecutorService executor = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newCachedThreadPool();
|
||||
|
||||
@Inject
|
||||
public IndexClient(GrpcChannelPoolFactory channelPoolFactory,
|
||||
|
@@ -1,10 +1,10 @@
|
||||
package nu.marginalia.ranking.domains;
|
||||
package nu.marginalia.domainranking;
|
||||
|
||||
import gnu.trove.list.TIntList;
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
import nu.marginalia.ranking.domains.accumulator.RankingResultAccumulator;
|
||||
import nu.marginalia.ranking.domains.data.GraphSource;
|
||||
import nu.marginalia.ranking.domains.jgrapht.PersonalizedPageRank;
|
||||
import nu.marginalia.domainranking.accumulator.RankingResultAccumulator;
|
||||
import nu.marginalia.domainranking.data.GraphSource;
|
||||
import nu.marginalia.domainranking.jgrapht.PersonalizedPageRank;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.alg.interfaces.VertexScoringAlgorithm;
|
||||
import org.jgrapht.alg.scoring.PageRank;
|
@@ -1,6 +1,6 @@
|
||||
package nu.marginalia.ranking.domains;
|
||||
package nu.marginalia.domainranking;
|
||||
|
||||
import nu.marginalia.ranking.domains.accumulator.RankingResultAccumulator;
|
||||
import nu.marginalia.domainranking.accumulator.RankingResultAccumulator;
|
||||
|
||||
import java.util.function.Supplier;
|
||||
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.ranking.domains.accumulator;
|
||||
package nu.marginalia.domainranking.accumulator;
|
||||
|
||||
public interface RankingResultAccumulator<T> {
|
||||
void add(int domainId, int rank);
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.ranking.domains.accumulator;
|
||||
package nu.marginalia.domainranking.accumulator;
|
||||
|
||||
import org.roaringbitmap.RoaringBitmap;
|
||||
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.ranking.domains.accumulator;
|
||||
package nu.marginalia.domainranking.accumulator;
|
||||
|
||||
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
|
||||
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.ranking.domains.accumulator;
|
||||
package nu.marginalia.domainranking.accumulator;
|
||||
|
||||
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
|
||||
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.ranking.domains.accumulator;
|
||||
package nu.marginalia.domainranking.accumulator;
|
||||
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.ranking.domains.data;
|
||||
package nu.marginalia.domainranking.data;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import org.jgrapht.Graph;
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.ranking.domains.data;
|
||||
package nu.marginalia.domainranking.data;
|
||||
|
||||
import org.jgrapht.Graph;
|
||||
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.ranking.domains.data;
|
||||
package nu.marginalia.domainranking.data;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.ranking.domains.data;
|
||||
package nu.marginalia.domainranking.data;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.ranking.domains.data;
|
||||
package nu.marginalia.domainranking.data;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.ranking.domains.jgrapht;
|
||||
package nu.marginalia.domainranking.jgrapht;
|
||||
|
||||
/*
|
||||
* (C) Copyright 2016-2023, by Dimitrios Michail and Contributors.
|
||||
@@ -21,8 +21,9 @@ package nu.marginalia.ranking.domains.jgrapht;
|
||||
|
||||
/* (modified by @vlofgren to add personalization) */
|
||||
|
||||
import org.jgrapht.*;
|
||||
import org.jgrapht.alg.interfaces.*;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.Graphs;
|
||||
import org.jgrapht.alg.interfaces.VertexScoringAlgorithm;
|
||||
|
||||
import java.util.*;
|
||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.index;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import io.grpc.Status;
|
||||
import io.grpc.stub.StreamObserver;
|
||||
import io.prometheus.client.Counter;
|
||||
import io.prometheus.client.Gauge;
|
||||
@@ -148,7 +149,7 @@ public class IndexGrpcService
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Error in handling request", ex);
|
||||
responseObserver.onError(ex);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(ex).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -551,9 +551,18 @@ public class IndexResultScoreCalculator {
|
||||
largeSiteFactor = 2;
|
||||
}
|
||||
|
||||
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING_ADTECH.getFeatureBit()))
|
||||
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.ADVERTISEMENT.getFeatureBit()))
|
||||
penalty += 7.5 * largeSiteFactor;
|
||||
|
||||
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.CONSENT.getFeatureBit()))
|
||||
penalty += 2.5 * largeSiteFactor;
|
||||
|
||||
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.POPOVER.getFeatureBit()))
|
||||
penalty += 2.5 * largeSiteFactor;
|
||||
|
||||
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING_ADTECH.getFeatureBit()))
|
||||
penalty += 5.0 * largeSiteFactor;
|
||||
|
||||
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.AFFILIATE_LINK.getFeatureBit()))
|
||||
penalty += 5.0 * largeSiteFactor;
|
||||
|
||||
@@ -563,6 +572,9 @@ public class IndexResultScoreCalculator {
|
||||
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING.getFeatureBit()))
|
||||
penalty += 2.5 * largeSiteFactor;
|
||||
|
||||
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.SHORT_DOCUMENT.getFeatureBit()))
|
||||
penalty += 2.5 * largeSiteFactor;
|
||||
|
||||
if (isForum || isWiki) {
|
||||
penalty = Math.min(0, penalty - 2);
|
||||
}
|
||||
|
@@ -6,14 +6,14 @@ import gnu.trove.list.TIntList;
|
||||
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
|
||||
import nu.marginalia.db.DomainRankingSetsService;
|
||||
import nu.marginalia.db.DomainTypes;
|
||||
import nu.marginalia.domainranking.PageRankDomainRanker;
|
||||
import nu.marginalia.domainranking.accumulator.RankingResultHashMapAccumulator;
|
||||
import nu.marginalia.domainranking.accumulator.RankingResultHashSetAccumulator;
|
||||
import nu.marginalia.domainranking.data.GraphSource;
|
||||
import nu.marginalia.domainranking.data.LinkGraphSource;
|
||||
import nu.marginalia.domainranking.data.SimilarityGraphSource;
|
||||
import nu.marginalia.index.IndexFactory;
|
||||
import nu.marginalia.index.domainrankings.DomainRankings;
|
||||
import nu.marginalia.ranking.domains.PageRankDomainRanker;
|
||||
import nu.marginalia.ranking.domains.accumulator.RankingResultHashMapAccumulator;
|
||||
import nu.marginalia.ranking.domains.accumulator.RankingResultHashSetAccumulator;
|
||||
import nu.marginalia.ranking.domains.data.GraphSource;
|
||||
import nu.marginalia.ranking.domains.data.LinkGraphSource;
|
||||
import nu.marginalia.ranking.domains.data.SimilarityGraphSource;
|
||||
import nu.marginalia.service.control.ServiceEventLog;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import org.slf4j.Logger;
|
||||
|
@@ -1,6 +1,6 @@
|
||||
package nu.marginalia.ranking.domains;
|
||||
package nu.marginalia.domainranking;
|
||||
|
||||
import nu.marginalia.ranking.domains.accumulator.RankingResultListAccumulator;
|
||||
import nu.marginalia.domainranking.accumulator.RankingResultListAccumulator;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
@@ -1,12 +1,12 @@
|
||||
package nu.marginalia.ranking.domains;
|
||||
package nu.marginalia.domainranking;
|
||||
|
||||
|
||||
import com.zaxxer.hikari.HikariConfig;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
|
||||
import nu.marginalia.ranking.domains.data.InvertedLinkGraphSource;
|
||||
import nu.marginalia.ranking.domains.data.LinkGraphSource;
|
||||
import nu.marginalia.ranking.domains.data.SimilarityGraphSource;
|
||||
import nu.marginalia.domainranking.data.InvertedLinkGraphSource;
|
||||
import nu.marginalia.domainranking.data.LinkGraphSource;
|
||||
import nu.marginalia.domainranking.data.SimilarityGraphSource;
|
||||
import nu.marginalia.test.TestMigrationLoader;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.graph.DefaultWeightedEdge;
|
@@ -1,7 +1,7 @@
|
||||
package nu.marginalia.ranking.domains;
|
||||
package nu.marginalia.domainranking;
|
||||
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.ranking.domains.data.GraphSource;
|
||||
import nu.marginalia.domainranking.data.GraphSource;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
@@ -1,7 +1,7 @@
|
||||
package nu.marginalia.ranking.domains;
|
||||
package nu.marginalia.domainranking;
|
||||
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.ranking.domains.data.GraphSource;
|
||||
import nu.marginalia.domainranking.data.GraphSource;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
@@ -1,6 +1,6 @@
|
||||
package nu.marginalia.ranking.domains;
|
||||
package nu.marginalia.domainranking;
|
||||
|
||||
import nu.marginalia.ranking.domains.data.GraphSource;
|
||||
import nu.marginalia.domainranking.data.GraphSource;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.graph.DefaultUndirectedWeightedGraph;
|
@@ -47,11 +47,14 @@ dependencies {
|
||||
|
||||
implementation project(':code:processes:converting-process:ft-anchor-keywords')
|
||||
implementation project(':code:processes:converting-process:ft-keyword-extraction')
|
||||
implementation project(':code:processes:converting-process:ft-dom-classifier')
|
||||
|
||||
implementation project(':code:processes:crawling-process:ft-crawl-blocklist')
|
||||
implementation project(':code:processes:crawling-process:ft-link-parser')
|
||||
implementation project(':code:processes:crawling-process:ft-content-type')
|
||||
|
||||
implementation project(':code:functions:live-capture:api')
|
||||
|
||||
testImplementation project(':code:libraries:term-frequency-dict')
|
||||
testImplementation project(':code:processes:crawling-process:model')
|
||||
|
||||
@@ -87,6 +90,7 @@ dependencies {
|
||||
implementation libs.commons.lang3
|
||||
implementation libs.commons.compress
|
||||
implementation libs.sqlite
|
||||
implementation libs.bundles.grpc
|
||||
|
||||
implementation libs.bundles.httpcomponents
|
||||
|
||||
|
@@ -0,0 +1,41 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
id "de.undercouch.download" version "5.1.0"
|
||||
id 'jvm-test-suite'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||
}
|
||||
}
|
||||
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:db')
|
||||
implementation project(':code:functions:live-capture:api')
|
||||
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.guava
|
||||
implementation libs.zstd
|
||||
implementation dependencies.create(libs.guice.get()) {
|
||||
exclude group: 'com.google.guava'
|
||||
}
|
||||
implementation libs.trove
|
||||
implementation libs.gson
|
||||
implementation libs.bundles.protobuf
|
||||
implementation libs.bundles.mariadb
|
||||
implementation libs.duckdb
|
||||
implementation libs.notnull
|
||||
implementation libs.jsoup
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
}
|
||||
|
@@ -0,0 +1,99 @@
|
||||
package nu.marginalia.ddtrackergradar;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.ddtrackergradar.model.DDGTDomain;
|
||||
import nu.marginalia.model.gson.GsonFactory;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.*;
|
||||
|
||||
/** Holds tracker metadata from DuckDuckGo's Tracker Radar
|
||||
* data itself CC-BY-NC-SA 4.0
|
||||
* */
|
||||
public class DDGTrackerData {
|
||||
private final Map<String, DDGTDomain> topDomains = new HashMap<>();
|
||||
private final Map<String, DDGTDomain> domains = new HashMap<>();
|
||||
|
||||
private final Gson gson = GsonFactory.get();
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(DDGTrackerData.class);
|
||||
|
||||
public DDGTrackerData() {
|
||||
|
||||
// Data is assumed to be in ${WMSA_HOME}/data/tracker-radar
|
||||
// ... do a shallow clone of the repo
|
||||
// https://github.com/duckduckgo/tracker-radar/
|
||||
|
||||
Path dataDir = WmsaHome.getDataPath().resolve("tracker-radar");
|
||||
if (!Files.exists(dataDir)) {
|
||||
logger.info("tracker-radar data absent from expected path {}, loading nothing", dataDir);
|
||||
return;
|
||||
}
|
||||
|
||||
try (var sources = Files.list(dataDir.resolve("domains"))) {
|
||||
sources.filter(Files::isDirectory).forEach(this::loadDomainDir);
|
||||
}
|
||||
catch (IOException e) {
|
||||
logger.error("Failed to read tracker radar data dir", e);
|
||||
}
|
||||
}
|
||||
|
||||
/** Tries to fetch available information about tracking coming from the specified domain
|
||||
*/
|
||||
public Optional<DDGTDomain> getDomainInfo(String domain) {
|
||||
return Optional
|
||||
.ofNullable(topDomains.get(domain))
|
||||
.or(() -> Optional.ofNullable(domains.get(domain)));
|
||||
}
|
||||
|
||||
/** public for testing */
|
||||
public void loadDomainDir(Path dir) {
|
||||
try (var dirContent = Files.list(dir)) {
|
||||
dirContent
|
||||
.filter(Files::isRegularFile)
|
||||
.filter(path -> path.toString().endsWith(".json"))
|
||||
.forEach(this::loadDomainModel);
|
||||
}
|
||||
catch (IOException e) {
|
||||
logger.error("Error while loading DDGT tracker data", e);
|
||||
}
|
||||
}
|
||||
|
||||
void loadDomainModel(Path jsonFile) {
|
||||
try {
|
||||
var model = gson.fromJson(Files.readString(jsonFile), DDGTDomain.class);
|
||||
|
||||
if (model.domain() == null)
|
||||
return;
|
||||
if ((model.owner() == null || model.owner().isEmpty())
|
||||
&& (model.categories() == null || model.categories().isEmpty()))
|
||||
return;
|
||||
|
||||
topDomains.put(model.domain(), model);
|
||||
domains.put(model.domain(), model);
|
||||
|
||||
if (model.subdomains() != null) {
|
||||
for (String subdomain : model.subdomains()) {
|
||||
domains.put(subdomain + "." + model.domain(), model);
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Error while loading DDGT tracker data", e);
|
||||
}
|
||||
}
|
||||
|
||||
// Export all classifications in the data set
|
||||
public Set<String> getAllClassifications() {
|
||||
Set<String> ret = new HashSet<>();
|
||||
for (var domain: domains.values()) {
|
||||
ret.addAll(domain.categories());
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
}
|
@@ -0,0 +1,12 @@
|
||||
package nu.marginalia.ddtrackergradar.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public record DDGTDomain(
|
||||
String domain,
|
||||
DDGTOwner owner,
|
||||
List<String> categories,
|
||||
List<String> subdomains
|
||||
)
|
||||
{
|
||||
}
|
@@ -0,0 +1,10 @@
|
||||
package nu.marginalia.ddtrackergradar.model;
|
||||
|
||||
/**
 * Owner information attached to a tracker domain entry.
 *
 * @param name          the owner's name
 * @param displayName   the owner's display name
 * @param privacyPolicy the owner's privacy policy reference
 * @param url           the owner's URL
 */
public record DDGTOwner(String name, String displayName, String privacyPolicy, String url) {

    /**
     * @return true only when every component is null; empty strings count as present
     */
    public boolean isEmpty() {
        if (name != null) {
            return false;
        }
        if (displayName != null) {
            return false;
        }
        if (privacyPolicy != null) {
            return false;
        }
        return url == null;
    }
}
|
@@ -0,0 +1,25 @@
|
||||
package nu.marginalia.domclassifier;
|
||||
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
|
||||
/**
|
||||
* Feature classifications for the DOM sample
|
||||
*/
|
||||
public enum DomSampleClassification {
|
||||
ADS(HtmlFeature.ADVERTISEMENT),
|
||||
TRACKING(HtmlFeature.TRACKING_ADTECH),
|
||||
CONSENT(HtmlFeature.CONSENT),
|
||||
POPOVER(HtmlFeature.POPOVER),
|
||||
THIRD_PARTY_REQUESTS(HtmlFeature.THIRD_PARTY_REQUESTS),
|
||||
UNCLASSIFIED(HtmlFeature.MISSING_DOM_SAMPLE),
|
||||
IGNORE(null);
|
||||
|
||||
@Nullable
|
||||
public final HtmlFeature htmlFeature;
|
||||
|
||||
DomSampleClassification(@Nullable HtmlFeature feature) {
|
||||
this.htmlFeature = feature;
|
||||
}
|
||||
}
|
@@ -0,0 +1,185 @@
|
||||
package nu.marginalia.domclassifier;
|
||||
|
||||
import com.github.luben.zstd.ZstdInputStream;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.api.domsample.RpcDomainSample;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.w3c.dom.Document;
|
||||
import org.w3c.dom.Element;
|
||||
import org.w3c.dom.NodeList;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import javax.xml.parsers.DocumentBuilder;
|
||||
import javax.xml.parsers.DocumentBuilderFactory;
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.*;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@Singleton
|
||||
public class DomSampleClassifier {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(DomSampleClassifier.class);
|
||||
|
||||
private final List<Map.Entry<Predicate<String>, DomSampleClassification>> regexClassification = new ArrayList<>();
|
||||
private final Map<String, DomSampleClassification> urlClassification = new HashMap<>();
|
||||
private final Map<String, DomSampleClassification> topDomainClassification = new HashMap<>();
|
||||
private final Map<String, DomSampleClassification> fullDomainClassification = new HashMap<>();
|
||||
|
||||
@Inject
|
||||
public DomSampleClassifier() throws ParserConfigurationException, IOException, SAXException {
|
||||
this(ClassLoader.getSystemResourceAsStream("request-classifier.xml"));
|
||||
}
|
||||
|
||||
public DomSampleClassifier(InputStream specificationXmlData) throws ParserConfigurationException, IOException, SAXException {
|
||||
Objects.requireNonNull(specificationXmlData, "specificationXmlData is null");
|
||||
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
|
||||
DocumentBuilder builder = factory.newDocumentBuilder();
|
||||
Document doc = builder.parse(specificationXmlData);
|
||||
|
||||
NodeList classifierNodes = doc.getElementsByTagName("classifier");
|
||||
|
||||
for (int i = 0; i < classifierNodes.getLength(); i++) {
|
||||
Element classifier = (Element) classifierNodes.item(i);
|
||||
|
||||
String target = classifier.getAttribute("target");
|
||||
String rule = classifier.getAttribute("rule");
|
||||
String content = classifier.getTextContent().trim();
|
||||
|
||||
// Convert rule to Classification enum
|
||||
DomSampleClassification classification = DomSampleClassification.valueOf(rule.toUpperCase());
|
||||
|
||||
// Add to appropriate map based on target
|
||||
switch (target) {
|
||||
case "url":
|
||||
urlClassification.put(content, classification);
|
||||
break;
|
||||
case "url-regex":
|
||||
regexClassification.add(Map.entry(Pattern.compile(content).asPredicate(), classification));
|
||||
break;
|
||||
case "top":
|
||||
topDomainClassification.put(content, classification);
|
||||
break;
|
||||
case "domain":
|
||||
fullDomainClassification.put(content, classification);
|
||||
break;
|
||||
default:
|
||||
throw new IllegalArgumentException("Unknown target type: " + target);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public Set<DomSampleClassification> classifySample(RpcDomainSample sample) {
|
||||
Set<DomSampleClassification> classifications = new HashSet<>();
|
||||
|
||||
// Look at DOM
|
||||
|
||||
EdgeDomain sampleDomain = new EdgeDomain(sample.getDomainName());
|
||||
|
||||
byte[] sampleBytes = sample.getHtmlSampleZstd().toByteArray();
|
||||
if (sampleBytes.length >= 8) {
|
||||
logger.info("sampleBytes magic fingerprint: {}", Arrays.toString(Arrays.copyOf(sampleBytes, 8)));
|
||||
}
|
||||
else {
|
||||
logger.info("sampleBytes too short! Was {}", Arrays.toString(sampleBytes));
|
||||
}
|
||||
try (var compressedStream = new ZstdInputStream(new ByteArrayInputStream(sampleBytes))) {
|
||||
String html = new String(compressedStream.readAllBytes(), StandardCharsets.UTF_8);
|
||||
var parsedDoc = Jsoup.parse(html);
|
||||
var fixedElements = parsedDoc.select("*[data-position=fixed]");
|
||||
|
||||
if (sample.getAcceptedPopover()) {
|
||||
classifications.add(DomSampleClassification.POPOVER);
|
||||
}
|
||||
else if (!fixedElements.isEmpty()) {
|
||||
String fixedText = fixedElements.text().toLowerCase();
|
||||
if (fixedText.contains("cookie") ||
|
||||
fixedText.contains("subscribe") ||
|
||||
fixedText.contains("consent") ||
|
||||
fixedText.contains("newsletter") ||
|
||||
fixedText.contains("gdpr"))
|
||||
{
|
||||
classifications.add(DomSampleClassification.POPOVER);
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.warn("Error when parsing DOM HTML sample for size" + sample.getHtmlSampleZstd().size(), ex);
|
||||
}
|
||||
|
||||
// Classify outgoing requests
|
||||
for (var req : sample.getOutgoingRequestsList()) {
|
||||
EdgeUrl url;
|
||||
|
||||
try {
|
||||
url = new EdgeUrl(req.getUrl());
|
||||
}
|
||||
catch (URISyntaxException ex) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!url.domain.hasSameTopDomain(sampleDomain)) {
|
||||
classifications.add(DomSampleClassification.THIRD_PARTY_REQUESTS);
|
||||
}
|
||||
|
||||
var clazz = classifyRequest(url);
|
||||
if (clazz != DomSampleClassification.IGNORE && clazz != DomSampleClassification.UNCLASSIFIED) {
|
||||
classifications.add(clazz);
|
||||
}
|
||||
}
|
||||
|
||||
return classifications;
|
||||
}
|
||||
|
||||
public DomSampleClassification classifyRequest(EdgeUrl edgeUrl) {
|
||||
StringBuilder pathSb = new StringBuilder(edgeUrl.path);
|
||||
if (edgeUrl.param != null) {
|
||||
pathSb.append("?").append(edgeUrl.param);
|
||||
}
|
||||
String pathMatchString = pathSb.toString();
|
||||
String urlDisplayString = edgeUrl.toDisplayString();
|
||||
|
||||
for (Map.Entry<Predicate<String>, DomSampleClassification> regexMatcher : regexClassification) {
|
||||
var matcher = regexMatcher.getKey();
|
||||
if (matcher.test(pathMatchString) || matcher.test(urlDisplayString)) {
|
||||
var clazz = regexMatcher.getValue();
|
||||
|
||||
if (clazz != DomSampleClassification.IGNORE) {
|
||||
return clazz;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
DomSampleClassification clazz = urlClassification.get(edgeUrl.toDisplayString());
|
||||
|
||||
if (clazz != null && clazz != DomSampleClassification.IGNORE) {
|
||||
return clazz;
|
||||
}
|
||||
|
||||
clazz = fullDomainClassification.get(edgeUrl.domain.toString());
|
||||
|
||||
if (clazz != null && clazz != DomSampleClassification.IGNORE) {
|
||||
return clazz;
|
||||
}
|
||||
|
||||
clazz = topDomainClassification.get(edgeUrl.domain.topDomain);
|
||||
|
||||
if (clazz != null && clazz != DomSampleClassification.IGNORE) {
|
||||
return clazz;
|
||||
}
|
||||
|
||||
return DomSampleClassification.UNCLASSIFIED;
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,8 @@
|
||||
Holds a classification model for rendered DOM data and exported network traffic generated by
|
||||
[functions/live-capture](../../../functions/live-capture).
|
||||
|
||||
The model is primarily used in the [converting-process](../../converting-process) but is also run in the search UI for inspection purposes.
|
||||
|
||||
The traffic classification model is found in [resources/request-classifier.xml](resources/request-classifier.xml).
|
||||
|
||||
The code evaluating the model is in [DomSampleClassifier.java](java/nu/marginalia/domclassifier/DomSampleClassifier.java).
|
@@ -0,0 +1,112 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE rules [
|
||||
<!ELEMENT rules (classifier*)>
|
||||
<!ELEMENT classifier (#PCDATA)>
|
||||
<!ATTLIST classifier
|
||||
target (url-regex|url|domain|top) #REQUIRED
|
||||
rule (ads|tracking|consent|ignore) #REQUIRED>
|
||||
]>
|
||||
|
||||
<!-- Contains rules for mapping outgoing requests during DOM Sampling to website classification -->
|
||||
<rules>
|
||||
<!-- Regex rules -->
|
||||
<classifier target="url-regex" rule="tracking">/ads/ga-audiences</classifier>
|
||||
<classifier target="url-regex" rule="tracking">/google_top_exp\.js$</classifier>
|
||||
<classifier target="url-regex" rule="tracking">/ccm/collect$</classifier>
|
||||
<classifier target="url-regex" rule="tracking">^/[0-9]+\.js$</classifier>
|
||||
<classifier target="url-regex" rule="tracking">^/[a-z0-9]\.gif$</classifier>
|
||||
<classifier target="url-regex" rule="tracking">^/pixel\.gif$</classifier>
|
||||
<classifier target="url-regex" rule="ads">/pagead/</classifier>
|
||||
<classifier target="url-regex" rule="ads">/google-ads/</classifier>
|
||||
|
||||
<!-- URL classifications TRACKING -->
|
||||
<classifier target="url" rule="tracking">https://googleads.g.doubleclick.net/pagead/id</classifier>
|
||||
<classifier target="url" rule="tracking">https://securepubads.g.doubleclick.net/tag/js/gpt.js</classifier>
|
||||
<classifier target="url" rule="tracking">https://pagead2.googlesyndication.com/ccm/collect</classifier>
|
||||
<classifier target="url" rule="tracking">https://z-na.amazon-adsystem.com/widgets/onejs</classifier>
|
||||
|
||||
<!-- Full domain classifications ADS -->
|
||||
|
||||
<classifier target="domain" rule="ads">securepubads.g.doubleclick.net</classifier>
|
||||
<classifier target="domain" rule="ads">googleads.g.doubleclick.net</classifier>
|
||||
|
||||
<!-- Full domain classifications TRACKING -->
|
||||
<classifier target="domain" rule="tracking">stats.g.doubleclick.net</classifier>
|
||||
<classifier target="domain" rule="tracking">insight.adsrvr.org</classifier>
|
||||
|
||||
<classifier target="domain" rule="tracking">pixel.wp.com</classifier>
|
||||
<classifier target="domain" rule="tracking">connect.facebook.net</classifier>
|
||||
<classifier target="domain" rule="tracking">stats.wp.com</classifier>
|
||||
<classifier target="domain" rule="tracking">track.hubspot.com</classifier>
|
||||
<classifier target="domain" rule="tracking">analytics.tiktok.com</classifier>
|
||||
<classifier target="domain" rule="tracking">analytics-ipv6.tiktokw.us</classifier>
|
||||
<classifier target="domain" rule="tracking">tr6.snapchat.com</classifier>
|
||||
<classifier target="domain" rule="tracking">tr.snapchat.com</classifier>
|
||||
<classifier target="domain" rule="tracking">geo-location.prebid.cloud</classifier>
|
||||
<classifier target="domain" rule="tracking">px.ads.linkedin.com</classifier>
|
||||
<classifier target="domain" rule="tracking">region1.analytics.google.com</classifier>
|
||||
<classifier target="domain" rule="tracking">api.hubapi.com</classifier>
|
||||
<classifier target="domain" rule="tracking">bat.bing.com</classifier>
|
||||
<classifier target="domain" rule="tracking">bat.bing.net</classifier>
|
||||
<classifier target="domain" rule="tracking">c.bing.com</classifier>
|
||||
<classifier target="domain" rule="tracking">c.bing.net</classifier>
|
||||
<classifier target="domain" rule="tracking">analytics.twitter.com</classifier>
|
||||
<classifier target="domain" rule="tracking">play.google.com</classifier>
|
||||
<classifier target="domain" rule="tracking">www.youtube.com</classifier>
|
||||
|
||||
<!-- Full domain classifications CONSENT -->
|
||||
<classifier target="domain" rule="consent">cdnconsents.websitepolicies.com</classifier>
|
||||
|
||||
<!-- Top-level domain classifications - ADS -->
|
||||
<classifier target="top" rule="ads">googlesyndication.com</classifier>
|
||||
<classifier target="top" rule="ads">amazon-adsystem.com</classifier>
|
||||
<classifier target="top" rule="ads">smartadserver.com</classifier>
|
||||
<classifier target="top" rule="ads">googleadservices.com</classifier>
|
||||
|
||||
<classifier target="top" rule="ads">prebid.cloud</classifier>
|
||||
<classifier target="top" rule="ads">pubmine.com</classifier>
|
||||
<classifier target="top" rule="ads">adtrafficquality.google</classifier>
|
||||
<classifier target="top" rule="ads">syndicatedsearch.goog</classifier>
|
||||
<classifier target="top" rule="ads">adsrvr.org</classifier>
|
||||
<classifier target="top" rule="ads">adnxs.net</classifier>
|
||||
<classifier target="top" rule="ads">aditude.io</classifier>
|
||||
<classifier target="top" rule="ads">buysellads.net</classifier>
|
||||
|
||||
<!-- Top-level domain classifications - TRACKING -->
|
||||
<classifier target="top" rule="tracking">plausible.io</classifier>
|
||||
<classifier target="top" rule="tracking">amplitude.com</classifier>
|
||||
<classifier target="top" rule="tracking">hsadspixel.net</classifier>
|
||||
<classifier target="top" rule="tracking">demdex.net</classifier>
|
||||
<classifier target="top" rule="tracking">omtrdc.net</classifier>
|
||||
<classifier target="top" rule="tracking">ggpht.com</classifier>
|
||||
|
||||
<classifier target="top" rule="tracking">doubleclick.net</classifier>
|
||||
<classifier target="top" rule="tracking">google.com</classifier>
|
||||
<classifier target="top" rule="tracking">google.se</classifier>
|
||||
<classifier target="top" rule="tracking">google-analytics.com</classifier>
|
||||
<classifier target="top" rule="tracking">googletagmanager.com</classifier>
|
||||
<classifier target="top" rule="tracking">cloudflareinsights.com</classifier>
|
||||
<classifier target="top" rule="tracking">branch.io</classifier>
|
||||
<classifier target="top" rule="tracking">clarity.ms</classifier>
|
||||
<classifier target="top" rule="tracking">hotjar.com</classifier>
|
||||
<classifier target="top" rule="tracking">hotjar.io</classifier>
|
||||
<classifier target="top" rule="tracking">nr-data.net</classifier>
|
||||
<classifier target="top" rule="tracking">newrelic.com</classifier>
|
||||
<classifier target="top" rule="tracking">siteimproveanalytics.com</classifier>
|
||||
<classifier target="top" rule="tracking">siteimproveanalytics.io</classifier>
|
||||
<classifier target="top" rule="tracking">hs-analytics.net</classifier>
|
||||
<classifier target="top" rule="tracking">sentry.io</classifier>
|
||||
<classifier target="top" rule="tracking">hs-scripts.com</classifier>
|
||||
<classifier target="top" rule="tracking">addtoany.com</classifier>
|
||||
<classifier target="top" rule="tracking">facebook.com</classifier>
|
||||
<classifier target="top" rule="tracking">scorecardresearch.com</classifier>
|
||||
|
||||
<!-- Top-level domain classifications - CONSENT -->
|
||||
<classifier target="top" rule="consent">trustarc.com</classifier>
|
||||
<classifier target="top" rule="consent">truste.com</classifier>
|
||||
<classifier target="top" rule="consent">onetrust.com</classifier>
|
||||
<classifier target="top" rule="consent">cookielaw.org</classifier>
|
||||
<classifier target="top" rule="consent">hs-banner.com</classifier>
|
||||
<!-- Full host name, so this must be a "domain" rule: "top" rules are looked up by the request's top domain (google.com here) and would never match -->
<classifier target="domain" rule="consent">fundingchoicesmessages.google.com</classifier>
|
||||
|
||||
</rules>
|
@@ -0,0 +1,15 @@
|
||||
package nu.marginalia.ddtrackergradar;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.nio.file.Path;
|
||||
|
||||
class DDGTrackerDataTest {
|
||||
@Test
|
||||
public void testLoad() {
|
||||
DDGTrackerData data = new DDGTrackerData();
|
||||
data.loadDomainDir(Path.of("/home/vlofgren/Work/tracker-radar/domains/US/"));
|
||||
data.getDomainInfo("hotjar.com").ifPresent(System.out::println);
|
||||
data.getAllClassifications().forEach(System.out::println);
|
||||
}
|
||||
}
|
@@ -113,6 +113,13 @@ public class DocumentKeywordsBuilder {
|
||||
newWords.forEach(word -> wordToMeta.putIfAbsent(word, meta));
|
||||
}
|
||||
|
||||
public void addSyntheticTerm(String newWord) {
|
||||
byte meta = WordFlags.Synthetic.asBit();
|
||||
|
||||
wordToMeta.putIfAbsent(newWord, meta);
|
||||
}
|
||||
|
||||
|
||||
public List<String> getWordsWithAnyFlag(long flags) {
|
||||
List<String> ret = new ArrayList<>();
|
||||
|
||||
|
@@ -23,6 +23,7 @@ import nu.marginalia.process.control.ProcessHeartbeatImpl;
|
||||
import nu.marginalia.process.log.WorkLog;
|
||||
import nu.marginalia.process.log.WorkLogEntry;
|
||||
import nu.marginalia.service.module.DatabaseModule;
|
||||
import nu.marginalia.service.module.ServiceDiscoveryModule;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.util.SimpleBlockingThreadPool;
|
||||
import nu.marginalia.worklog.BatchingWorkLog;
|
||||
@@ -59,6 +60,7 @@ public class ConverterMain extends ProcessMainClass {
|
||||
Injector injector = Guice.createInjector(
|
||||
new ConverterModule(),
|
||||
new ProcessConfigurationModule("converter"),
|
||||
new ServiceDiscoveryModule(),
|
||||
new DatabaseModule(false)
|
||||
);
|
||||
|
||||
|
@@ -5,10 +5,12 @@ import nu.marginalia.atags.AnchorTextKeywords;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.converting.model.DisqualifiedException;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.converting.processor.classifier.AcceptableAds;
|
||||
import nu.marginalia.converting.processor.plugin.AbstractDocumentProcessorPlugin;
|
||||
import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
|
||||
import nu.marginalia.converting.processor.plugin.PdfDocumentProcessorPlugin;
|
||||
import nu.marginalia.converting.processor.plugin.PlainTextDocumentProcessorPlugin;
|
||||
import nu.marginalia.domclassifier.DomSampleClassification;
|
||||
import nu.marginalia.keyword.LinkTexts;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
@@ -22,7 +24,6 @@ import org.slf4j.LoggerFactory;
|
||||
import org.slf4j.Marker;
|
||||
import org.slf4j.MarkerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@@ -60,6 +61,7 @@ public class DocumentProcessor {
|
||||
public ProcessedDocument process(CrawledDocument crawledDocument,
|
||||
EdgeDomain domain,
|
||||
DomainLinks externalDomainLinks,
|
||||
Set<DomSampleClassification> domSampleClassifications,
|
||||
DocumentDecorator documentDecorator) {
|
||||
ProcessedDocument ret = new ProcessedDocument();
|
||||
|
||||
@@ -79,7 +81,38 @@ public class DocumentProcessor {
|
||||
default -> DocumentClass.EXTERNALLY_LINKED_MULTI;
|
||||
};
|
||||
|
||||
processDocument(crawledDocument, documentClass, documentDecorator, externalDomainLinks, ret);
|
||||
var crawlerStatus = CrawlerDocumentStatus.valueOf(crawledDocument.crawlerStatus);
|
||||
|
||||
if (crawlerStatus != CrawlerDocumentStatus.OK)
|
||||
throw new DisqualifiedException(crawlerStatus);
|
||||
if (AcceptableAds.hasAcceptableAdsHeader(crawledDocument))
|
||||
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.ACCEPTABLE_ADS);
|
||||
if (!isAcceptedContentType(crawledDocument))
|
||||
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.CONTENT_TYPE);
|
||||
|
||||
ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus);
|
||||
|
||||
LinkTexts linkTexts = anchorTextKeywords.getAnchorTextKeywords(externalDomainLinks, ret.url);
|
||||
|
||||
var detailsWithWords =
|
||||
findPlugin(crawledDocument)
|
||||
.createDetails(crawledDocument, linkTexts, domSampleClassifications, documentClass);
|
||||
|
||||
ret.details = detailsWithWords.details();
|
||||
ret.words = detailsWithWords.words();
|
||||
|
||||
if (ret.url.path.equals("/")) {
|
||||
ret.words.addMeta("special:root", WordFlags.Synthetic.asBit());
|
||||
}
|
||||
|
||||
documentDecorator.apply(ret);
|
||||
|
||||
if (Boolean.TRUE.equals(crawledDocument.hasCookies)
|
||||
&& ret.details != null
|
||||
&& ret.details.features != null)
|
||||
{
|
||||
ret.details.features.add(HtmlFeature.COOKIES);
|
||||
}
|
||||
}
|
||||
catch (DisqualifiedException ex) {
|
||||
ret.state = UrlIndexingState.DISQUALIFIED;
|
||||
@@ -89,60 +122,12 @@ public class DocumentProcessor {
|
||||
catch (Exception ex) {
|
||||
ret.state = UrlIndexingState.DISQUALIFIED;
|
||||
ret.stateReason = DisqualifiedException.DisqualificationReason.PROCESSING_EXCEPTION.toString();
|
||||
logger.info(converterAuditMarker, "Failed to convert {}: {}", crawledDocument.url, ex.getClass().getSimpleName());
|
||||
logger.warn(converterAuditMarker, "Failed to convert " + crawledDocument.url, ex);
|
||||
logger.warn(converterAuditMarker, "Failed to convert {}: {}", crawledDocument.url, ex.getClass().getSimpleName());
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
private void processDocument(CrawledDocument crawledDocument,
|
||||
DocumentClass documentClass,
|
||||
DocumentDecorator documentDecorator,
|
||||
DomainLinks externalDomainLinks,
|
||||
ProcessedDocument ret) throws URISyntaxException, IOException, DisqualifiedException
|
||||
{
|
||||
|
||||
var crawlerStatus = CrawlerDocumentStatus.valueOf(crawledDocument.crawlerStatus);
|
||||
if (crawlerStatus != CrawlerDocumentStatus.OK) {
|
||||
throw new DisqualifiedException(crawlerStatus);
|
||||
}
|
||||
|
||||
if (AcceptableAds.hasAcceptableAdsHeader(crawledDocument)) {
|
||||
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.ACCEPTABLE_ADS);
|
||||
}
|
||||
|
||||
if (!isAcceptedContentType(crawledDocument)) {
|
||||
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.CONTENT_TYPE);
|
||||
}
|
||||
|
||||
ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus);
|
||||
|
||||
AbstractDocumentProcessorPlugin plugin = findPlugin(crawledDocument);
|
||||
|
||||
EdgeUrl url = new EdgeUrl(crawledDocument.url);
|
||||
LinkTexts linkTexts = anchorTextKeywords.getAnchorTextKeywords(externalDomainLinks, url);
|
||||
|
||||
AbstractDocumentProcessorPlugin.DetailsWithWords detailsWithWords = plugin.createDetails(crawledDocument, linkTexts, documentClass);
|
||||
|
||||
ret.details = detailsWithWords.details();
|
||||
ret.words = detailsWithWords.words();
|
||||
|
||||
if (url.path.equals("/")) {
|
||||
ret.words.addMeta("special:root", WordFlags.Synthetic.asBit());
|
||||
}
|
||||
|
||||
documentDecorator.apply(ret);
|
||||
|
||||
if (Boolean.TRUE.equals(crawledDocument.hasCookies)
|
||||
&& ret.details != null
|
||||
&& ret.details.features != null)
|
||||
{
|
||||
ret.details.features.add(HtmlFeature.COOKIES);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private AbstractDocumentProcessorPlugin findPlugin(CrawledDocument crawledDocument) throws DisqualifiedException {
|
||||
for (var plugin : processorPlugins) {
|
||||
if (plugin.isApplicable(crawledDocument))
|
||||
|
@@ -1,6 +1,9 @@
|
||||
package nu.marginalia.converting.processor;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import io.grpc.Status;
|
||||
import io.grpc.StatusRuntimeException;
|
||||
import nu.marginalia.api.domsample.DomSampleClient;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.atags.source.AnchorTagsSource;
|
||||
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
|
||||
@@ -12,11 +15,14 @@ import nu.marginalia.converting.processor.logic.links.TopKeywords;
|
||||
import nu.marginalia.converting.sideload.SideloadSource;
|
||||
import nu.marginalia.converting.writer.ConverterBatchWritableIf;
|
||||
import nu.marginalia.converting.writer.ConverterBatchWriter;
|
||||
import nu.marginalia.domclassifier.DomSampleClassification;
|
||||
import nu.marginalia.domclassifier.DomSampleClassifier;
|
||||
import nu.marginalia.geoip.GeoIpDictionary;
|
||||
import nu.marginalia.geoip.sources.AsnTable;
|
||||
import nu.marginalia.io.SerializableCrawlDataStream;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
import nu.marginalia.model.crawl.UrlIndexingState;
|
||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||
import nu.marginalia.model.crawldata.CrawledDomain;
|
||||
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
|
||||
@@ -28,7 +34,11 @@ import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.time.Duration;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class DomainProcessor {
|
||||
@@ -36,21 +46,29 @@ public class DomainProcessor {
|
||||
private final SiteWords siteWords;
|
||||
private final AnchorTagsSource anchorTagsSource;
|
||||
private final GeoIpDictionary geoIpDictionary;
|
||||
private final DomSampleClient domSampleClient;
|
||||
private final DomSampleClassifier domSampleClassifier;
|
||||
private final ExecutorService domSampleExecutor = Executors.newCachedThreadPool();
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final boolean hasDomSamples;
|
||||
|
||||
@Inject
|
||||
public DomainProcessor(DocumentProcessor documentProcessor,
|
||||
SiteWords siteWords,
|
||||
AnchorTagsSourceFactory anchorTagsSourceFactory,
|
||||
GeoIpDictionary geoIpDictionary) throws SQLException
|
||||
{
|
||||
DomSampleClient domSampleClient,
|
||||
GeoIpDictionary geoIpDictionary,
|
||||
DomSampleClassifier domSampleClassifier) throws SQLException, InterruptedException {
|
||||
this.documentProcessor = documentProcessor;
|
||||
this.siteWords = siteWords;
|
||||
this.anchorTagsSource = anchorTagsSourceFactory.create();
|
||||
this.geoIpDictionary = geoIpDictionary;
|
||||
this.domSampleClient = domSampleClient;
|
||||
this.domSampleClassifier = domSampleClassifier;
|
||||
|
||||
geoIpDictionary.waitReady();
|
||||
hasDomSamples = !Boolean.getBoolean("converter.ignoreDomSampleData") && domSampleClient.waitReady(Duration.ofSeconds(15));
|
||||
}
|
||||
|
||||
public SimpleProcessing simpleProcessing(SerializableCrawlDataStream dataStream, int sizeHint, Collection<String> extraKeywords) {
|
||||
@@ -73,6 +91,27 @@ public class DomainProcessor {
|
||||
}
|
||||
}
|
||||
|
||||
/** Fetch and process the DOM sample and extract classifications */
|
||||
private Set<DomSampleClassification> getDomainClassifications(String domainName) throws ExecutionException, InterruptedException {
|
||||
if (!hasDomSamples) {
|
||||
return EnumSet.of(DomSampleClassification.UNCLASSIFIED);
|
||||
}
|
||||
|
||||
return domSampleClient
|
||||
.getSampleAsync(domainName, domSampleExecutor)
|
||||
.thenApply(domSampleClassifier::classifySample)
|
||||
.handle((a,b) -> {
|
||||
if (b != null) {
|
||||
var cause = b.getCause();
|
||||
if (!(cause instanceof StatusRuntimeException sre && sre.getStatus() != Status.NOT_FOUND)) {
|
||||
logger.warn("Exception when fetching sample data", b);
|
||||
}
|
||||
return EnumSet.of(DomSampleClassification.UNCLASSIFIED);
|
||||
}
|
||||
return a;
|
||||
}).get();
|
||||
}
|
||||
|
||||
@Nullable
|
||||
public ProcessedDomain fullProcessing(SerializableCrawlDataStream dataStream) {
|
||||
try {
|
||||
@@ -80,7 +119,6 @@ public class DomainProcessor {
|
||||
return null;
|
||||
}
|
||||
|
||||
List<ProcessedDocument> docs = new ArrayList<>();
|
||||
Set<String> processedUrls = new HashSet<>();
|
||||
|
||||
if (!(dataStream.next() instanceof CrawledDomain crawledDomain)) {
|
||||
@@ -94,10 +132,12 @@ public class DomainProcessor {
|
||||
|
||||
ProcessedDomain ret = new ProcessedDomain();
|
||||
processDomain(crawledDomain, ret, documentDecorator);
|
||||
ret.documents = docs;
|
||||
ret.documents = new ArrayList<>();
|
||||
|
||||
// Process Documents
|
||||
|
||||
Set<DomSampleClassification> classifications = getDomainClassifications(crawledDomain.getDomain());
|
||||
|
||||
try (var deduplicator = new LshDocumentDeduplicator()) {
|
||||
while (dataStream.hasNext()) {
|
||||
if (!(dataStream.next() instanceof CrawledDocument doc))
|
||||
@@ -110,9 +150,23 @@ public class DomainProcessor {
|
||||
continue;
|
||||
|
||||
try {
|
||||
var processedDoc = documentProcessor.process(doc, ret.domain, externalDomainLinks, documentDecorator);
|
||||
deduplicator.markIfDuplicate(processedDoc);
|
||||
docs.add(processedDoc);
|
||||
var processedDoc = documentProcessor.process(doc, ret.domain, externalDomainLinks, classifications, documentDecorator);
|
||||
|
||||
if (deduplicator.isDocumentDuplicate(processedDoc)) {
|
||||
processedDoc.state = UrlIndexingState.DISQUALIFIED;
|
||||
processedDoc.stateReason = "Duplicate";
|
||||
}
|
||||
|
||||
if (processedDoc.isOk() && processedDoc.words != null && processedDoc.details != null) {
|
||||
classifications.forEach(classification -> {
|
||||
if (classification.htmlFeature == null) return;
|
||||
|
||||
processedDoc.words.addSyntheticTerm(classification.htmlFeature.getKeyword());
|
||||
processedDoc.details.features.add(classification.htmlFeature);
|
||||
});
|
||||
}
|
||||
|
||||
ret.documents.add(processedDoc);
|
||||
} catch (Exception ex) {
|
||||
logger.warn("Failed to process " + doc.url, ex);
|
||||
}
|
||||
@@ -142,15 +196,17 @@ public class DomainProcessor {
|
||||
private final DomainLinks externalDomainLinks;
|
||||
private final LshDocumentDeduplicator deduplicator = new LshDocumentDeduplicator();
|
||||
|
||||
Set<DomSampleClassification> classifications;
|
||||
|
||||
private static final ProcessingIterator.Factory iteratorFactory = ProcessingIterator.factory(8,
|
||||
Integer.getInteger("java.util.concurrent.ForkJoinPool.common.parallelism", Runtime.getRuntime().availableProcessors())
|
||||
);
|
||||
|
||||
SimpleProcessing(SerializableCrawlDataStream dataStream, int sizeHint) throws IOException {
|
||||
SimpleProcessing(SerializableCrawlDataStream dataStream, int sizeHint) throws Exception {
|
||||
this(dataStream, sizeHint, List.of());
|
||||
}
|
||||
|
||||
SimpleProcessing(SerializableCrawlDataStream dataStream, int sizeHint, Collection<String> extraKeywords) throws IOException {
|
||||
SimpleProcessing(SerializableCrawlDataStream dataStream, int sizeHint, Collection<String> extraKeywords) throws Exception {
|
||||
this.dataStream = dataStream;
|
||||
|
||||
if (!dataStream.hasNext() || !(dataStream.next() instanceof CrawledDomain crawledDomain))
|
||||
@@ -166,6 +222,8 @@ public class DomainProcessor {
|
||||
|
||||
processDomain(crawledDomain, domain, documentDecorator);
|
||||
|
||||
classifications = getDomainClassifications(crawledDomain.getDomain());
|
||||
|
||||
externalDomainLinks = anchorTagsSource.getAnchorTags(domain.domain);
|
||||
}
|
||||
|
||||
@@ -187,16 +245,31 @@ public class DomainProcessor {
|
||||
|
||||
|
||||
taskConsumer.accept(() -> {
|
||||
var processedDoc = documentProcessor.process(doc, domain.domain, externalDomainLinks, documentDecorator);
|
||||
var processedDoc = documentProcessor.process(doc, domain.domain, externalDomainLinks, classifications, documentDecorator);
|
||||
|
||||
synchronized (deduplicator) {
|
||||
deduplicator.markIfDuplicate(processedDoc);
|
||||
if (deduplicator.isDocumentDuplicate(processedDoc)) {
|
||||
processedDoc.state = UrlIndexingState.DISQUALIFIED;
|
||||
processedDoc.stateReason = "Duplicate";
|
||||
}
|
||||
}
|
||||
|
||||
if (processedDoc.isProcessedFully()) {
|
||||
// This is a bit sketchy, but we need to set the size and topology to something
|
||||
processedDoc.details.metadata = processedDoc.details.metadata.withSizeAndTopology(
|
||||
10_000, externalDomainLinks.countForUrl(processedDoc.url));
|
||||
|
||||
// Apply classifications
|
||||
try {
    classifications.forEach(classification -> {
        // Classifications without an associated HTML feature add nothing to the document
        if (classification.htmlFeature == null) return;

        processedDoc.words.addSyntheticTerm(classification.htmlFeature.getKeyword());
        processedDoc.details.features.add(classification.htmlFeature);
    });
}
catch (Exception ex) {
    // Tagging is best-effort: keep the document, but do not hide the failure
    logger.warn("Failed to apply DOM sample classifications to " + processedDoc.url, ex);
}
|
||||
}
|
||||
|
||||
return processedDoc;
|
||||
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.converting.processor;
|
||||
package nu.marginalia.converting.processor.classifier;
|
||||
|
||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||
import org.jsoup.nodes.Document;
|
@@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.logic;
|
||||
|
||||
import crawlercommons.utils.Strings;
|
||||
import nu.marginalia.converting.model.DisqualifiedException;
|
||||
import nu.marginalia.domclassifier.DomSampleClassification;
|
||||
import nu.marginalia.model.DocumentFormat;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||
@@ -14,6 +15,8 @@ import org.jsoup.select.NodeVisitor;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import static nu.marginalia.domclassifier.DomSampleClassification.*;
|
||||
|
||||
public class DocumentValuator {
|
||||
|
||||
public double getQuality(CrawledDocument crawledDocument,
|
||||
@@ -126,6 +129,25 @@ public class DocumentValuator {
|
||||
return quality + adjustment;
|
||||
}
|
||||
|
||||
public double getQuality(Set<DomSampleClassification> classifications) {
|
||||
double quality = 0;
|
||||
if (classifications.contains(ADS)) {
|
||||
quality -= 6;
|
||||
}
|
||||
if (classifications.contains(TRACKING)) {
|
||||
quality -= 4;
|
||||
}
|
||||
|
||||
if (classifications.contains(CONSENT)) {
|
||||
quality -= 4;
|
||||
}
|
||||
else if (classifications.contains(POPOVER)) {
|
||||
quality -= 4;
|
||||
}
|
||||
|
||||
return quality;
|
||||
}
|
||||
|
||||
public static class ScriptVisitor implements NodeVisitor {
|
||||
boolean hasBadScript = false;
|
||||
int scriptLength = 0;
|
||||
|
@@ -3,7 +3,6 @@ package nu.marginalia.converting.processor.logic;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.converting.model.DocumentHeaders;
|
||||
import nu.marginalia.converting.processor.classifier.adblock.AdblockSimulator;
|
||||
import nu.marginalia.converting.processor.classifier.adblock.GoogleAnwersSpamDetector;
|
||||
import nu.marginalia.converting.processor.classifier.topic.RecipeDetector;
|
||||
import nu.marginalia.converting.processor.classifier.topic.TextileCraftDetector;
|
||||
@@ -65,20 +64,17 @@ public class FeatureExtractor {
|
||||
"counter.yadro.ru"
|
||||
);
|
||||
|
||||
private final AdblockSimulator adblockSimulator;
|
||||
private final RecipeDetector recipeDetector;
|
||||
private final TextileCraftDetector textileCraftDetector;
|
||||
private final WoodworkingDetector woodworkingDetector;
|
||||
private final GoogleAnwersSpamDetector googleAnwersSpamDetector;
|
||||
|
||||
@Inject
|
||||
public FeatureExtractor(AdblockSimulator adblockSimulator,
|
||||
RecipeDetector recipeDetector,
|
||||
public FeatureExtractor(RecipeDetector recipeDetector,
|
||||
TextileCraftDetector textileCraftDetector,
|
||||
WoodworkingDetector woodworkingDetector,
|
||||
GoogleAnwersSpamDetector googleAnwersSpamDetector)
|
||||
{
|
||||
this.adblockSimulator = adblockSimulator;
|
||||
this.recipeDetector = recipeDetector;
|
||||
this.textileCraftDetector = textileCraftDetector;
|
||||
this.woodworkingDetector = woodworkingDetector;
|
||||
@@ -218,13 +214,6 @@ public class FeatureExtractor {
|
||||
}
|
||||
}
|
||||
|
||||
if (features.contains(HtmlFeature.JS)
|
||||
// remove while disabled to get rid of expensive clone() call:
|
||||
// adblockSimulator.hasAds(doc.clone())
|
||||
) {
|
||||
features.add(HtmlFeature.ADVERTISEMENT);
|
||||
}
|
||||
|
||||
if (!doc.getElementsByTag("object").isEmpty()
|
||||
|| !doc.getElementsByTag("audio").isEmpty()
|
||||
|| !doc.getElementsByTag("video").isEmpty()) {
|
||||
|
@@ -1,7 +1,6 @@
|
||||
package nu.marginalia.converting.processor.logic;
|
||||
|
||||
import gnu.trove.list.array.TLongArrayList;
|
||||
import nu.marginalia.model.crawl.UrlIndexingState;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.lsh.EasyLSH;
|
||||
|
||||
@@ -14,26 +13,25 @@ public class LshDocumentDeduplicator implements AutoCloseable {
|
||||
private final TLongArrayList hashCodes = new TLongArrayList(1000);
|
||||
private static final int DISTANCE_THRESHOLD = 2;
|
||||
|
||||
public void markIfDuplicate(ProcessedDocument document) {
|
||||
if (!document.isProcessedFully()) {
|
||||
return;
|
||||
}
|
||||
public boolean isDocumentDuplicate(ProcessedDocument document) {
|
||||
if (!document.isOk()) return false;
|
||||
if (document.words == null) return false;
|
||||
if (document.details == null) return false;
|
||||
|
||||
if (document.words.size() < 100) {
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
|
||||
long hashCode = document.details.hashCode;
|
||||
|
||||
for (int i = 0; i < hashCodes.size(); i++) {
|
||||
if (EasyLSH.hammingDistance(hashCode, hashCodes.get(i)) < DISTANCE_THRESHOLD) {
|
||||
document.state = UrlIndexingState.DISQUALIFIED;
|
||||
document.stateReason = "Duplicate";
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
hashCodes.add(hashCode);
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@@ -3,6 +3,7 @@ package nu.marginalia.converting.processor.plugin;
|
||||
import nu.marginalia.converting.model.DisqualifiedException;
|
||||
import nu.marginalia.converting.model.ProcessedDocumentDetails;
|
||||
import nu.marginalia.converting.processor.DocumentClass;
|
||||
import nu.marginalia.domclassifier.DomSampleClassification;
|
||||
import nu.marginalia.keyword.LinkTexts;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
import nu.marginalia.language.filter.LanguageFilter;
|
||||
@@ -26,7 +27,7 @@ public abstract class AbstractDocumentProcessorPlugin {
|
||||
this.languageFilter = languageFilter;
|
||||
}
|
||||
|
||||
public abstract DetailsWithWords createDetails(CrawledDocument crawledDocument, LinkTexts linkTexts, DocumentClass documentClass) throws DisqualifiedException, URISyntaxException, IOException;
|
||||
public abstract DetailsWithWords createDetails(CrawledDocument crawledDocument, LinkTexts linkTexts, Set<DomSampleClassification> domSampleClassifications, DocumentClass documentClass) throws DisqualifiedException, URISyntaxException, IOException;
|
||||
public abstract boolean isApplicable(CrawledDocument doc);
|
||||
|
||||
protected void checkDocumentLanguage(DocumentLanguageData dld) throws DisqualifiedException {
|
||||
|
@@ -6,15 +6,16 @@ import nu.marginalia.converting.model.DisqualifiedException;
|
||||
import nu.marginalia.converting.model.DocumentHeaders;
|
||||
import nu.marginalia.converting.model.GeneratorType;
|
||||
import nu.marginalia.converting.model.ProcessedDocumentDetails;
|
||||
import nu.marginalia.converting.processor.AcceptableAds;
|
||||
import nu.marginalia.converting.processor.DocumentClass;
|
||||
import nu.marginalia.converting.processor.MetaRobotsTag;
|
||||
import nu.marginalia.converting.processor.classifier.AcceptableAds;
|
||||
import nu.marginalia.converting.processor.logic.*;
|
||||
import nu.marginalia.converting.processor.logic.dom.MeasureLengthVisitor;
|
||||
import nu.marginalia.converting.processor.logic.links.FileLinks;
|
||||
import nu.marginalia.converting.processor.logic.links.LinkProcessor;
|
||||
import nu.marginalia.converting.processor.plugin.specialization.HtmlProcessorSpecializations;
|
||||
import nu.marginalia.converting.processor.pubdate.PubDateSniffer;
|
||||
import nu.marginalia.domclassifier.DomSampleClassification;
|
||||
import nu.marginalia.gregex.GuardedRegex;
|
||||
import nu.marginalia.gregex.GuardedRegexFactory;
|
||||
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
||||
@@ -23,7 +24,6 @@ import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
import nu.marginalia.language.filter.LanguageFilter;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
||||
import nu.marginalia.link_parser.FeedExtractor;
|
||||
import nu.marginalia.link_parser.LinkParser;
|
||||
import nu.marginalia.model.DocumentFormat;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
@@ -62,12 +62,10 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
private static final DocumentValuator documentValuator = new DocumentValuator();
|
||||
|
||||
private static final LinkParser linkParser = new LinkParser();
|
||||
private static final FeedExtractor feedExtractor = new FeedExtractor(linkParser);
|
||||
|
||||
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
|
||||
private final HtmlProcessorSpecializations htmlProcessorSpecializations;
|
||||
|
||||
private static final int MAX_DOCUMENT_LENGTH_BYTES = Integer.getInteger("converter.max-body-length",128_000);
|
||||
private static boolean lenientProcessing = Boolean.getBoolean("converter.lenientProcessing");
|
||||
|
||||
@Inject
|
||||
@@ -106,7 +104,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
@Override
|
||||
public DetailsWithWords createDetails(CrawledDocument crawledDocument,
|
||||
LinkTexts linkTexts,
|
||||
DocumentClass documentClass)
|
||||
Set<DomSampleClassification> domSampleClassifications, DocumentClass documentClass)
|
||||
throws DisqualifiedException, URISyntaxException, IOException {
|
||||
|
||||
if (!lenientProcessing && languageFilter.isBlockedUnicodeRange(crawledDocument.documentBody(512))) {
|
||||
@@ -138,7 +136,14 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
|
||||
final int length = getLength(doc);
|
||||
final DocumentFormat format = getDocumentFormat(doc);
|
||||
final double quality = documentValuator.getQuality(crawledDocument, format, doc, length);
|
||||
final double quality;
|
||||
|
||||
if (domSampleClassifications.contains(DomSampleClassification.UNCLASSIFIED)) {
|
||||
quality = documentValuator.getQuality(crawledDocument, format, doc, length);
|
||||
}
|
||||
else {
|
||||
quality = documentValuator.getQuality(domSampleClassifications);
|
||||
}
|
||||
|
||||
if (!lenientProcessing && isDisqualified(documentClass, url, quality, doc.title())) {
|
||||
throw new DisqualifiedException(DisqualificationReason.QUALITY);
|
||||
@@ -148,10 +153,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
|
||||
checkDocumentLanguage(dld);
|
||||
|
||||
if (!lenientProcessing && !documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier())) {
|
||||
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH);
|
||||
}
|
||||
|
||||
var ret = new ProcessedDocumentDetails();
|
||||
|
||||
ret.length = length;
|
||||
@@ -160,6 +161,11 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
|
||||
final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, documentHeaders, dld);
|
||||
|
||||
if (!documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier())) {
|
||||
features.add(HtmlFeature.SHORT_DOCUMENT);
|
||||
}
|
||||
|
||||
|
||||
ret.features = features;
|
||||
ret.quality = documentValuator.adjustQuality(quality, features);
|
||||
ret.hashCode = dld.localitySensitiveHashCode();
|
||||
|
@@ -7,6 +7,7 @@ import nu.marginalia.converting.model.ProcessedDocumentDetails;
|
||||
import nu.marginalia.converting.processor.DocumentClass;
|
||||
import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
|
||||
import nu.marginalia.converting.processor.plugin.specialization.DefaultSpecialization;
|
||||
import nu.marginalia.domclassifier.DomSampleClassification;
|
||||
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
||||
import nu.marginalia.keyword.LinkTexts;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
@@ -77,7 +78,7 @@ public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
@Override
|
||||
public DetailsWithWords createDetails(CrawledDocument crawledDocument,
|
||||
LinkTexts linkTexts,
|
||||
DocumentClass documentClass)
|
||||
Set<DomSampleClassification> domSampleClassifications, DocumentClass documentClass)
|
||||
throws DisqualifiedException, URISyntaxException, IOException {
|
||||
|
||||
String documentBody = crawledDocument.documentBody();
|
||||
@@ -114,7 +115,9 @@ public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
|
||||
ret.quality = -5;
|
||||
|
||||
ret.features = Set.of(HtmlFeature.PDF);
|
||||
ret.features = new HashSet<>(); // must be mutable!
|
||||
ret.features.add(HtmlFeature.PDF);
|
||||
|
||||
ret.description = getDescription(doc);
|
||||
ret.hashCode = dld.localitySensitiveHashCode();
|
||||
|
||||
|
@@ -8,6 +8,7 @@ import nu.marginalia.converting.processor.DocumentClass;
|
||||
import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
|
||||
import nu.marginalia.converting.processor.logic.PlainTextLogic;
|
||||
import nu.marginalia.converting.util.LineUtils;
|
||||
import nu.marginalia.domclassifier.DomSampleClassification;
|
||||
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
||||
import nu.marginalia.keyword.LinkTexts;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
@@ -23,10 +24,7 @@ import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
import java.time.LocalDate;
|
||||
import java.util.ArrayList;
|
||||
import java.util.EnumSet;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {
|
||||
@@ -70,7 +68,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
|
||||
@Override
|
||||
public DetailsWithWords createDetails(CrawledDocument crawledDocument,
|
||||
LinkTexts linkTexts,
|
||||
DocumentClass documentClass)
|
||||
Set<DomSampleClassification> domSampleClassifications, DocumentClass documentClass)
|
||||
throws DisqualifiedException, URISyntaxException {
|
||||
|
||||
String documentBody = crawledDocument.documentBody();
|
||||
|
@@ -7,6 +7,7 @@ import nu.marginalia.converting.model.GeneratorType;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.converting.processor.DocumentClass;
|
||||
import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
|
||||
import nu.marginalia.domclassifier.DomSampleClassification;
|
||||
import nu.marginalia.keyword.LinkTexts;
|
||||
import nu.marginalia.model.DocumentFormat;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
@@ -64,7 +65,7 @@ public class SideloaderProcessing {
|
||||
|
||||
var ret = new ProcessedDocument();
|
||||
try {
|
||||
var details = htmlProcessorPlugin.createDetails(crawledDoc, linkTexts, documentClass);
|
||||
var details = htmlProcessorPlugin.createDetails(crawledDoc, linkTexts, EnumSet.noneOf(DomSampleClassification.class), documentClass);
|
||||
|
||||
ret.words = details.words();
|
||||
|
||||
|
@@ -0,0 +1,16 @@
|
||||
package nu.marginalia.converting.processor.classifier.adblock;
|
||||
|
||||
import nu.marginalia.domclassifier.DomSampleClassifier;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import java.io.IOException;
|
||||
|
||||
class DomSampleClassifierTest {
|
||||
|
||||
@Test
|
||||
public void testLoadSpecs() throws ParserConfigurationException, IOException, SAXException {
|
||||
new DomSampleClassifier();
|
||||
}
|
||||
}
|
@@ -25,6 +25,7 @@ import java.net.URISyntaxException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.time.Instant;
|
||||
import java.util.Set;
|
||||
|
||||
@Tag("flaky")
|
||||
class PdfDocumentProcessorPluginTest {
|
||||
@@ -51,7 +52,7 @@ class PdfDocumentProcessorPluginTest {
|
||||
}
|
||||
public AbstractDocumentProcessorPlugin.DetailsWithWords testPdfFile(byte[] pdfBytes) throws Exception {
|
||||
var doc = new CrawledDocument("test", "https://www.example.com/sample.pdf", "application/pdf", Instant.now().toString(), 200, "OK", "OK", "", pdfBytes, false, -1, null, null);
|
||||
return plugin.createDetails(doc, new LinkTexts(), DocumentClass.NORMAL);
|
||||
return plugin.createDetails(doc, new LinkTexts(), Set.of(), DocumentClass.NORMAL);
|
||||
}
|
||||
|
||||
public AbstractDocumentProcessorPlugin.DetailsWithWords testPdfFile(Path file) throws Exception {
|
||||
|
@@ -61,14 +61,14 @@ public class BackoffStrategy {
|
||||
};
|
||||
|
||||
double backoffMinutes = baseInterval.toMinutes()
|
||||
* Math.pow(multiplier, backoffConsecutiveFailures - 1);
|
||||
* Math.pow(multiplier, Math.clamp(backoffConsecutiveFailures, 1, 10));
|
||||
|
||||
Duration newDuration = Duration.ofMinutes(Math.round(0.5+backoffMinutes));
|
||||
if (newDuration.compareTo(maxInterval) > 0) {
|
||||
var backoffVal = Math.round(0.5+backoffMinutes);
|
||||
if (backoffVal > maxInterval.toMinutes()) {
|
||||
return maxInterval;
|
||||
}
|
||||
|
||||
return newDuration;
|
||||
return Duration.ofMinutes(backoffVal);
|
||||
}
|
||||
|
||||
private Duration addJitter(Duration duration) {
|
||||
|
@@ -44,6 +44,7 @@ dependencies {
|
||||
implementation project(':code:functions:favicon:api')
|
||||
implementation project(':code:functions:domain-info:api')
|
||||
implementation project(':code:functions:search-query:api')
|
||||
implementation project(':code:processes:converting-process:ft-dom-classifier')
|
||||
|
||||
|
||||
implementation project(':code:index:api')
|
||||
|
@@ -132,46 +132,6 @@ public class SearchFrontPageService {
|
||||
return new IndexModel(items, refreshDateStr, searchVisitorCount.getQueriesPerMinute());
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* FIXME
|
||||
public Object renderNewsFeed(Request request, Response response) {
|
||||
List<NewsItem> newsItems = getNewsItems();
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
sb.append("""
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<rss version="2.0">
|
||||
<channel>
|
||||
<title>Marginalia Search News and Mentions</title>
|
||||
<link>https://search.marginalia.nu/</link>
|
||||
<description>News and Mentions of Marginalia Search</description>
|
||||
<language>en-us</language>
|
||||
<ttl>60</ttl>
|
||||
""");
|
||||
|
||||
sb.append("<lastBuildDate>").append(ZonedDateTime.now().format(DateTimeFormatter.RFC_1123_DATE_TIME)).append("</lastBuildDate>\n");
|
||||
sb.append("<pubDate>").append(ZonedDateTime.now().format(DateTimeFormatter.RFC_1123_DATE_TIME)).append("</pubDate>\n");
|
||||
sb.append("<ttl>60</ttl>\n");
|
||||
for (var item : newsItems) {
|
||||
sb.append("<item>\n");
|
||||
sb.append("<title>").append(item.title()).append("</title>\n");
|
||||
sb.append("<link>").append(item.url()).append("</link>\n");
|
||||
if (item.source != null) {
|
||||
sb.append("<author>").append(item.source()).append("</author>\n");
|
||||
}
|
||||
sb.append("<pubDate>").append(item.date().atStartOfDay().atZone(ZoneId.systemDefault()).format(DateTimeFormatter.RFC_1123_DATE_TIME)).append("</pubDate>\n");
|
||||
sb.append("</item>\n");
|
||||
}
|
||||
sb.append("</channel>\n");
|
||||
sb.append("</rss>\n");
|
||||
|
||||
response.type("application/rss+xml");
|
||||
|
||||
return sb.toString();
|
||||
}*/
|
||||
|
||||
public record IndexModel(List<NewsItemCluster> news,
|
||||
String refreshDate,
|
||||
int searchPerMinute) { }
|
||||
|
@@ -1,6 +1,7 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import io.jooby.Context;
|
||||
import io.jooby.MapModelAndView;
|
||||
@@ -9,12 +10,20 @@ import io.jooby.annotation.*;
|
||||
import nu.marginalia.api.domains.DomainInfoClient;
|
||||
import nu.marginalia.api.domains.model.DomainInformation;
|
||||
import nu.marginalia.api.domains.model.SimilarDomain;
|
||||
import nu.marginalia.api.domsample.DomSampleClient;
|
||||
import nu.marginalia.api.domsample.RpcDomainSampleRequests;
|
||||
import nu.marginalia.api.domsample.RpcOutgoingRequest;
|
||||
import nu.marginalia.api.feeds.FeedsClient;
|
||||
import nu.marginalia.api.feeds.RpcFeed;
|
||||
import nu.marginalia.api.feeds.RpcFeedItem;
|
||||
import nu.marginalia.api.livecapture.LiveCaptureClient;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.ddtrackergradar.DDGTrackerData;
|
||||
import nu.marginalia.ddtrackergradar.model.DDGTDomain;
|
||||
import nu.marginalia.domclassifier.DomSampleClassification;
|
||||
import nu.marginalia.domclassifier.DomSampleClassifier;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.screenshot.ScreenshotService;
|
||||
import nu.marginalia.search.SearchOperator;
|
||||
import nu.marginalia.search.model.GroupedUrlDetails;
|
||||
@@ -26,6 +35,7 @@ import nu.marginalia.service.server.RateLimiter;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
@@ -34,6 +44,9 @@ import java.util.concurrent.Future;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.function.Supplier;
|
||||
|
||||
import static nu.marginalia.search.svc.SearchSiteInfoService.TrafficSample.*;
|
||||
|
||||
@Singleton
|
||||
public class SearchSiteInfoService {
|
||||
private static final Logger logger = LoggerFactory.getLogger(SearchSiteInfoService.class);
|
||||
|
||||
@@ -43,13 +56,17 @@ public class SearchSiteInfoService {
|
||||
private final DbDomainQueries domainQueries;
|
||||
private final FeedsClient feedsClient;
|
||||
private final LiveCaptureClient liveCaptureClient;
|
||||
private final DomSampleClient domSampleClient;
|
||||
private final ScreenshotService screenshotService;
|
||||
|
||||
private final HikariDataSource dataSource;
|
||||
private final DDGTrackerData ddgTrackerData;
|
||||
private final SearchSiteSubscriptionService searchSiteSubscriptions;
|
||||
|
||||
private final RateLimiter rateLimiter = RateLimiter.custom(60);
|
||||
|
||||
private final DomSampleClassifier domSampleClassifier;
|
||||
|
||||
@Inject
|
||||
public SearchSiteInfoService(SearchOperator searchOperator,
|
||||
DomainInfoClient domainInfoClient,
|
||||
@@ -59,6 +76,9 @@ public class SearchSiteInfoService {
|
||||
LiveCaptureClient liveCaptureClient,
|
||||
ScreenshotService screenshotService,
|
||||
HikariDataSource dataSource,
|
||||
DomSampleClient domSampleClient,
|
||||
DomSampleClassifier domSampleClassifier,
|
||||
DDGTrackerData ddgTrackerData,
|
||||
SearchSiteSubscriptionService searchSiteSubscriptions)
|
||||
{
|
||||
this.searchOperator = searchOperator;
|
||||
@@ -70,6 +90,9 @@ public class SearchSiteInfoService {
|
||||
this.liveCaptureClient = liveCaptureClient;
|
||||
this.screenshotService = screenshotService;
|
||||
this.dataSource = dataSource;
|
||||
this.domSampleClient = domSampleClient;
|
||||
this.domSampleClassifier = domSampleClassifier;
|
||||
this.ddgTrackerData = ddgTrackerData;
|
||||
this.searchSiteSubscriptions = searchSiteSubscriptions;
|
||||
|
||||
Thread.ofPlatform().name("Recently Added Domains Model Updater").start(this::modelUpdater);
|
||||
@@ -154,6 +177,7 @@ public class SearchSiteInfoService {
|
||||
case "links" -> listLinks(domainName, page);
|
||||
case "docs" -> listDocs(domainName, page);
|
||||
case "info" -> listInfo(context, domainName);
|
||||
case "traffic" -> listSiteRequests(context, domainName);
|
||||
case "report" -> reportSite(domainName);
|
||||
default -> listInfo(context, domainName);
|
||||
};
|
||||
@@ -239,6 +263,7 @@ public class SearchSiteInfoService {
|
||||
String url = "https://" + domainName + "/";
|
||||
|
||||
boolean hasScreenshot = screenshotService.hasScreenshot(domainId);
|
||||
|
||||
boolean isSubscribed = searchSiteSubscriptions.isSubscribed(context, domain);
|
||||
|
||||
boolean rateLimited = !rateLimiter.isAllowed();
|
||||
@@ -368,6 +393,91 @@ public class SearchSiteInfoService {
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
private SiteInfoModel listSiteRequests(Context context, String domainName) {
|
||||
if (!rateLimiter.isAllowed()) {
|
||||
return forServiceUnavailable(domainName);
|
||||
}
|
||||
|
||||
Optional<RpcDomainSampleRequests> sample = domSampleClient.getSampleRequests(domainName.toLowerCase());
|
||||
if (sample.isEmpty()) {
|
||||
return forNoData(domainName);
|
||||
}
|
||||
|
||||
final EdgeDomain currentDomain = new EdgeDomain(domainName);
|
||||
final List<RequestsForTargetDomain> requests = new ArrayList<>();
|
||||
final Map<EdgeDomain, List<Map.Entry<EdgeUrl, RpcOutgoingRequest>>> urlsPerDomain = new HashMap<>();
|
||||
|
||||
final Set<EdgeUrl> seenUrls = new HashSet<>();
|
||||
|
||||
for (RpcOutgoingRequest rpcOutgoingRequest : sample.get().getOutgoingRequestsList()) {
|
||||
Optional<EdgeUrl> parsedUrl = EdgeUrl.parse(rpcOutgoingRequest.getUrl());
|
||||
if (parsedUrl.isEmpty())
|
||||
continue;
|
||||
|
||||
final EdgeUrl url = parsedUrl.get();
|
||||
|
||||
if (url.domain.hasSameTopDomain(currentDomain))
|
||||
continue;
|
||||
if (!seenUrls.add(url))
|
||||
continue;
|
||||
|
||||
urlsPerDomain
|
||||
.computeIfAbsent(url.getDomain(), k -> new ArrayList<>())
|
||||
.add(Map.entry(url, rpcOutgoingRequest));
|
||||
}
|
||||
|
||||
Map<DomSampleClassification, Integer> requestSummary = new HashMap<>();
|
||||
|
||||
urlsPerDomain.forEach((requestDomain, urlsAndReqs) -> {
|
||||
final List<RequestEndpoint> endpoints = new ArrayList<>();
|
||||
|
||||
for (Map.Entry<EdgeUrl, RpcOutgoingRequest> urlAndReq : urlsAndReqs) {
|
||||
final EdgeUrl url = urlAndReq.getKey();
|
||||
final RpcOutgoingRequest outgoingRequest = urlAndReq.getValue();
|
||||
|
||||
final DomSampleClassification clazz = domSampleClassifier.classifyRequest(url);
|
||||
|
||||
requestSummary.merge(clazz, 1, Integer::sum);
|
||||
|
||||
endpoints.add(
|
||||
new RequestEndpoint(
|
||||
url.path + (url.param == null ? "" : "?" + url.param),
|
||||
outgoingRequest.getMethod().name(),
|
||||
clazz
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
@Nullable
|
||||
final DDGTDomain trackerData =
|
||||
ddgTrackerData
|
||||
.getDomainInfo(requestDomain.toString())
|
||||
.orElse(null);
|
||||
|
||||
requests.add(
|
||||
new RequestsForTargetDomain(
|
||||
requestDomain,
|
||||
endpoints,
|
||||
trackerData
|
||||
)
|
||||
);
|
||||
});
|
||||
|
||||
requests.sort(Comparator
|
||||
.comparing((RequestsForTargetDomain req) -> req.endpoints.getFirst().classification.ordinal())
|
||||
.thenComparing(req -> req.ownerDisplayName() == null)
|
||||
.thenComparing(req -> req.domain.topDomain)
|
||||
.thenComparing(req -> req.domain.toString()));
|
||||
|
||||
return new TrafficSample(domainName, requestSummary, requests);
|
||||
}
|
||||
|
||||
|
||||
public interface SiteInfoModel {
|
||||
String domain();
|
||||
}
|
||||
|
||||
public record Docs(String domain,
|
||||
long domainId,
|
||||
List<UrlDetails> results,
|
||||
@@ -395,10 +505,6 @@ public class SearchSiteInfoService {
|
||||
}
|
||||
}
|
||||
|
||||
public interface SiteInfoModel {
|
||||
String domain();
|
||||
}
|
||||
|
||||
public record SiteInfoWithContext(String domain,
|
||||
boolean isSubscribed,
|
||||
List<DbDomainQueries.DomainWithNode> siblingDomains,
|
||||
@@ -492,4 +598,108 @@ public class SearchSiteInfoService {
|
||||
}
|
||||
}
|
||||
|
||||
public record TrafficSample(String domain,
|
||||
boolean hasData,
|
||||
boolean serviceAvailable,
|
||||
Map<DomSampleClassification, Integer> requestSummary,
|
||||
List<RequestsForTargetDomain> requests) implements SiteInfoModel {
|
||||
|
||||
public static String classificationIcon(DomSampleClassification clazz) {
|
||||
return switch (clazz) {
|
||||
case ADS -> "fa-ad";
|
||||
case TRACKING -> "fa-crosshairs";
|
||||
case CONSENT -> "fa-shield-alt";
|
||||
default -> "";
|
||||
};
|
||||
}
|
||||
|
||||
public static String classificationColor(DomSampleClassification clazz) {
|
||||
return switch (clazz) {
|
||||
case ADS -> "bg-red-100 text-red-800 dark:bg-red-900 dark:text-white dark:border dark:border-red-400";
|
||||
case TRACKING -> "bg-purple-100 text-purple-800 dark:bg-purple-900 dark:text-white dark:border dark:border-purple-400";
|
||||
case CONSENT -> "bg-yellow-100 text-yellow-800 dark:bg-yellow-900 dark:text-white dark:border dark:border-yellow-400";
|
||||
default -> "";
|
||||
};
|
||||
}
|
||||
|
||||
public static String categoryColor(String category) {
|
||||
return switch (category) {
|
||||
case "Ad Motivated Tracking", "Tracking", "Advertising", "Third-Party Analytics Marketing", "Action Pixels", "Badge" -> "bg-red-100 text-red-800 dark:bg-red-900 dark:text-white dark:border dark:border-red-400";
|
||||
case "CDN", "Fraud Prevention", "Online Payment", "Consent Management Platform", "SSO" -> "bg-green-100 text-green-800 dark:bg-green-900 dark:text-white dark:border dark:border-green-400";
|
||||
case "Social - Comment", "Social - Share", "Social Network", "Federated Login" -> "bg-yellow-100 text-yellow-800 dark:bg-yellow-900 dark:text-white dark:border dark:border-yellow-400";
|
||||
case "Session Replay", "Audience Measurement", "Analytics", "Tag Manager" -> "bg-purple-100 text-purple-800 dark:bg-purple-900 dark:text-white dark:border dark:border-purple-400";
|
||||
case "Malware", "Ad Fraud", "Unknown High Risk Behavior", "Obscure Ownership" -> "bg-blue-100 text-blue-800 dark:bg-blue-900 dark:text-blue-200 dark:border dark:border-blue-400";
|
||||
default -> "bg-gray-200 text-gray-800 dark:bg-gray-600 dark:text-gray-200 dark:border dark:border-gray-200";
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
public TrafficSample(String domain,
|
||||
Map<DomSampleClassification, Integer> requestSummary,
|
||||
List<RequestsForTargetDomain> requests
|
||||
) {
|
||||
this(domain, true, true, requestSummary, requests);
|
||||
}
|
||||
|
||||
static TrafficSample forNoData(String domain) {
|
||||
return new TrafficSample(domain, false, true, Map.of(), List.of());
|
||||
}
|
||||
|
||||
static TrafficSample forServiceUnavailable(String domain) {
|
||||
return new TrafficSample(domain, true, false, Map.of(), List.of());
|
||||
}
|
||||
|
||||
|
||||
public record RequestEndpoint(String path,
|
||||
String method,
|
||||
DomSampleClassification classification) {
|
||||
|
||||
}
|
||||
|
||||
public record RequestsForTargetDomain(EdgeDomain domain, List<RequestEndpoint> endpoints, @Nullable DDGTDomain ddgtTrackerInfo)
|
||||
{
|
||||
public List<String> ownerCategories() {
|
||||
if (ddgtTrackerInfo == null) return List.of();
|
||||
if (ddgtTrackerInfo.categories() == null) return List.of();
|
||||
return ddgtTrackerInfo.categories();
|
||||
}
|
||||
|
||||
@Nullable
|
||||
public String ownerName() {
|
||||
if (ddgtTrackerInfo == null)
|
||||
return null;
|
||||
if (ddgtTrackerInfo.owner() == null)
|
||||
return null;
|
||||
return ddgtTrackerInfo.owner().name();
|
||||
}
|
||||
|
||||
@Nullable
|
||||
public String ownerDisplayName() {
|
||||
if (ddgtTrackerInfo == null)
|
||||
return null;
|
||||
if (ddgtTrackerInfo.owner() == null)
|
||||
return null;
|
||||
return ddgtTrackerInfo.owner().displayName();
|
||||
}
|
||||
|
||||
@Nullable
|
||||
public String ownerUrl() {
|
||||
if (ddgtTrackerInfo == null)
|
||||
return null;
|
||||
if (ddgtTrackerInfo.owner() == null)
|
||||
return null;
|
||||
return ddgtTrackerInfo.owner().url();
|
||||
}
|
||||
|
||||
@Nullable
|
||||
public String ownerPolicy() {
|
||||
if (ddgtTrackerInfo == null)
|
||||
return null;
|
||||
if (ddgtTrackerInfo.owner() == null)
|
||||
return null;
|
||||
return ddgtTrackerInfo.owner().privacyPolicy();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -14,3 +14,20 @@ as we sometimes generate classes from Java code or javascript!
|
||||
<div class="px-4 py-2 cursor-pointer dark:peer-checked:bg-gray-700 dark:hover:bg-gray-700 peer-checked:bg-gray-300 hover:bg-gray-300 w-full">
|
||||
</div>
|
||||
</label>
|
||||
|
||||
// case ADS -> "fa-ad";
|
||||
case TRACKING -> "fa-crosshairs";
|
||||
case CONSENT -> "fa-shield-alt";
|
||||
default -> "";
|
||||
};
|
||||
}
|
||||
|
||||
<i class="bg-red-100 text-red-800 dark:bg-red-900 dark:text-red-200 dark:border dark:border-red-400"></i>
|
||||
<i class="bg-purple-100 text-purple-800 dark:bg-purple-900 dark:text-purple-200 dark:border dark:border-purple-400"></i>
|
||||
<i class="bg-yellow-100 text-yellow-800 dark:bg-yellow-900 dark:text-yellow-200 dark:border dark:border-yellow-400"></i>
|
||||
|
||||
<i class="bg-red-100 text-red-800 dark:bg-red-900 dark:text-red-200 dark:border dark:border-red-400"></i>
|
||||
<i class="bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-200 dark:border dark:border-green-400"></i>
|
||||
<i class="bg-purple-100 text-purple-800 dark:bg-purple-900 dark:text-purple-200 dark:border dark:border-purple-400"></i>
|
||||
<i class="bg-blue-100 text-blue-800 dark:bg-blue-900 dark:text-blue-200 dark:border dark:border-blue-400"></i>
|
||||
<i class="bg-gray-200 text-gray-800 dark:bg-gray-600 dark:text-gray-200 dark:border dark:border-gray-200"></i>
|
||||
|
@@ -38,35 +38,41 @@
|
||||
<a href="https://old-search.marginalia.nu/" class="underline text-liteblue dark:text-blue-200">here</a>.
|
||||
</div>
|
||||
</div>
|
||||
<div class="mx-auto flex flex-col sm:flex-row my-4 sm:space-x-2 space-y-2 sm:space-y-0 w-full md:w-auto px-2">
|
||||
<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3">
|
||||
<div class="mx-auto flex flex-col sm:flex-row my-4 sm:space-x-2 space-y-2 sm:space-y-0 w-full md:w-auto px-2 items-center sm:items-stretch">
|
||||
<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-8 sm:p-4 space-y-3 w-96 sm:w-64">
|
||||
<div><i class="fas fa-sailboat mx-2 text-margeblue dark:text-slate-200"></i>Explore the Web</div>
|
||||
<ul class="list-disc ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
||||
<ul class="list-disc ml-8 sm:ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
||||
<li>Prioritizes non-commercial content</li>
|
||||
<li>Tools for both search and discovery</li>
|
||||
<li>Find lost old websites</li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3 ">
|
||||
<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-8 sm:p-4 space-y-3 w-96 sm:w-64">
|
||||
<div><i class="fas fa-hand-holding-hand mx-2 text-margeblue dark:text-slate-200"></i>Open Source</div>
|
||||
<ul class="list-disc ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
||||
<ul class="list-disc ml-8 sm:ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
||||
<li>Custom index and crawler software</li>
|
||||
<li>Simple technology -- no AI or cloud</li>
|
||||
<li>Simple technology, no AI</li>
|
||||
<li>AGPL license</li>
|
||||
</ul>
|
||||
<div class="text-xs text-liteblue dark:text-blue-200 pt-4">
|
||||
<i class="fas fa-link"></i>
|
||||
<div class="flex pt-4 gap-2">
|
||||
<div class="text-xs text-liteblue dark:text-blue-200">
|
||||
<i class="fa-brands fa-github"></i>
|
||||
<a href="https://git.marginalia.nu/" class="underline">Git Repository</a>
|
||||
</div>
|
||||
<div class="text-xs text-liteblue dark:text-blue-200">
|
||||
<i class="fa-brands fa-discord"></i>
|
||||
<a href="https://discord.gg/GgpkrVbF" class="underline">Project Discord</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3 ">
|
||||
<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-8 sm:p-4 space-y-3 w-96 sm:w-64">
|
||||
<div><i class="fas fa-lock mx-2 text-margeblue dark:text-slate-200"></i> Privacy by default</div>
|
||||
<ul class="list-disc ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
||||
<li>Filter out tracking and adtech</li>
|
||||
<li>No user or search data shared with 3rd parties</li>
|
||||
<li>No long-term retention of queries or IP addresses</li>
|
||||
<ul class="list-disc ml-8 sm:ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
||||
<li>Filter out tracking </li>
|
||||
<li>No data shared with 3rd parties</li>
|
||||
<li>No long-term retention of IPs</li>
|
||||
</ul>
|
||||
<div class="text-xs text-liteblue dark:text-blue-200 pt-4">
|
||||
<i class="fas fa-link"></i>
|
||||
|
@@ -16,7 +16,7 @@
|
||||
<header class="border-b border-gray-300 dark:border-gray-600 bg-white dark:bg-gray-800 shadow-md">
|
||||
<div class="max-w-[1400px] mx-auto p-4">
|
||||
<div class="flex place-items-baseline space-x-2">
|
||||
<span class="text-gray-900 dark:text-white text-md font-mono rounded-sm block p-2.5">
|
||||
<span class="text-gray-900 dark:text-white break-none text-sm sm:text-md font-mono rounded-sm block p-2.5">
|
||||
${model.domain()}
|
||||
</span>
|
||||
<span class="grow"></span>
|
||||
@@ -57,8 +57,8 @@
|
||||
</div>
|
||||
<div class="mx-auto md:px-4 border dark:border-gray-600 bg-slate-50 dark:bg-gray-600">
|
||||
<div class="flex md:space-x-2 max-w-[1000px] mx-auto">
|
||||
<div class="has-[:checked]:bg-slate-200 dark:has-[:checked]:bg-slate-800 py-1 sm:px-2 px-1">
|
||||
<a href="?view=info" class="whitespace-nowrap place-items-baseline space-x-1 text-gray-700 dark:text-white text-sm hover:text-gray-900 dark:hover:text-gray-200">
|
||||
<div class="has-[:checked]:bg-slate-200 dark:has-[:checked]:bg-slate-800 py-1 px-2">
|
||||
<a href="?view=info" class="whitespace-nowrap place-items-baseline space-x-1 text-gray-700 dark:text-white text-xs sm:text-sm hover:text-gray-900 dark:hover:text-gray-200">
|
||||
@if (model instanceof SearchSiteInfoService.SiteInfoWithContext)
|
||||
<input type="checkbox" class="sr-only hidden " checked readonly />
|
||||
@else
|
||||
@@ -71,8 +71,8 @@
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<div class="has-[:checked]:bg-slate-200 dark:has-[:checked]:bg-slate-800 py-1 sm:px-2 px-1">
|
||||
<a href="?view=docs" class="whitespace-nowrap place-items-baseline space-x-1 text-gray-700 dark:text-white text-sm hover:text-gray-900 dark:hover:text-gray-200">
|
||||
<div class="has-[:checked]:bg-slate-200 dark:has-[:checked]:bg-slate-800 py-1 px-2">
|
||||
<a href="?view=docs" class="whitespace-nowrap place-items-baseline space-x-1 text-gray-700 dark:text-white text-xs sm:text-sm hover:text-gray-900 dark:hover:text-gray-200">
|
||||
@if (model instanceof SearchSiteInfoService.Docs)
|
||||
<input type="checkbox" class="sr-only hidden absolute" checked readonly />
|
||||
@else
|
||||
@@ -81,12 +81,13 @@
|
||||
|
||||
<i class="fa-regular fa-file"></i>
|
||||
|
||||
<span>Documents</span>
|
||||
<span class="hidden sm:inline">Documents</span>
|
||||
<span class="inline sm:hidden">Docs</span>
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<div class="has-[:checked]:bg-slate-200 dark:has-[:checked]:bg-slate-800 py-1 sm:px-2 px-1">
|
||||
<a href="?view=links" class="whitespace-nowrap place-items-baseline space-x-1 text-gray-700 dark:text-white text-sm hover:text-gray-900 dark:hover:text-gray-200">
|
||||
<div class="has-[:checked]:bg-slate-200 dark:has-[:checked]:bg-slate-800 py-1 px-2">
|
||||
<a href="?view=links" class="whitespace-nowrap place-items-baseline space-x-1 text-gray-700 dark:text-white text-xs sm:text-sm hover:text-gray-900 dark:hover:text-gray-200">
|
||||
@if (model instanceof SearchSiteInfoService.Backlinks)
|
||||
<input type="checkbox" class="sr-only hidden absolute" checked readonly />
|
||||
@else
|
||||
@@ -95,12 +96,27 @@
|
||||
|
||||
<i class="fas fa-link"></i>
|
||||
|
||||
<span>Backlinks</span>
|
||||
<span class="hidden sm:inline">Backlinks</span>
|
||||
<span class="inline sm:hidden">Links</span>
|
||||
</a>
|
||||
</div>
|
||||
<div class="has-[:checked]:bg-slate-200 dark:has-[:checked]:bg-slate-800 py-1 px-2">
|
||||
<a href="?view=traffic" class="whitespace-nowrap place-items-baseline space-x-1 text-gray-700 dark:text-white text-xs sm:text-sm hover:text-gray-900 dark:hover:text-gray-200">
|
||||
@if (model instanceof SearchSiteInfoService.TrafficSample)
|
||||
<input type="checkbox" class="sr-only hidden absolute" checked readonly />
|
||||
@else
|
||||
<span></span>
|
||||
@endif
|
||||
|
||||
<i class="fas fa-crosshairs"></i>
|
||||
|
||||
<span class="hidden sm:inline">Requests</span>
|
||||
<span class="inline sm:hidden">Reqs</span>
|
||||
</a>
|
||||
</div>
|
||||
<div class="grow"></div>
|
||||
<div class="has-[:checked]:bg-slate-200 dark:has-[:checked]:bg-slate-800 py-1 sm:px-2 px-1">
|
||||
<a href="?view=report" class="text-sm whitespace-nowrap place-items-baseline space-x-1 text-red-800 dark:text-red-200 text-sm hover:text-red-600 dark:hover:text-red-300">
|
||||
<div class="has-[:checked]:bg-slate-200 dark:has-[:checked]:bg-slate-800 py-1 px-2">
|
||||
<a href="?view=report" class="text-sm whitespace-nowrap place-items-baseline space-x-1 text-red-800 dark:text-red-200 text-xs sm:text-sm hover:text-red-600 dark:hover:text-red-300">
|
||||
@if (model instanceof SearchSiteInfoService.ReportDomain)
|
||||
<input type="checkbox" class="sr-only hidden absolute" checked readonly />
|
||||
@else
|
||||
@@ -126,6 +142,8 @@
|
||||
@template.siteinfo.view.backlinks(backlinks = backlinks)
|
||||
@elseif (model instanceof SearchSiteInfoService.Docs docs)
|
||||
@template.siteinfo.view.docs(docs = docs)
|
||||
@elseif (model instanceof SearchSiteInfoService.TrafficSample report)
|
||||
@template.siteinfo.view.traffic(report = report)
|
||||
@endif
|
||||
|
||||
</div>
|
||||
|
@@ -148,7 +148,6 @@
|
||||
</form>
|
||||
@endif
|
||||
|
||||
|
||||
@if (!siteInfo.siblingDomains().isEmpty())
|
||||
<div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
|
||||
<i class="fas fa-globe"></i>
|
||||
|
@@ -0,0 +1,179 @@
|
||||
@import nu.marginalia.domclassifier.DomSampleClassification
|
||||
@import nu.marginalia.search.svc.SearchSiteInfoService.*
|
||||
@import nu.marginalia.search.svc.SearchSiteInfoService.TrafficSample.RequestsForTargetDomain
|
||||
|
||||
@param TrafficSample report
|
||||
|
||||
<!-- Main content -->
|
||||
|
||||
<div class="flex flex-col space-y-2 w-full">
|
||||
<div class="flex flex-col space-y-4 my-4">
|
||||
@if (!report.serviceAvailable())
|
||||
<div class="border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white overflow-hidden mx-2 text-gray-800 text-sm">
|
||||
<div class="flex place-items-center space-x-2 p-2 text-md border-b dark:border-gray-600 bg-margeblue text-white">
|
||||
<span>Third-Party Requests</span>
|
||||
</div>
|
||||
<div class="p-4">
|
||||
This service is currently being relentlessly scraped by bots and access
|
||||
is disabled until they give up.
|
||||
</div>
|
||||
</div>
|
||||
@elseif (!report.hasData())
|
||||
<div class="border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white overflow-hidden mx-2 text-gray-800 text-sm">
|
||||
<div class="flex place-items-center space-x-2 p-2 text-md border-b dark:border-gray-600 bg-margeblue text-white">
|
||||
<span>Third-Party Requests</span>
|
||||
</div>
|
||||
<div class="p-4">
|
||||
The database of third party requests is still being assembled, and the
|
||||
search engine doesn't yet have any information about <span class="inline font-mono text-pink-800 dark:text-pink-200">${report.domain()}</span>.
|
||||
<p class="mt-4"></p>
|
||||
Be patient. Several million websites need to be visited and assessed,
|
||||
each visit taking up to 30 seconds. At the current rate, it is expected
|
||||
the full database will be complete around the end of 2025, or early 2026.
|
||||
</div>
|
||||
</div>
|
||||
@else
|
||||
<div class="border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white overflow-hidden mx-2 text-gray-800 text-sm">
|
||||
<div class="flex place-items-center space-x-2 p-2 text-md border-b dark:border-gray-600 bg-margeblue text-white">
|
||||
<span>Third-Party Requests</span>
|
||||
</div>
|
||||
<div class="p-4">
|
||||
To better understand what <span class="inline font-mono text-pink-800 dark:text-pink-200">${report.domain()}</span> is doing
|
||||
in the background as you visit the website, the search engine records which third-party servers it talks to.
|
||||
<p class="mt-2"></p>
|
||||
To help make sense of the recorded network traffic, the report is supplemented with information from
|
||||
<a href="https://github.com/duckduckgo/tracker-radar/" class="text-blue-800 dark:text-blue-200 underline" rel="external">DuckDuckGo's Tracker Radar</a>,
|
||||
subject to the CC BY-NC-SA 4.0 license.
|
||||
<details class="mt-2">
|
||||
<summary class="text-gray-600 hover:text-gray-700 dark:text-gray-400 hover:dark:text-gray-300 cursor-pointer select-none">
|
||||
Learn More
|
||||
</summary>
|
||||
<p class="mt-2">
|
||||
The search engine classifies third party requests into four buckets, based on their apparent purpose.
|
||||
</p>
|
||||
<p class="mt-2">
|
||||
<span class="text-red-600 dark:text-red-400"><i class="fa fa-ad"></i> Advertisement</span> requests are involved in the bidding or display of advertisements, or the tracking
|
||||
of ad impressions. They do not guarantee ads will be present on the website, as the advertisement
|
||||
broker may decide it's not economic to place an ad for any particular visitor, but it is on the other hand virtually
|
||||
impossible for ads to be present if this type of activity is not found.
|
||||
</p>
|
||||
<p class="mt-2">
|
||||
<span class="text-purple-600 dark:text-purple-400"><i class="fa fa-crosshairs"></i> Tracking</span> requests analyze user behavior on the web, sometimes with the purpose of building a profile
|
||||
for advertisement using cookies or browser fingerprinting technologies, other times the traffic exists only to help understand what visitors are doing on a website
|
||||
for the benefit of the webmasters.
|
||||
</p>
|
||||
<p class="mt-2">
|
||||
<span class="text-orange-600 dark:text-orange-400"><i class="fa fa-shield-alt"></i> Consent</span> requests manage GDPR or cookie consent popups, and similar nuisances.
|
||||
In general, tracking and advertisement scripts are not run until a consent popup is dismissed. The system will try to automatically
|
||||
agree to tracking consent popups when it can identify them in order to also capture these deferred requests, but this is not always successful,
|
||||
so the presence of consent requests alone is a weak indicator a website may intend to load tracking or advertisement scripts.
|
||||
</p>
|
||||
<p class="mt-2">
|
||||
<span class="text-gray-600 dark:text-gray-400"><i class="fa fa-question-circle"></i> Unclassified</span> requests are requests the system doesn't know what they are. Often these are
|
||||
requests to content-delivery networks intended to reduce the network traffic to the server hosting the website and speed up page loads.
|
||||
</p>
|
||||
|
||||
<p class="mt-2"></p>
|
||||
This data is continuously updated, but updates are fairly
|
||||
slow so the information may not be fully up to date.
|
||||
</details>
|
||||
</div>
|
||||
</div>
|
||||
@endif
|
||||
|
||||
</div>
|
||||
|
||||
@if (report.hasData())
|
||||
<div class="mx-2">
|
||||
<div class="flex place-items-center space-x-2 p-2 text-md border-b dark:border-gray-600 bg-margeblue text-white rounded border mb-4">
|
||||
<span>Summary</span>
|
||||
</div>
|
||||
<!-- Summary Stats -->
|
||||
<div class="grid grid-cols-4 gap-4 mb-8">
|
||||
<div class="bg-white rounded p-4 shadow-sm border dark:bg-gray-800 dark:border-gray-600 place-items-center">
|
||||
<div class="text-2xl font-bold text-red-600 dark:text-red-400">${report.requestSummary().getOrDefault(DomSampleClassification.ADS, 0)}</div>
|
||||
<div class="text-sm text-gray-600 dark:text-gray-400">Ads</div>
|
||||
</div>
|
||||
<div class="bg-white rounded p-4 shadow-sm border dark:bg-gray-800 dark:border-gray-600 place-items-center">
|
||||
<div class="text-2xl font-bold text-purple-600 dark:text-purple-400">${report.requestSummary().getOrDefault(DomSampleClassification.TRACKING, 0)}</div>
|
||||
<div class="text-sm text-gray-600 dark:text-gray-400">Tracking</div>
|
||||
</div>
|
||||
<div class="bg-white rounded p-4 shadow-sm border dark:bg-gray-800 dark:border-gray-600 place-items-center">
|
||||
<div class="text-2xl font-bold text-orange-600 dark:text-orange-400">${report.requestSummary().getOrDefault(DomSampleClassification.CONSENT, 0)}</div>
|
||||
<div class="text-sm text-gray-600 dark:text-gray-400">Consent</div>
|
||||
</div>
|
||||
<div class="bg-white rounded p-4 shadow-sm border dark:bg-gray-800 dark:border-gray-600 place-items-center">
|
||||
<div class="text-2xl font-bold text-gray-600 dark:text-gray-400">${report.requestSummary().getOrDefault(DomSampleClassification.UNCLASSIFIED, 0)}</div>
|
||||
<div class="text-sm text-gray-600 dark:text-gray-400">Other</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<!-- Domain Groups -->
|
||||
<div class="space-y-4 mx-2">
|
||||
<div class="flex place-items-center space-x-2 p-2 text-md border-b dark:border-gray-600 bg-margeblue text-white rounded border">
|
||||
<span>Breakdown</span>
|
||||
</div>
|
||||
@if (report.requests().isEmpty())
|
||||
<div class="border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-2 text-gray-800 text-sm">
|
||||
No third-party requests were made!
|
||||
</div>
|
||||
@endif
|
||||
@for (RequestsForTargetDomain request : report.requests())
|
||||
<!-- Google Analytics Domain -->
|
||||
<div class="bg-white rounded shadow-sm border border-gray-200 dark:bg-gray-800 dark:border-gray-600">
|
||||
<div class="p-2 md:p-6 border-b border-gray-100 dark:border-gray-600">
|
||||
<div class="flex items-start justify-between flex-col md:flex-row gap-2">
|
||||
<div class="flex-1">
|
||||
|
||||
<h3 class="text-lg font-semibold dark:text-gray-100 text-gray-900 font-mono">${request.domain().toString()}</h3>
|
||||
|
||||
@if (request.ownerDisplayName() != null)
|
||||
<p class="text-sm text-gray-600 dark:text-gray-400 mt-1">${request.ownerDisplayName()}</p>
|
||||
@elseif (request.ownerName() != null)
|
||||
<p class="text-sm text-gray-600 dark:text-gray-400 mt-1">${request.ownerName()}</p>
|
||||
@endif
|
||||
<div class="flex items-center gap-4 mt-3">
|
||||
@if (request.ownerUrl() != null)
|
||||
<a href="${request.ownerUrl()}" rel="external nofollow" class="text-blue-600 dark:text-blue-200 text-sm flex flex-row place-items-baseline gap-1">
|
||||
<i class="fas fa-external-link-alt text-xs"></i> Visit Site
|
||||
</a>
|
||||
@endif
|
||||
@if (request.ownerPolicy() != null)
|
||||
<a href="${request.ownerPolicy()}" rel="external nofollow" class="text-blue-600 dark:text-blue-200 text-sm flex flex-row place-items-baseline gap-1">
|
||||
<i class="fas fa-shield-alt text-xs"></i> Privacy Policy
|
||||
</a>
|
||||
@endif
|
||||
</div>
|
||||
</div>
|
||||
<div class="flex flex-wrap justify-end gap-2 md:ml-2">
|
||||
@for (String tag : request.ownerCategories())
|
||||
<span class="px-2 py-1 ${TrafficSample.categoryColor(tag)} text-xs rounded">${tag}</span>
|
||||
@endfor
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="p-4">
|
||||
<div class="space-y-3">
|
||||
@for (var req : request.endpoints())
|
||||
<div class="flex items-center justify-between py-2 px-3 bg-gray-100 dark:bg-gray-600 rounded-lg">
|
||||
<div class="flex items-center gap-3">
|
||||
<div class="text-xs text-gray-500 dark:text-gray-100 font-mono">${req.method()}</div>
|
||||
<span class="text-sm text-gray-600 dark:text-white font-mono break-all">${req.path()}</span>
|
||||
</div>
|
||||
@if (req.classification() != DomSampleClassification.UNCLASSIFIED)
|
||||
<span class="px-2 py-1 bg-orange-100 text-orange-800 text-xs rounded flex flex-row place-items-baseline gap-1 ${TrafficSample.classificationColor(req.classification())}">
|
||||
<i class="fa ${TrafficSample.classificationIcon(req.classification())}"></i> ${req.classification().name()}</span>
|
||||
@endif
|
||||
</div>
|
||||
@endfor
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
@endfor
|
||||
</div>
|
||||
@endif
|
||||
</div>
|
@@ -87,6 +87,7 @@ public class JtePaperDoll {
|
||||
"results", ret)
|
||||
)
|
||||
);
|
||||
|
||||
Spark.get("/site-info",
|
||||
(rq, rs) -> {
|
||||
if ("links".equals(rq.queryParams("view"))) {
|
||||
@@ -98,6 +99,9 @@ public class JtePaperDoll {
|
||||
else if ("report".equals(rq.queryParams("view"))) {
|
||||
return MockedSearchResults.mockReportDomain();
|
||||
}
|
||||
else if ("traffic".equals(rq.queryParams("view"))) {
|
||||
return MockedSearchResults.mockTrafficReport();
|
||||
}
|
||||
else return MockedSearchResults.mockSiteInfoData();
|
||||
|
||||
},
|
||||
|
@@ -7,6 +7,9 @@ import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||
import nu.marginalia.browse.model.BrowseResult;
|
||||
import nu.marginalia.browse.model.BrowseResultSet;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.ddtrackergradar.model.DDGTDomain;
|
||||
import nu.marginalia.ddtrackergradar.model.DDGTOwner;
|
||||
import nu.marginalia.domclassifier.DomSampleClassification;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
@@ -19,6 +22,7 @@ import nu.marginalia.search.svc.SearchSiteInfoService;
|
||||
import java.net.URISyntaxException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ThreadLocalRandom;
|
||||
|
||||
public class MockedSearchResults {
|
||||
@@ -271,4 +275,49 @@ public class MockedSearchResults {
|
||||
List.of(mockUrlDetails("https://www.example.com/some-incredibly-long-address-that-goes-on-and-on", "One document")),
|
||||
List.of(mockUrlDetails("https://other.example.com/", "Other document")));
|
||||
}
|
||||
|
||||
public static Object mockTrafficReport() {
|
||||
List<SearchSiteInfoService.TrafficSample.RequestsForTargetDomain> requests = new ArrayList<>();
|
||||
requests.add(new SearchSiteInfoService.TrafficSample.RequestsForTargetDomain(
|
||||
new EdgeDomain("hotjar.com"),
|
||||
List.of(new SearchSiteInfoService.TrafficSample.RequestEndpoint("/foo.js", "POST", DomSampleClassification.TRACKING)),
|
||||
new DDGTDomain(
|
||||
"hotjar.com",
|
||||
new DDGTOwner("Hotjar Ltd", "Hotjar", "https://www.example.com/", "https://www.hotjar.com/"),
|
||||
List.of("Tracking", "Session Replay"),
|
||||
List.of()
|
||||
)
|
||||
));
|
||||
requests.add(new SearchSiteInfoService.TrafficSample.RequestsForTargetDomain(
|
||||
new EdgeDomain("doubleclick.net"),
|
||||
List.of(new SearchSiteInfoService.TrafficSample.RequestEndpoint("/foo.js", "GET", DomSampleClassification.TRACKING),
|
||||
new SearchSiteInfoService.TrafficSample.RequestEndpoint("/bar.js", "GET", DomSampleClassification.TRACKING)),
|
||||
new DDGTDomain(
|
||||
"doubleclick.net",
|
||||
new DDGTOwner("Doubleclick Inc", "Doubleclick", "https://www.example.com/", "https://www.hotjar.com/"),
|
||||
List.of("CDN", "Advertising"),
|
||||
List.of()
|
||||
)
|
||||
));
|
||||
requests.add(new SearchSiteInfoService.TrafficSample.RequestsForTargetDomain(
|
||||
new EdgeDomain("sketchy.org"),
|
||||
List.of(new SearchSiteInfoService.TrafficSample.RequestEndpoint("/foo.js", "GET", DomSampleClassification.ADS),
|
||||
new SearchSiteInfoService.TrafficSample.RequestEndpoint("/bar.js", "GET", DomSampleClassification.CONSENT)),
|
||||
new DDGTDomain(
|
||||
"sketchy.org",
|
||||
new DDGTOwner("Doubious AB", "Legit Enterprises", "https://www.example.com/", "https://www.hotjar.com/"),
|
||||
List.of("Malware", "Social - Comment"),
|
||||
List.of()
|
||||
)
|
||||
));
|
||||
return new SearchSiteInfoService.TrafficSample(
|
||||
"example.com",
|
||||
Map.of(
|
||||
DomSampleClassification.ADS, 3,
|
||||
DomSampleClassification.TRACKING, 10
|
||||
),
|
||||
requests
|
||||
);
|
||||
|
||||
}
|
||||
}
|
||||
|
@@ -5,6 +5,7 @@ import com.google.inject.Inject;
|
||||
import io.jooby.Context;
|
||||
import io.jooby.Jooby;
|
||||
import nu.marginalia.assistant.suggest.Suggestions;
|
||||
import nu.marginalia.domsample.DomSampleGrpcService;
|
||||
import nu.marginalia.domsample.DomSampleService;
|
||||
import nu.marginalia.functions.domains.DomainInfoGrpcService;
|
||||
import nu.marginalia.functions.math.MathGrpcService;
|
||||
@@ -22,7 +23,6 @@ import java.util.List;
|
||||
public class AssistantService extends JoobyService {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final Gson gson = GsonFactory.get();
|
||||
@org.jetbrains.annotations.NotNull
|
||||
private final ScreenshotService screenshotService;
|
||||
private final Suggestions suggestions;
|
||||
|
||||
@@ -32,6 +32,7 @@ public class AssistantService extends JoobyService {
|
||||
DomainInfoGrpcService domainInfoGrpcService,
|
||||
LiveCaptureGrpcService liveCaptureGrpcService,
|
||||
DomSampleService domSampleService,
|
||||
DomSampleGrpcService domSampleGrpcService,
|
||||
FeedsGrpcService feedsGrpcService,
|
||||
MathGrpcService mathGrpcService,
|
||||
Suggestions suggestions)
|
||||
@@ -41,7 +42,9 @@ public class AssistantService extends JoobyService {
|
||||
List.of(domainInfoGrpcService,
|
||||
mathGrpcService,
|
||||
liveCaptureGrpcService,
|
||||
feedsGrpcService),
|
||||
feedsGrpcService,
|
||||
domSampleGrpcService
|
||||
),
|
||||
List.of());
|
||||
|
||||
this.screenshotService = screenshotService;
|
||||
|
@@ -36,6 +36,7 @@ dependencies {
|
||||
implementation project(':code:common:linkdb')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:functions:live-capture:api')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.bundles.grpc
|
||||
|
@@ -9,6 +9,7 @@ import gnu.trove.list.array.TIntArrayList;
|
||||
import nu.marginalia.IndexLocations;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.api.domsample.DomSampleClient;
|
||||
import nu.marginalia.db.DomainTypes;
|
||||
import nu.marginalia.index.domainrankings.DomainRankings;
|
||||
import nu.marginalia.index.journal.IndexJournalSlopWriter;
|
||||
@@ -37,6 +38,7 @@ import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Random;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
|
||||
import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME;
|
||||
import static nu.marginalia.linkdb.LinkdbFileNames.DOMAIN_LINKS_FILE_NAME;
|
||||
@@ -85,6 +87,9 @@ public class IntegrationTestModule extends AbstractModule {
|
||||
bind(FileStorageService.class).toInstance(fileStorageServiceMock);
|
||||
bind(ServiceHeartbeat.class).toInstance(new FakeServiceHeartbeat());
|
||||
bind(ProcessHeartbeat.class).toInstance(new FakeProcessHeartbeat());
|
||||
DomSampleClient domSampleClientMock = Mockito.mock(DomSampleClient.class);
|
||||
when(domSampleClientMock.getSampleAsync(any(), any())).thenReturn(CompletableFuture.failedFuture(new RuntimeException()));
|
||||
bind(DomSampleClient.class).toInstance(domSampleClientMock);
|
||||
|
||||
SearchSetsService setsServiceMock = Mockito.mock(SearchSetsService.class);
|
||||
when(setsServiceMock.getSearchSetByName("NONE")).thenReturn(new SearchSetAny());
|
||||
|
@@ -11,3 +11,4 @@
|
||||
2025-05-17: Redeploy all.
|
||||
2025-05-28: Deploy assistant and browserless.
|
||||
2025-06-06: Deploy assistant and browserless.
|
||||
2025-07-21: Deploy executor partition 1.
|
@@ -83,6 +83,7 @@ include 'code:common:renderer'
|
||||
include 'code:processes:converting-process'
|
||||
include 'code:processes:converting-process:model'
|
||||
include 'code:processes:converting-process:ft-keyword-extraction'
|
||||
include 'code:processes:converting-process:ft-dom-classifier'
|
||||
|
||||
include 'code:processes:crawling-process'
|
||||
include 'code:processes:crawling-process:model'
|
||||
@@ -159,7 +160,7 @@ dependencyResolutionManagement {
|
||||
library('protobuf', 'com.google.protobuf', 'protobuf-java').version('3.16.3')
|
||||
library('grpc-protobuf', 'io.grpc', 'grpc-protobuf').version('1.73.0')
|
||||
library('grpc-stub', 'io.grpc', 'grpc-stub').version('1.73.0')
|
||||
library('grpc-netty', 'io.grpc', 'grpc-netty-shaded').version('1.73.0')
|
||||
library('grpc-netty', 'io.grpc', 'grpc-netty').version('1.73.0')
|
||||
|
||||
library('prometheus', 'io.prometheus', 'simpleclient').version('0.16.0')
|
||||
library('prometheus-servlet', 'io.prometheus', 'simpleclient_servlet').version('0.16.0')
|
||||
|
Reference in New Issue
Block a user