1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

4 Commits

Author SHA1 Message Date
Viktor Lofgren
a771a5b6ce (sample) Test different approach to decoding 2025-07-21 14:19:01 +02:00
Viktor Lofgren
dac5b54128 (sample) Better logging for sample errors 2025-07-21 14:03:58 +02:00
Viktor Lofgren
6cfb143c15 (sample) Compress sample HTML data and introduce new API for only getting requests 2025-07-21 13:55:25 +02:00
Viktor Lofgren
23c818281b (converter) Reduce DomSample logging for NOT_FOUND 2025-07-21 13:37:55 +02:00
9 changed files with 98 additions and 11 deletions

View File

@@ -47,6 +47,21 @@ public class DomSampleClient {
} }
} }
/**
 * Fetches the outgoing-request portion of the DOM sample for a domain via the
 * {@code getSampleRequests} RPC.
 *
 * @param domainName the domain name sent to the service in an {@code RpcDomainName}
 * @return the service response, or {@link Optional#empty()} if the call failed with a
 *         {@link StatusRuntimeException}; failures other than NOT_FOUND are logged
 */
public Optional<RpcDomainSampleRequests> getSampleRequests(String domainName) {
    try {
        var val = channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::getSampleRequests)
                .run(RpcDomainName.newBuilder().setDomainName(domainName).build());
        return Optional.of(val);
    }
    catch (StatusRuntimeException sre) {
        // Compare status *codes*, not Status instances: Status uses identity equality, and a
        // status that carries a description (e.g. "No sample found") is a different object
        // from the Status.NOT_FOUND constant, so a reference comparison never matches.
        if (sre.getStatus().getCode() != Status.Code.NOT_FOUND) {
            logger.error("Failed to fetch DOM sample", sre);
        }
        return Optional.empty();
    }
}
public boolean hasSample(String domainName) { public boolean hasSample(String domainName) {
try { try {
return channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::hasSample) return channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::hasSample)

View File

@@ -7,6 +7,7 @@ option java_multiple_files=true;
service DomSampleApi { service DomSampleApi {
rpc getSample(RpcDomainName) returns (RpcDomainSample) {} rpc getSample(RpcDomainName) returns (RpcDomainSample) {}
rpc getSampleRequests(RpcDomainName) returns (RpcDomainSampleRequests) {}
rpc hasSample(RpcDomainName) returns (RpcBooleanRsp) {} rpc hasSample(RpcDomainName) returns (RpcBooleanRsp) {}
rpc getAllSamples(RpcDomainName) returns (stream RpcDomainSample) {} rpc getAllSamples(RpcDomainName) returns (stream RpcDomainSample) {}
} }
@@ -19,10 +20,16 @@ message RpcBooleanRsp {
bool answer = 1; bool answer = 1;
} }
// Response for getSampleRequests: the outgoing requests recorded for a domain sample,
// without the HTML payload. Field numbers 1, 2 and 5 match the same-named fields of
// RpcDomainSample; numbers 3 and 4 are not used in this message.
message RpcDomainSampleRequests {
// Domain name the sample belongs to
string domainName = 1;
// URL associated with the sample
string url = 2;
// Outgoing requests recorded for the sample
repeated RpcOutgoingRequest outgoingRequests = 5;
}
message RpcDomainSample { message RpcDomainSample {
string domainName = 1; string domainName = 1;
string url = 2; string url = 2;
string htmlSample = 3; bytes htmlSampleZstd = 3;
bool accepted_popover = 4; bool accepted_popover = 4;
repeated RpcOutgoingRequest outgoingRequests = 5; repeated RpcOutgoingRequest outgoingRequests = 5;
} }

View File

@@ -31,6 +31,7 @@ dependencies {
implementation libs.jsoup implementation libs.jsoup
implementation libs.opencsv implementation libs.opencsv
implementation libs.slop implementation libs.slop
implementation libs.zstd
implementation libs.sqlite implementation libs.sqlite
implementation libs.bundles.slf4j implementation libs.bundles.slf4j
implementation libs.commons.lang3 implementation libs.commons.lang3

View File

@@ -1,6 +1,8 @@
package nu.marginalia.domsample; package nu.marginalia.domsample;
import com.github.luben.zstd.Zstd;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.protobuf.ByteString;
import io.grpc.Status; import io.grpc.Status;
import io.grpc.stub.StreamObserver; import io.grpc.stub.StreamObserver;
import nu.marginalia.api.domsample.*; import nu.marginalia.api.domsample.*;
@@ -9,6 +11,7 @@ import nu.marginalia.service.server.DiscoverableService;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.nio.charset.StandardCharsets;
import java.util.List; import java.util.List;
public class DomSampleGrpcService public class DomSampleGrpcService
@@ -42,7 +45,36 @@ public class DomSampleGrpcService
} }
// Grab the first sample // Grab the first sample
RpcDomainSample.Builder response = convert(dbRecords.getFirst()); RpcDomainSample.Builder response = convertFullSample(dbRecords.getFirst());
responseObserver.onNext(response.build());
responseObserver.onCompleted();
}
catch (Exception e) {
logger.error("Error in getSample()", e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@Override
public void getSampleRequests(RpcDomainName request, StreamObserver<RpcDomainSampleRequests> responseObserver) {
String domainName = request.getDomainName();
if (domainName.isBlank()) {
responseObserver.onError(Status.INVALID_ARGUMENT
.withDescription("Invalid domain name")
.asRuntimeException());
return;
}
try {
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
if (dbRecords.isEmpty()) {
responseObserver.onError(Status.NOT_FOUND.withDescription("No sample found").asRuntimeException());
return;
}
// Grab the first sample
RpcDomainSampleRequests.Builder response = convertRequestData(dbRecords.getFirst());
responseObserver.onNext(response.build()); responseObserver.onNext(response.build());
responseObserver.onCompleted(); responseObserver.onCompleted();
@@ -87,7 +119,7 @@ public class DomSampleGrpcService
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName); List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
for (var record : dbRecords) { for (var record : dbRecords) {
responseObserver.onNext(convert(record).build()); responseObserver.onNext(convertFullSample(record).build());
} }
responseObserver.onCompleted(); responseObserver.onCompleted();
@@ -98,12 +130,14 @@ public class DomSampleGrpcService
} }
} }
private RpcDomainSample.Builder convert(DomSampleDb.Sample dbSample) { private RpcDomainSample.Builder convertFullSample(DomSampleDb.Sample dbSample) {
ByteString htmlZstd = ByteString.copyFrom(Zstd.compress(dbSample.sample().getBytes(StandardCharsets.UTF_8)));
var sampleBuilder = RpcDomainSample.newBuilder() var sampleBuilder = RpcDomainSample.newBuilder()
.setDomainName(dbSample.domain()) .setDomainName(dbSample.domain())
.setAcceptedPopover(dbSample.acceptedPopover()) .setAcceptedPopover(dbSample.acceptedPopover())
.setHtmlSample(dbSample.sample()); .setHtmlSampleZstd(htmlZstd);
for (var req : dbSample.parseRequests()) { for (var req : dbSample.parseRequests()) {
sampleBuilder.addOutgoingRequestsBuilder() sampleBuilder.addOutgoingRequestsBuilder()
@@ -120,4 +154,23 @@ public class DomSampleGrpcService
return sampleBuilder; return sampleBuilder;
} }
/**
 * Builds the requests-only RPC representation of a stored sample: the domain name plus
 * one outgoing-request entry (URL, method, timestamp) per parsed request.
 *
 * @param dbSample the database sample to convert
 * @return a builder populated with the domain name and outgoing requests
 */
private RpcDomainSampleRequests.Builder convertRequestData(DomSampleDb.Sample dbSample) {
    RpcDomainSampleRequests.Builder requestsBuilder = RpcDomainSampleRequests.newBuilder();
    requestsBuilder.setDomainName(dbSample.domain());

    for (var outgoing : dbSample.parseRequests()) {
        var entry = requestsBuilder.addOutgoingRequestsBuilder();
        entry.setUrl(outgoing.uri().toString());

        // Anything that is not GET or POST is reported as OTHER
        String verb = outgoing.method().toUpperCase();
        RpcOutgoingRequest.RequestMethod rpcMethod;
        if ("GET".equals(verb)) {
            rpcMethod = RpcOutgoingRequest.RequestMethod.GET;
        }
        else if ("POST".equals(verb)) {
            rpcMethod = RpcOutgoingRequest.RequestMethod.POST;
        }
        else {
            rpcMethod = RpcOutgoingRequest.RequestMethod.OTHER;
        }
        entry.setMethod(rpcMethod);

        entry.setTimestamp(outgoing.timestamp());
    }

    return requestsBuilder;
}
} }

View File

@@ -90,6 +90,7 @@ dependencies {
implementation libs.commons.lang3 implementation libs.commons.lang3
implementation libs.commons.compress implementation libs.commons.compress
implementation libs.sqlite implementation libs.sqlite
implementation libs.bundles.grpc
implementation libs.bundles.httpcomponents implementation libs.bundles.httpcomponents

View File

@@ -22,6 +22,7 @@ dependencies {
implementation libs.bundles.slf4j implementation libs.bundles.slf4j
implementation libs.guava implementation libs.guava
implementation libs.zstd
implementation dependencies.create(libs.guice.get()) { implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava' exclude group: 'com.google.guava'
} }

View File

@@ -1,5 +1,6 @@
package nu.marginalia.domclassifier; package nu.marginalia.domclassifier;
import com.github.luben.zstd.ZstdInputStream;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import nu.marginalia.api.domsample.RpcDomainSample; import nu.marginalia.api.domsample.RpcDomainSample;
@@ -16,9 +17,11 @@ import org.xml.sax.SAXException;
import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.ParserConfigurationException;
import java.io.ByteArrayInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.*; import java.util.*;
import java.util.function.Predicate; import java.util.function.Predicate;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@@ -84,8 +87,9 @@ public class DomSampleClassifier {
EdgeDomain sampleDomain = new EdgeDomain(sample.getDomainName()); EdgeDomain sampleDomain = new EdgeDomain(sample.getDomainName());
try { try (var compressedStream = new ZstdInputStream(new ByteArrayInputStream(sample.getHtmlSampleZstd().toByteArray()))) {
var parsedDoc = Jsoup.parse(sample.getHtmlSample()); String html = new String(compressedStream.readAllBytes(), StandardCharsets.UTF_8);
var parsedDoc = Jsoup.parse(html);
var fixedElements = parsedDoc.select("*[data-position=fixed]"); var fixedElements = parsedDoc.select("*[data-position=fixed]");
if (sample.getAcceptedPopover()) { if (sample.getAcceptedPopover()) {
@@ -104,7 +108,7 @@ public class DomSampleClassifier {
} }
} }
catch (Exception ex) { catch (Exception ex) {
logger.warn("Error when parsing DOM HTML sample"); logger.warn("Error when parsing DOM HTML sample for size" + sample.getHtmlSampleZstd().size(), ex);
} }
// Classify outgoing requests // Classify outgoing requests

View File

@@ -1,6 +1,8 @@
package nu.marginalia.converting.processor; package nu.marginalia.converting.processor;
import com.google.inject.Inject; import com.google.inject.Inject;
import io.grpc.Status;
import io.grpc.StatusRuntimeException;
import nu.marginalia.api.domsample.DomSampleClient; import nu.marginalia.api.domsample.DomSampleClient;
import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.atags.source.AnchorTagsSource; import nu.marginalia.atags.source.AnchorTagsSource;
@@ -100,7 +102,10 @@ public class DomainProcessor {
.thenApply(domSampleClassifier::classifySample) .thenApply(domSampleClassifier::classifySample)
.handle((a,b) -> { .handle((a,b) -> {
if (b != null) { if (b != null) {
logger.warn("Exception when fetching sample data", b); var cause = b.getCause();
if (!(cause instanceof StatusRuntimeException sre && sre.getStatus() != Status.NOT_FOUND)) {
logger.warn("Exception when fetching sample data", b);
}
return EnumSet.of(DomSampleClassification.UNCLASSIFIED); return EnumSet.of(DomSampleClassification.UNCLASSIFIED);
} }
return a; return a;

View File

@@ -11,7 +11,7 @@ import nu.marginalia.api.domains.DomainInfoClient;
import nu.marginalia.api.domains.model.DomainInformation; import nu.marginalia.api.domains.model.DomainInformation;
import nu.marginalia.api.domains.model.SimilarDomain; import nu.marginalia.api.domains.model.SimilarDomain;
import nu.marginalia.api.domsample.DomSampleClient; import nu.marginalia.api.domsample.DomSampleClient;
import nu.marginalia.api.domsample.RpcDomainSample; import nu.marginalia.api.domsample.RpcDomainSampleRequests;
import nu.marginalia.api.domsample.RpcOutgoingRequest; import nu.marginalia.api.domsample.RpcOutgoingRequest;
import nu.marginalia.api.feeds.FeedsClient; import nu.marginalia.api.feeds.FeedsClient;
import nu.marginalia.api.feeds.RpcFeed; import nu.marginalia.api.feeds.RpcFeed;
@@ -399,7 +399,7 @@ public class SearchSiteInfoService {
return forServiceUnavailable(domainName); return forServiceUnavailable(domainName);
} }
Optional<RpcDomainSample> sample = domSampleClient.getSample(domainName.toLowerCase()); Optional<RpcDomainSampleRequests> sample = domSampleClient.getSampleRequests(domainName.toLowerCase());
if (sample.isEmpty()) { if (sample.isEmpty()) {
return forNoData(domainName); return forNoData(domainName);
} }