mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
2 Commits
deploy-029
...
deploy-029
Author | SHA1 | Date | |
---|---|---|---|
|
6cfb143c15 | ||
|
23c818281b |
@@ -47,6 +47,21 @@ public class DomSampleClient {
|
||||
}
|
||||
}
|
||||
|
||||
public Optional<RpcDomainSampleRequests> getSampleRequests(String domainName) {
|
||||
try {
|
||||
var val = channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::getSampleRequests)
|
||||
.run(RpcDomainName.newBuilder().setDomainName(domainName).build());
|
||||
|
||||
return Optional.of(val);
|
||||
}
|
||||
catch (StatusRuntimeException sre) {
|
||||
if (sre.getStatus() != Status.NOT_FOUND) {
|
||||
logger.error("Failed to fetch DOM sample", sre);
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasSample(String domainName) {
|
||||
try {
|
||||
return channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::hasSample)
|
||||
|
@@ -7,6 +7,7 @@ option java_multiple_files=true;
|
||||
|
||||
// RPC surface for DOM samples; every call takes an RpcDomainName as its request.
service DomSampleApi {
|
||||
// Unary call returning one RpcDomainSample.
rpc getSample(RpcDomainName) returns (RpcDomainSample) {}
|
||||
// Unary call returning an RpcDomainSampleRequests message.
rpc getSampleRequests(RpcDomainName) returns (RpcDomainSampleRequests) {}
|
||||
// Unary call returning a boolean answer wrapped in RpcBooleanRsp.
rpc hasSample(RpcDomainName) returns (RpcBooleanRsp) {}
|
||||
// Server-streaming call: the response is a stream of RpcDomainSample messages.
rpc getAllSamples(RpcDomainName) returns (stream RpcDomainSample) {}
|
||||
}
|
||||
@@ -19,10 +20,16 @@ message RpcBooleanRsp {
|
||||
bool answer = 1;
|
||||
}
|
||||
|
||||
// Response of getSampleRequests: a domain name, a URL and a list of outgoing
// requests. Field numbers 1, 2 and 5 are the ones used here; 3 and 4 are unused
// in this message.
message RpcDomainSampleRequests {
|
||||
string domainName = 1;
|
||||
string url = 2;
|
||||
repeated RpcOutgoingRequest outgoingRequests = 5;
|
||||
}
|
||||
|
||||
// A DOM sample for a domain: domain name, URL, HTML payload, popover flag and
// the list of outgoing requests.
message RpcDomainSample {
|
||||
string domainName = 1;
|
||||
string url = 2;
|
||||
// NOTE(review): the next two declarations both use field number 3. This looks
// like the before/after of a change from a plain string to a bytes payload
// (presumably zstd-compressed, going by the name) — confirm. Reusing a field
// number with a different type can break readers built against the earlier
// schema; verify that all producers and consumers are updated together.
string htmlSample = 3;
|
||||
bytes htmlSampleZstd = 3;
|
||||
bool accepted_popover = 4;
|
||||
repeated RpcOutgoingRequest outgoingRequests = 5;
|
||||
}
|
||||
|
@@ -31,6 +31,7 @@ dependencies {
|
||||
implementation libs.jsoup
|
||||
implementation libs.opencsv
|
||||
implementation libs.slop
|
||||
implementation libs.zstd
|
||||
implementation libs.sqlite
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.commons.lang3
|
||||
|
@@ -1,6 +1,8 @@
|
||||
package nu.marginalia.domsample;
|
||||
|
||||
import com.github.luben.zstd.Zstd;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.protobuf.ByteString;
|
||||
import io.grpc.Status;
|
||||
import io.grpc.stub.StreamObserver;
|
||||
import nu.marginalia.api.domsample.*;
|
||||
@@ -9,6 +11,7 @@ import nu.marginalia.service.server.DiscoverableService;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.List;
|
||||
|
||||
public class DomSampleGrpcService
|
||||
@@ -42,7 +45,36 @@ public class DomSampleGrpcService
|
||||
}
|
||||
|
||||
// Grab the first sample
|
||||
RpcDomainSample.Builder response = convert(dbRecords.getFirst());
|
||||
RpcDomainSample.Builder response = convertFullSample(dbRecords.getFirst());
|
||||
|
||||
responseObserver.onNext(response.build());
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Error in getSample()", e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void getSampleRequests(RpcDomainName request, StreamObserver<RpcDomainSampleRequests> responseObserver) {
|
||||
String domainName = request.getDomainName();
|
||||
if (domainName.isBlank()) {
|
||||
responseObserver.onError(Status.INVALID_ARGUMENT
|
||||
.withDescription("Invalid domain name")
|
||||
.asRuntimeException());
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
|
||||
if (dbRecords.isEmpty()) {
|
||||
responseObserver.onError(Status.NOT_FOUND.withDescription("No sample found").asRuntimeException());
|
||||
return;
|
||||
}
|
||||
|
||||
// Grab the first sample
|
||||
RpcDomainSampleRequests.Builder response = convertRequestData(dbRecords.getFirst());
|
||||
|
||||
responseObserver.onNext(response.build());
|
||||
responseObserver.onCompleted();
|
||||
@@ -87,7 +119,7 @@ public class DomSampleGrpcService
|
||||
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
|
||||
|
||||
for (var record : dbRecords) {
|
||||
responseObserver.onNext(convert(record).build());
|
||||
responseObserver.onNext(convertFullSample(record).build());
|
||||
}
|
||||
|
||||
responseObserver.onCompleted();
|
||||
@@ -98,12 +130,14 @@ public class DomSampleGrpcService
|
||||
}
|
||||
}
|
||||
|
||||
private RpcDomainSample.Builder convert(DomSampleDb.Sample dbSample) {
|
||||
private RpcDomainSample.Builder convertFullSample(DomSampleDb.Sample dbSample) {
|
||||
|
||||
ByteString htmlZstd = ByteString.copyFrom(Zstd.compress(dbSample.sample().getBytes(StandardCharsets.UTF_8)));
|
||||
|
||||
var sampleBuilder = RpcDomainSample.newBuilder()
|
||||
.setDomainName(dbSample.domain())
|
||||
.setAcceptedPopover(dbSample.acceptedPopover())
|
||||
.setHtmlSample(dbSample.sample());
|
||||
.setHtmlSampleZstd(htmlZstd);
|
||||
|
||||
for (var req : dbSample.parseRequests()) {
|
||||
sampleBuilder.addOutgoingRequestsBuilder()
|
||||
@@ -120,4 +154,23 @@ public class DomSampleGrpcService
|
||||
return sampleBuilder;
|
||||
}
|
||||
|
||||
private RpcDomainSampleRequests.Builder convertRequestData(DomSampleDb.Sample dbSample) {
|
||||
|
||||
var sampleBuilder = RpcDomainSampleRequests.newBuilder()
|
||||
.setDomainName(dbSample.domain());
|
||||
|
||||
for (var req : dbSample.parseRequests()) {
|
||||
sampleBuilder.addOutgoingRequestsBuilder()
|
||||
.setUrl(req.uri().toString())
|
||||
.setMethod(switch (req.method().toUpperCase())
|
||||
{
|
||||
case "GET" -> RpcOutgoingRequest.RequestMethod.GET;
|
||||
case "POST" -> RpcOutgoingRequest.RequestMethod.POST;
|
||||
default -> RpcOutgoingRequest.RequestMethod.OTHER;
|
||||
})
|
||||
.setTimestamp(req.timestamp());
|
||||
}
|
||||
|
||||
return sampleBuilder;
|
||||
}
|
||||
}
|
||||
|
@@ -90,6 +90,7 @@ dependencies {
|
||||
implementation libs.commons.lang3
|
||||
implementation libs.commons.compress
|
||||
implementation libs.sqlite
|
||||
implementation libs.bundles.grpc
|
||||
|
||||
implementation libs.bundles.httpcomponents
|
||||
|
||||
|
@@ -22,6 +22,7 @@ dependencies {
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.guava
|
||||
implementation libs.zstd
|
||||
implementation dependencies.create(libs.guice.get()) {
|
||||
exclude group: 'com.google.guava'
|
||||
}
|
||||
|
@@ -1,5 +1,6 @@
|
||||
package nu.marginalia.domclassifier;
|
||||
|
||||
import com.github.luben.zstd.ZstdInputStream;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.api.domsample.RpcDomainSample;
|
||||
@@ -19,6 +20,7 @@ import javax.xml.parsers.ParserConfigurationException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.*;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Pattern;
|
||||
@@ -84,8 +86,9 @@ public class DomSampleClassifier {
|
||||
|
||||
EdgeDomain sampleDomain = new EdgeDomain(sample.getDomainName());
|
||||
|
||||
try {
|
||||
var parsedDoc = Jsoup.parse(sample.getHtmlSample());
|
||||
try (var compressedStream = new ZstdInputStream(sample.getHtmlSampleZstd().newInput())) {
|
||||
String html = new String(compressedStream.readAllBytes(), StandardCharsets.UTF_8);
|
||||
var parsedDoc = Jsoup.parse(html);
|
||||
var fixedElements = parsedDoc.select("*[data-position=fixed]");
|
||||
|
||||
if (sample.getAcceptedPopover()) {
|
||||
|
@@ -1,6 +1,8 @@
|
||||
package nu.marginalia.converting.processor;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import io.grpc.Status;
|
||||
import io.grpc.StatusRuntimeException;
|
||||
import nu.marginalia.api.domsample.DomSampleClient;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.atags.source.AnchorTagsSource;
|
||||
@@ -100,7 +102,10 @@ public class DomainProcessor {
|
||||
.thenApply(domSampleClassifier::classifySample)
|
||||
.handle((a,b) -> {
|
||||
if (b != null) {
|
||||
logger.warn("Exception when fetching sample data", b);
|
||||
var cause = b.getCause();
|
||||
if (!(cause instanceof StatusRuntimeException sre && sre.getStatus() != Status.NOT_FOUND)) {
|
||||
logger.warn("Exception when fetching sample data", b);
|
||||
}
|
||||
return EnumSet.of(DomSampleClassification.UNCLASSIFIED);
|
||||
}
|
||||
return a;
|
||||
|
@@ -11,7 +11,7 @@ import nu.marginalia.api.domains.DomainInfoClient;
|
||||
import nu.marginalia.api.domains.model.DomainInformation;
|
||||
import nu.marginalia.api.domains.model.SimilarDomain;
|
||||
import nu.marginalia.api.domsample.DomSampleClient;
|
||||
import nu.marginalia.api.domsample.RpcDomainSample;
|
||||
import nu.marginalia.api.domsample.RpcDomainSampleRequests;
|
||||
import nu.marginalia.api.domsample.RpcOutgoingRequest;
|
||||
import nu.marginalia.api.feeds.FeedsClient;
|
||||
import nu.marginalia.api.feeds.RpcFeed;
|
||||
@@ -399,7 +399,7 @@ public class SearchSiteInfoService {
|
||||
return forServiceUnavailable(domainName);
|
||||
}
|
||||
|
||||
Optional<RpcDomainSample> sample = domSampleClient.getSample(domainName.toLowerCase());
|
||||
Optional<RpcDomainSampleRequests> sample = domSampleClient.getSampleRequests(domainName.toLowerCase());
|
||||
if (sample.isEmpty()) {
|
||||
return forNoData(domainName);
|
||||
}
|
||||
|
Reference in New Issue
Block a user