1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

Compare commits

...

8 Commits

Author SHA1 Message Date
Viktor Lofgren
6cfb143c15 (sample) Compress sample HTML data and introduce new API for only getting requests 2025-07-21 13:55:25 +02:00
Viktor Lofgren
23c818281b (converter) Reduce DomSample logging for NOT_FOUND 2025-07-21 13:37:55 +02:00
Viktor Lofgren
8aad253cf6 (converter) Add more logging around dom sample data retrieval errors 2025-07-21 13:26:38 +02:00
Viktor Lofgren
556d7af9dc Reapply "(grpc) Use grpc-netty instead of grpc-netty-shaded"
This reverts commit b7a5219ed3.
2025-07-21 13:23:32 +02:00
Viktor Lofgren
b7a5219ed3 Revert "(grpc) Use grpc-netty instead of grpc-netty-shaded"
Reverting this change to see if it's the cause of some instability issues observed.
2025-07-21 13:10:41 +02:00
Viktor Lofgren
a23ec521fe (converter) Ensure features is mutable on DetailsWithWords as this is assumed later 2025-07-21 12:50:04 +02:00
Viktor Lofgren
fff3babc6d (classifier) Add rule for */pixel.gif as likely tracking pixels 2025-07-21 12:35:57 +02:00
Viktor Lofgren
b2bfb8217c (special) Trigger CD run 2025-07-21 12:28:24 +02:00
13 changed files with 109 additions and 17 deletions

View File

@@ -41,7 +41,22 @@ public class DomSampleClient {
}
catch (StatusRuntimeException sre) {
if (sre.getStatus() != Status.NOT_FOUND) {
logger.error("Failed to fetch DOM sample");
logger.error("Failed to fetch DOM sample", sre);
}
return Optional.empty();
}
}
public Optional<RpcDomainSampleRequests> getSampleRequests(String domainName) {
try {
var val = channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::getSampleRequests)
.run(RpcDomainName.newBuilder().setDomainName(domainName).build());
return Optional.of(val);
}
catch (StatusRuntimeException sre) {
if (sre.getStatus() != Status.NOT_FOUND) {
logger.error("Failed to fetch DOM sample", sre);
}
return Optional.empty();
}

View File

@@ -7,6 +7,7 @@ option java_multiple_files=true;
// gRPC service for looking up DOM samples by domain name.
// Every call takes an RpcDomainName as its request message.
service DomSampleApi {
// Unary call returning a single RpcDomainSample.
rpc getSample(RpcDomainName) returns (RpcDomainSample) {}
// Unary call returning RpcDomainSampleRequests, which carries the
// outgoing requests of a sample but has no HTML field.
rpc getSampleRequests(RpcDomainName) returns (RpcDomainSampleRequests) {}
// Unary call answering with an RpcBooleanRsp.
rpc hasSample(RpcDomainName) returns (RpcBooleanRsp) {}
// Server-streaming call returning a stream of RpcDomainSample messages.
rpc getAllSamples(RpcDomainName) returns (stream RpcDomainSample) {}
}
@@ -19,10 +20,16 @@ message RpcBooleanRsp {
bool answer = 1;
}
// Response message of DomSampleApi.getSampleRequests.
// Carries the domain name, url and outgoing requests only; there is no
// HTML field in this message.
// The field numbers (1, 2, 5) are the same as those of the like-named
// fields in RpcDomainSample.
message RpcDomainSampleRequests {
string domainName = 1;
string url = 2;
repeated RpcOutgoingRequest outgoingRequests = 5;
}
message RpcDomainSample {
string domainName = 1;
string url = 2;
string htmlSample = 3;
bytes htmlSampleZstd = 3;
bool accepted_popover = 4;
repeated RpcOutgoingRequest outgoingRequests = 5;
}

View File

@@ -31,6 +31,7 @@ dependencies {
implementation libs.jsoup
implementation libs.opencsv
implementation libs.slop
implementation libs.zstd
implementation libs.sqlite
implementation libs.bundles.slf4j
implementation libs.commons.lang3

View File

@@ -1,6 +1,8 @@
package nu.marginalia.domsample;
import com.github.luben.zstd.Zstd;
import com.google.inject.Inject;
import com.google.protobuf.ByteString;
import io.grpc.Status;
import io.grpc.stub.StreamObserver;
import nu.marginalia.api.domsample.*;
@@ -9,6 +11,7 @@ import nu.marginalia.service.server.DiscoverableService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.charset.StandardCharsets;
import java.util.List;
public class DomSampleGrpcService
@@ -42,7 +45,36 @@ public class DomSampleGrpcService
}
// Grab the first sample
RpcDomainSample.Builder response = convert(dbRecords.getFirst());
RpcDomainSample.Builder response = convertFullSample(dbRecords.getFirst());
responseObserver.onNext(response.build());
responseObserver.onCompleted();
}
catch (Exception e) {
logger.error("Error in getSample()", e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@Override
public void getSampleRequests(RpcDomainName request, StreamObserver<RpcDomainSampleRequests> responseObserver) {
String domainName = request.getDomainName();
if (domainName.isBlank()) {
responseObserver.onError(Status.INVALID_ARGUMENT
.withDescription("Invalid domain name")
.asRuntimeException());
return;
}
try {
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
if (dbRecords.isEmpty()) {
responseObserver.onError(Status.NOT_FOUND.withDescription("No sample found").asRuntimeException());
return;
}
// Grab the first sample
RpcDomainSampleRequests.Builder response = convertRequestData(dbRecords.getFirst());
responseObserver.onNext(response.build());
responseObserver.onCompleted();
@@ -87,7 +119,7 @@ public class DomSampleGrpcService
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
for (var record : dbRecords) {
responseObserver.onNext(convert(record).build());
responseObserver.onNext(convertFullSample(record).build());
}
responseObserver.onCompleted();
@@ -98,12 +130,14 @@ public class DomSampleGrpcService
}
}
private RpcDomainSample.Builder convert(DomSampleDb.Sample dbSample) {
private RpcDomainSample.Builder convertFullSample(DomSampleDb.Sample dbSample) {
ByteString htmlZstd = ByteString.copyFrom(Zstd.compress(dbSample.sample().getBytes(StandardCharsets.UTF_8)));
var sampleBuilder = RpcDomainSample.newBuilder()
.setDomainName(dbSample.domain())
.setAcceptedPopover(dbSample.acceptedPopover())
.setHtmlSample(dbSample.sample());
.setHtmlSampleZstd(htmlZstd);
for (var req : dbSample.parseRequests()) {
sampleBuilder.addOutgoingRequestsBuilder()
@@ -120,4 +154,23 @@ public class DomSampleGrpcService
return sampleBuilder;
}
/**
 * Converts a stored sample into the requests-only response message.
 *
 * <p>Only the domain name and the outgoing requests are populated here;
 * no other field of the message is set by this method.
 *
 * @param dbSample the database record to convert
 * @return a builder holding the domain name and one outgoing-request entry
 *         per request yielded by {@code dbSample.parseRequests()}
 */
private RpcDomainSampleRequests.Builder convertRequestData(DomSampleDb.Sample dbSample) {
    RpcDomainSampleRequests.Builder responseBuilder = RpcDomainSampleRequests.newBuilder();
    responseBuilder.setDomainName(dbSample.domain());

    for (var outgoing : dbSample.parseRequests()) {
        var entry = responseBuilder.addOutgoingRequestsBuilder();
        entry.setUrl(outgoing.uri().toString());

        // Anything that is not GET or POST (compared case-insensitively) maps to OTHER.
        String verb = outgoing.method().toUpperCase();
        RpcOutgoingRequest.RequestMethod rpcMethod = switch (verb) {
            case "GET" -> RpcOutgoingRequest.RequestMethod.GET;
            case "POST" -> RpcOutgoingRequest.RequestMethod.POST;
            default -> RpcOutgoingRequest.RequestMethod.OTHER;
        };
        entry.setMethod(rpcMethod);

        entry.setTimestamp(outgoing.timestamp());
    }

    return responseBuilder;
}
}

View File

@@ -90,6 +90,7 @@ dependencies {
implementation libs.commons.lang3
implementation libs.commons.compress
implementation libs.sqlite
implementation libs.bundles.grpc
implementation libs.bundles.httpcomponents

View File

@@ -22,6 +22,7 @@ dependencies {
implementation libs.bundles.slf4j
implementation libs.guava
implementation libs.zstd
implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava'
}

View File

@@ -1,5 +1,6 @@
package nu.marginalia.domclassifier;
import com.github.luben.zstd.ZstdInputStream;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.api.domsample.RpcDomainSample;
@@ -19,6 +20,7 @@ import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.function.Predicate;
import java.util.regex.Pattern;
@@ -84,8 +86,9 @@ public class DomSampleClassifier {
EdgeDomain sampleDomain = new EdgeDomain(sample.getDomainName());
try {
var parsedDoc = Jsoup.parse(sample.getHtmlSample());
try (var compressedStream = new ZstdInputStream(sample.getHtmlSampleZstd().newInput())) {
String html = new String(compressedStream.readAllBytes(), StandardCharsets.UTF_8);
var parsedDoc = Jsoup.parse(html);
var fixedElements = parsedDoc.select("*[data-position=fixed]");
if (sample.getAcceptedPopover()) {

View File

@@ -15,6 +15,7 @@
<classifier target="url-regex" rule="tracking">/ccm/collect$</classifier>
<classifier target="url-regex" rule="tracking">^/[0-9]+\.js$</classifier>
<classifier target="url-regex" rule="tracking">^/[a-z0-9]\.gif$</classifier>
<classifier target="url-regex" rule="tracking">^/pixel\.gif$</classifier>
<classifier target="url-regex" rule="ads">/pagead/</classifier>
<classifier target="url-regex" rule="ads">/google-ads/</classifier>

View File

@@ -1,6 +1,8 @@
package nu.marginalia.converting.processor;
import com.google.inject.Inject;
import io.grpc.Status;
import io.grpc.StatusRuntimeException;
import nu.marginalia.api.domsample.DomSampleClient;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.atags.source.AnchorTagsSource;
@@ -98,10 +100,16 @@ public class DomainProcessor {
return domSampleClient
.getSampleAsync(domainName, domSampleExecutor)
.thenApply(domSampleClassifier::classifySample)
.handle((a,b) ->
Objects.requireNonNullElseGet(a,
() -> EnumSet.of(DomSampleClassification.UNCLASSIFIED)))
.get();
.handle((a,b) -> {
if (b != null) {
var cause = b.getCause();
if (!(cause instanceof StatusRuntimeException sre && sre.getStatus() != Status.NOT_FOUND)) {
logger.warn("Exception when fetching sample data", b);
}
return EnumSet.of(DomSampleClassification.UNCLASSIFIED);
}
return a;
}).get();
}
@Nullable

View File

@@ -161,7 +161,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, documentHeaders, dld);
if (!documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier())) {
features.add(HtmlFeature.SHORT_DOCUMENT);
}

View File

@@ -115,7 +115,9 @@ public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
ret.quality = -5;
ret.features = Set.of(HtmlFeature.PDF);
ret.features = new HashSet<>(); // must be mutable!
ret.features.add(HtmlFeature.PDF);
ret.description = getDescription(doc);
ret.hashCode = dld.localitySensitiveHashCode();

View File

@@ -11,7 +11,7 @@ import nu.marginalia.api.domains.DomainInfoClient;
import nu.marginalia.api.domains.model.DomainInformation;
import nu.marginalia.api.domains.model.SimilarDomain;
import nu.marginalia.api.domsample.DomSampleClient;
import nu.marginalia.api.domsample.RpcDomainSample;
import nu.marginalia.api.domsample.RpcDomainSampleRequests;
import nu.marginalia.api.domsample.RpcOutgoingRequest;
import nu.marginalia.api.feeds.FeedsClient;
import nu.marginalia.api.feeds.RpcFeed;
@@ -399,7 +399,7 @@ public class SearchSiteInfoService {
return forServiceUnavailable(domainName);
}
Optional<RpcDomainSample> sample = domSampleClient.getSample(domainName.toLowerCase());
Optional<RpcDomainSampleRequests> sample = domSampleClient.getSampleRequests(domainName.toLowerCase());
if (sample.isEmpty()) {
return forNoData(domainName);
}

View File

@@ -10,4 +10,5 @@
2025-05-08: Deploy assistant.
2025-05-17: Redeploy all.
2025-05-28: Deploy assistant and browserless.
2025-06-06: Deploy assistant and browserless.
2025-06-06: Deploy assistant and browserless.
2025-07-21: Deploy executor partition 1.