mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 17:32:39 +02:00
Compare commits
9 Commits
deploy-028
...
deploy-029
Author | SHA1 | Date | |
---|---|---|---|
|
a771a5b6ce | ||
|
dac5b54128 | ||
|
6cfb143c15 | ||
|
23c818281b | ||
|
8aad253cf6 | ||
|
556d7af9dc | ||
|
b7a5219ed3 | ||
|
a23ec521fe | ||
|
fff3babc6d |
@@ -41,7 +41,22 @@ public class DomSampleClient {
|
|||||||
}
|
}
|
||||||
catch (StatusRuntimeException sre) {
|
catch (StatusRuntimeException sre) {
|
||||||
if (sre.getStatus() != Status.NOT_FOUND) {
|
if (sre.getStatus() != Status.NOT_FOUND) {
|
||||||
logger.error("Failed to fetch DOM sample");
|
logger.error("Failed to fetch DOM sample", sre);
|
||||||
|
}
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public Optional<RpcDomainSampleRequests> getSampleRequests(String domainName) {
|
||||||
|
try {
|
||||||
|
var val = channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::getSampleRequests)
|
||||||
|
.run(RpcDomainName.newBuilder().setDomainName(domainName).build());
|
||||||
|
|
||||||
|
return Optional.of(val);
|
||||||
|
}
|
||||||
|
catch (StatusRuntimeException sre) {
|
||||||
|
if (sre.getStatus() != Status.NOT_FOUND) {
|
||||||
|
logger.error("Failed to fetch DOM sample", sre);
|
||||||
}
|
}
|
||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
}
|
}
|
||||||
|
@@ -7,6 +7,7 @@ option java_multiple_files=true;
|
|||||||
|
|
||||||
service DomSampleApi {
|
service DomSampleApi {
|
||||||
rpc getSample(RpcDomainName) returns (RpcDomainSample) {}
|
rpc getSample(RpcDomainName) returns (RpcDomainSample) {}
|
||||||
|
rpc getSampleRequests(RpcDomainName) returns (RpcDomainSampleRequests) {}
|
||||||
rpc hasSample(RpcDomainName) returns (RpcBooleanRsp) {}
|
rpc hasSample(RpcDomainName) returns (RpcBooleanRsp) {}
|
||||||
rpc getAllSamples(RpcDomainName) returns (stream RpcDomainSample) {}
|
rpc getAllSamples(RpcDomainName) returns (stream RpcDomainSample) {}
|
||||||
}
|
}
|
||||||
@@ -19,10 +20,16 @@ message RpcBooleanRsp {
|
|||||||
bool answer = 1;
|
bool answer = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
message RpcDomainSampleRequests {
|
||||||
|
string domainName = 1;
|
||||||
|
string url = 2;
|
||||||
|
repeated RpcOutgoingRequest outgoingRequests = 5;
|
||||||
|
}
|
||||||
|
|
||||||
message RpcDomainSample {
|
message RpcDomainSample {
|
||||||
string domainName = 1;
|
string domainName = 1;
|
||||||
string url = 2;
|
string url = 2;
|
||||||
string htmlSample = 3;
|
bytes htmlSampleZstd = 3;
|
||||||
bool accepted_popover = 4;
|
bool accepted_popover = 4;
|
||||||
repeated RpcOutgoingRequest outgoingRequests = 5;
|
repeated RpcOutgoingRequest outgoingRequests = 5;
|
||||||
}
|
}
|
||||||
|
@@ -31,6 +31,7 @@ dependencies {
|
|||||||
implementation libs.jsoup
|
implementation libs.jsoup
|
||||||
implementation libs.opencsv
|
implementation libs.opencsv
|
||||||
implementation libs.slop
|
implementation libs.slop
|
||||||
|
implementation libs.zstd
|
||||||
implementation libs.sqlite
|
implementation libs.sqlite
|
||||||
implementation libs.bundles.slf4j
|
implementation libs.bundles.slf4j
|
||||||
implementation libs.commons.lang3
|
implementation libs.commons.lang3
|
||||||
|
@@ -1,6 +1,8 @@
|
|||||||
package nu.marginalia.domsample;
|
package nu.marginalia.domsample;
|
||||||
|
|
||||||
|
import com.github.luben.zstd.Zstd;
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
|
import com.google.protobuf.ByteString;
|
||||||
import io.grpc.Status;
|
import io.grpc.Status;
|
||||||
import io.grpc.stub.StreamObserver;
|
import io.grpc.stub.StreamObserver;
|
||||||
import nu.marginalia.api.domsample.*;
|
import nu.marginalia.api.domsample.*;
|
||||||
@@ -9,6 +11,7 @@ import nu.marginalia.service.server.DiscoverableService;
|
|||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
public class DomSampleGrpcService
|
public class DomSampleGrpcService
|
||||||
@@ -42,7 +45,36 @@ public class DomSampleGrpcService
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Grab the first sample
|
// Grab the first sample
|
||||||
RpcDomainSample.Builder response = convert(dbRecords.getFirst());
|
RpcDomainSample.Builder response = convertFullSample(dbRecords.getFirst());
|
||||||
|
|
||||||
|
responseObserver.onNext(response.build());
|
||||||
|
responseObserver.onCompleted();
|
||||||
|
}
|
||||||
|
catch (Exception e) {
|
||||||
|
logger.error("Error in getSample()", e);
|
||||||
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void getSampleRequests(RpcDomainName request, StreamObserver<RpcDomainSampleRequests> responseObserver) {
|
||||||
|
String domainName = request.getDomainName();
|
||||||
|
if (domainName.isBlank()) {
|
||||||
|
responseObserver.onError(Status.INVALID_ARGUMENT
|
||||||
|
.withDescription("Invalid domain name")
|
||||||
|
.asRuntimeException());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
|
||||||
|
if (dbRecords.isEmpty()) {
|
||||||
|
responseObserver.onError(Status.NOT_FOUND.withDescription("No sample found").asRuntimeException());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Grab the first sample
|
||||||
|
RpcDomainSampleRequests.Builder response = convertRequestData(dbRecords.getFirst());
|
||||||
|
|
||||||
responseObserver.onNext(response.build());
|
responseObserver.onNext(response.build());
|
||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
@@ -87,7 +119,7 @@ public class DomSampleGrpcService
|
|||||||
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
|
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
|
||||||
|
|
||||||
for (var record : dbRecords) {
|
for (var record : dbRecords) {
|
||||||
responseObserver.onNext(convert(record).build());
|
responseObserver.onNext(convertFullSample(record).build());
|
||||||
}
|
}
|
||||||
|
|
||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
@@ -98,12 +130,14 @@ public class DomSampleGrpcService
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private RpcDomainSample.Builder convert(DomSampleDb.Sample dbSample) {
|
private RpcDomainSample.Builder convertFullSample(DomSampleDb.Sample dbSample) {
|
||||||
|
|
||||||
|
ByteString htmlZstd = ByteString.copyFrom(Zstd.compress(dbSample.sample().getBytes(StandardCharsets.UTF_8)));
|
||||||
|
|
||||||
var sampleBuilder = RpcDomainSample.newBuilder()
|
var sampleBuilder = RpcDomainSample.newBuilder()
|
||||||
.setDomainName(dbSample.domain())
|
.setDomainName(dbSample.domain())
|
||||||
.setAcceptedPopover(dbSample.acceptedPopover())
|
.setAcceptedPopover(dbSample.acceptedPopover())
|
||||||
.setHtmlSample(dbSample.sample());
|
.setHtmlSampleZstd(htmlZstd);
|
||||||
|
|
||||||
for (var req : dbSample.parseRequests()) {
|
for (var req : dbSample.parseRequests()) {
|
||||||
sampleBuilder.addOutgoingRequestsBuilder()
|
sampleBuilder.addOutgoingRequestsBuilder()
|
||||||
@@ -120,4 +154,23 @@ public class DomSampleGrpcService
|
|||||||
return sampleBuilder;
|
return sampleBuilder;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private RpcDomainSampleRequests.Builder convertRequestData(DomSampleDb.Sample dbSample) {
|
||||||
|
|
||||||
|
var sampleBuilder = RpcDomainSampleRequests.newBuilder()
|
||||||
|
.setDomainName(dbSample.domain());
|
||||||
|
|
||||||
|
for (var req : dbSample.parseRequests()) {
|
||||||
|
sampleBuilder.addOutgoingRequestsBuilder()
|
||||||
|
.setUrl(req.uri().toString())
|
||||||
|
.setMethod(switch (req.method().toUpperCase())
|
||||||
|
{
|
||||||
|
case "GET" -> RpcOutgoingRequest.RequestMethod.GET;
|
||||||
|
case "POST" -> RpcOutgoingRequest.RequestMethod.POST;
|
||||||
|
default -> RpcOutgoingRequest.RequestMethod.OTHER;
|
||||||
|
})
|
||||||
|
.setTimestamp(req.timestamp());
|
||||||
|
}
|
||||||
|
|
||||||
|
return sampleBuilder;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@@ -90,6 +90,7 @@ dependencies {
|
|||||||
implementation libs.commons.lang3
|
implementation libs.commons.lang3
|
||||||
implementation libs.commons.compress
|
implementation libs.commons.compress
|
||||||
implementation libs.sqlite
|
implementation libs.sqlite
|
||||||
|
implementation libs.bundles.grpc
|
||||||
|
|
||||||
implementation libs.bundles.httpcomponents
|
implementation libs.bundles.httpcomponents
|
||||||
|
|
||||||
|
@@ -22,6 +22,7 @@ dependencies {
|
|||||||
|
|
||||||
implementation libs.bundles.slf4j
|
implementation libs.bundles.slf4j
|
||||||
implementation libs.guava
|
implementation libs.guava
|
||||||
|
implementation libs.zstd
|
||||||
implementation dependencies.create(libs.guice.get()) {
|
implementation dependencies.create(libs.guice.get()) {
|
||||||
exclude group: 'com.google.guava'
|
exclude group: 'com.google.guava'
|
||||||
}
|
}
|
||||||
|
@@ -1,5 +1,6 @@
|
|||||||
package nu.marginalia.domclassifier;
|
package nu.marginalia.domclassifier;
|
||||||
|
|
||||||
|
import com.github.luben.zstd.ZstdInputStream;
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.api.domsample.RpcDomainSample;
|
import nu.marginalia.api.domsample.RpcDomainSample;
|
||||||
@@ -16,9 +17,11 @@ import org.xml.sax.SAXException;
|
|||||||
import javax.xml.parsers.DocumentBuilder;
|
import javax.xml.parsers.DocumentBuilder;
|
||||||
import javax.xml.parsers.DocumentBuilderFactory;
|
import javax.xml.parsers.DocumentBuilderFactory;
|
||||||
import javax.xml.parsers.ParserConfigurationException;
|
import javax.xml.parsers.ParserConfigurationException;
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.function.Predicate;
|
import java.util.function.Predicate;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
@@ -84,8 +87,9 @@ public class DomSampleClassifier {
|
|||||||
|
|
||||||
EdgeDomain sampleDomain = new EdgeDomain(sample.getDomainName());
|
EdgeDomain sampleDomain = new EdgeDomain(sample.getDomainName());
|
||||||
|
|
||||||
try {
|
try (var compressedStream = new ZstdInputStream(new ByteArrayInputStream(sample.getHtmlSampleZstd().toByteArray()))) {
|
||||||
var parsedDoc = Jsoup.parse(sample.getHtmlSample());
|
String html = new String(compressedStream.readAllBytes(), StandardCharsets.UTF_8);
|
||||||
|
var parsedDoc = Jsoup.parse(html);
|
||||||
var fixedElements = parsedDoc.select("*[data-position=fixed]");
|
var fixedElements = parsedDoc.select("*[data-position=fixed]");
|
||||||
|
|
||||||
if (sample.getAcceptedPopover()) {
|
if (sample.getAcceptedPopover()) {
|
||||||
@@ -104,7 +108,7 @@ public class DomSampleClassifier {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
logger.warn("Error when parsing DOM HTML sample");
|
logger.warn("Error when parsing DOM HTML sample for size" + sample.getHtmlSampleZstd().size(), ex);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Classify outgoing requests
|
// Classify outgoing requests
|
||||||
|
@@ -15,6 +15,7 @@
|
|||||||
<classifier target="url-regex" rule="tracking">/ccm/collect$</classifier>
|
<classifier target="url-regex" rule="tracking">/ccm/collect$</classifier>
|
||||||
<classifier target="url-regex" rule="tracking">^/[0-9]+\.js$</classifier>
|
<classifier target="url-regex" rule="tracking">^/[0-9]+\.js$</classifier>
|
||||||
<classifier target="url-regex" rule="tracking">^/[a-z0-9]\.gif$</classifier>
|
<classifier target="url-regex" rule="tracking">^/[a-z0-9]\.gif$</classifier>
|
||||||
|
<classifier target="url-regex" rule="tracking">^/pixel\.gif$</classifier>
|
||||||
<classifier target="url-regex" rule="ads">/pagead/</classifier>
|
<classifier target="url-regex" rule="ads">/pagead/</classifier>
|
||||||
<classifier target="url-regex" rule="ads">/google-ads/</classifier>
|
<classifier target="url-regex" rule="ads">/google-ads/</classifier>
|
||||||
|
|
||||||
|
@@ -1,6 +1,8 @@
|
|||||||
package nu.marginalia.converting.processor;
|
package nu.marginalia.converting.processor;
|
||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
|
import io.grpc.Status;
|
||||||
|
import io.grpc.StatusRuntimeException;
|
||||||
import nu.marginalia.api.domsample.DomSampleClient;
|
import nu.marginalia.api.domsample.DomSampleClient;
|
||||||
import nu.marginalia.atags.model.DomainLinks;
|
import nu.marginalia.atags.model.DomainLinks;
|
||||||
import nu.marginalia.atags.source.AnchorTagsSource;
|
import nu.marginalia.atags.source.AnchorTagsSource;
|
||||||
@@ -98,10 +100,16 @@ public class DomainProcessor {
|
|||||||
return domSampleClient
|
return domSampleClient
|
||||||
.getSampleAsync(domainName, domSampleExecutor)
|
.getSampleAsync(domainName, domSampleExecutor)
|
||||||
.thenApply(domSampleClassifier::classifySample)
|
.thenApply(domSampleClassifier::classifySample)
|
||||||
.handle((a,b) ->
|
.handle((a,b) -> {
|
||||||
Objects.requireNonNullElseGet(a,
|
if (b != null) {
|
||||||
() -> EnumSet.of(DomSampleClassification.UNCLASSIFIED)))
|
var cause = b.getCause();
|
||||||
.get();
|
if (!(cause instanceof StatusRuntimeException sre && sre.getStatus() != Status.NOT_FOUND)) {
|
||||||
|
logger.warn("Exception when fetching sample data", b);
|
||||||
|
}
|
||||||
|
return EnumSet.of(DomSampleClassification.UNCLASSIFIED);
|
||||||
|
}
|
||||||
|
return a;
|
||||||
|
}).get();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Nullable
|
@Nullable
|
||||||
|
@@ -161,7 +161,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
|
|
||||||
final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, documentHeaders, dld);
|
final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, documentHeaders, dld);
|
||||||
|
|
||||||
|
|
||||||
if (!documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier())) {
|
if (!documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier())) {
|
||||||
features.add(HtmlFeature.SHORT_DOCUMENT);
|
features.add(HtmlFeature.SHORT_DOCUMENT);
|
||||||
}
|
}
|
||||||
|
@@ -115,7 +115,9 @@ public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
|
|
||||||
ret.quality = -5;
|
ret.quality = -5;
|
||||||
|
|
||||||
ret.features = Set.of(HtmlFeature.PDF);
|
ret.features = new HashSet<>(); // must be mutable!
|
||||||
|
ret.features.add(HtmlFeature.PDF);
|
||||||
|
|
||||||
ret.description = getDescription(doc);
|
ret.description = getDescription(doc);
|
||||||
ret.hashCode = dld.localitySensitiveHashCode();
|
ret.hashCode = dld.localitySensitiveHashCode();
|
||||||
|
|
||||||
|
@@ -11,7 +11,7 @@ import nu.marginalia.api.domains.DomainInfoClient;
|
|||||||
import nu.marginalia.api.domains.model.DomainInformation;
|
import nu.marginalia.api.domains.model.DomainInformation;
|
||||||
import nu.marginalia.api.domains.model.SimilarDomain;
|
import nu.marginalia.api.domains.model.SimilarDomain;
|
||||||
import nu.marginalia.api.domsample.DomSampleClient;
|
import nu.marginalia.api.domsample.DomSampleClient;
|
||||||
import nu.marginalia.api.domsample.RpcDomainSample;
|
import nu.marginalia.api.domsample.RpcDomainSampleRequests;
|
||||||
import nu.marginalia.api.domsample.RpcOutgoingRequest;
|
import nu.marginalia.api.domsample.RpcOutgoingRequest;
|
||||||
import nu.marginalia.api.feeds.FeedsClient;
|
import nu.marginalia.api.feeds.FeedsClient;
|
||||||
import nu.marginalia.api.feeds.RpcFeed;
|
import nu.marginalia.api.feeds.RpcFeed;
|
||||||
@@ -399,7 +399,7 @@ public class SearchSiteInfoService {
|
|||||||
return forServiceUnavailable(domainName);
|
return forServiceUnavailable(domainName);
|
||||||
}
|
}
|
||||||
|
|
||||||
Optional<RpcDomainSample> sample = domSampleClient.getSample(domainName.toLowerCase());
|
Optional<RpcDomainSampleRequests> sample = domSampleClient.getSampleRequests(domainName.toLowerCase());
|
||||||
if (sample.isEmpty()) {
|
if (sample.isEmpty()) {
|
||||||
return forNoData(domainName);
|
return forNoData(domainName);
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user