mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
7 Commits
deploy-029
...
deploy-029
Author | SHA1 | Date | |
---|---|---|---|
|
f1a900f383 | ||
|
700364b86d | ||
|
7e725ddaed | ||
|
120209e138 | ||
|
a771a5b6ce | ||
|
dac5b54128 | ||
|
6cfb143c15 |
@@ -47,6 +47,21 @@ public class DomSampleClient {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Optional<RpcDomainSampleRequests> getSampleRequests(String domainName) {
|
||||||
|
try {
|
||||||
|
var val = channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::getSampleRequests)
|
||||||
|
.run(RpcDomainName.newBuilder().setDomainName(domainName).build());
|
||||||
|
|
||||||
|
return Optional.of(val);
|
||||||
|
}
|
||||||
|
catch (StatusRuntimeException sre) {
|
||||||
|
if (sre.getStatus() != Status.NOT_FOUND) {
|
||||||
|
logger.error("Failed to fetch DOM sample", sre);
|
||||||
|
}
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public boolean hasSample(String domainName) {
|
public boolean hasSample(String domainName) {
|
||||||
try {
|
try {
|
||||||
return channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::hasSample)
|
return channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::hasSample)
|
||||||
|
@@ -7,6 +7,7 @@ option java_multiple_files=true;
|
|||||||
|
|
||||||
service DomSampleApi {
|
service DomSampleApi {
|
||||||
rpc getSample(RpcDomainName) returns (RpcDomainSample) {}
|
rpc getSample(RpcDomainName) returns (RpcDomainSample) {}
|
||||||
|
rpc getSampleRequests(RpcDomainName) returns (RpcDomainSampleRequests) {}
|
||||||
rpc hasSample(RpcDomainName) returns (RpcBooleanRsp) {}
|
rpc hasSample(RpcDomainName) returns (RpcBooleanRsp) {}
|
||||||
rpc getAllSamples(RpcDomainName) returns (stream RpcDomainSample) {}
|
rpc getAllSamples(RpcDomainName) returns (stream RpcDomainSample) {}
|
||||||
}
|
}
|
||||||
@@ -19,10 +20,16 @@ message RpcBooleanRsp {
|
|||||||
bool answer = 1;
|
bool answer = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
message RpcDomainSampleRequests {
|
||||||
|
string domainName = 1;
|
||||||
|
string url = 2;
|
||||||
|
repeated RpcOutgoingRequest outgoingRequests = 5;
|
||||||
|
}
|
||||||
|
|
||||||
message RpcDomainSample {
|
message RpcDomainSample {
|
||||||
string domainName = 1;
|
string domainName = 1;
|
||||||
string url = 2;
|
string url = 2;
|
||||||
string htmlSample = 3;
|
bytes htmlSampleZstd = 3;
|
||||||
bool accepted_popover = 4;
|
bool accepted_popover = 4;
|
||||||
repeated RpcOutgoingRequest outgoingRequests = 5;
|
repeated RpcOutgoingRequest outgoingRequests = 5;
|
||||||
}
|
}
|
||||||
|
@@ -31,6 +31,7 @@ dependencies {
|
|||||||
implementation libs.jsoup
|
implementation libs.jsoup
|
||||||
implementation libs.opencsv
|
implementation libs.opencsv
|
||||||
implementation libs.slop
|
implementation libs.slop
|
||||||
|
implementation libs.zstd
|
||||||
implementation libs.sqlite
|
implementation libs.sqlite
|
||||||
implementation libs.bundles.slf4j
|
implementation libs.bundles.slf4j
|
||||||
implementation libs.commons.lang3
|
implementation libs.commons.lang3
|
||||||
|
@@ -1,6 +1,8 @@
|
|||||||
package nu.marginalia.domsample;
|
package nu.marginalia.domsample;
|
||||||
|
|
||||||
|
import com.github.luben.zstd.Zstd;
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
|
import com.google.protobuf.ByteString;
|
||||||
import io.grpc.Status;
|
import io.grpc.Status;
|
||||||
import io.grpc.stub.StreamObserver;
|
import io.grpc.stub.StreamObserver;
|
||||||
import nu.marginalia.api.domsample.*;
|
import nu.marginalia.api.domsample.*;
|
||||||
@@ -9,6 +11,7 @@ import nu.marginalia.service.server.DiscoverableService;
|
|||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
public class DomSampleGrpcService
|
public class DomSampleGrpcService
|
||||||
@@ -42,7 +45,36 @@ public class DomSampleGrpcService
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Grab the first sample
|
// Grab the first sample
|
||||||
RpcDomainSample.Builder response = convert(dbRecords.getFirst());
|
RpcDomainSample.Builder response = convertFullSample(dbRecords.getFirst());
|
||||||
|
|
||||||
|
responseObserver.onNext(response.build());
|
||||||
|
responseObserver.onCompleted();
|
||||||
|
}
|
||||||
|
catch (Exception e) {
|
||||||
|
logger.error("Error in getSample()", e);
|
||||||
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void getSampleRequests(RpcDomainName request, StreamObserver<RpcDomainSampleRequests> responseObserver) {
|
||||||
|
String domainName = request.getDomainName();
|
||||||
|
if (domainName.isBlank()) {
|
||||||
|
responseObserver.onError(Status.INVALID_ARGUMENT
|
||||||
|
.withDescription("Invalid domain name")
|
||||||
|
.asRuntimeException());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
|
||||||
|
if (dbRecords.isEmpty()) {
|
||||||
|
responseObserver.onError(Status.NOT_FOUND.withDescription("No sample found").asRuntimeException());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Grab the first sample
|
||||||
|
RpcDomainSampleRequests.Builder response = convertRequestData(dbRecords.getFirst());
|
||||||
|
|
||||||
responseObserver.onNext(response.build());
|
responseObserver.onNext(response.build());
|
||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
@@ -87,7 +119,7 @@ public class DomSampleGrpcService
|
|||||||
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
|
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
|
||||||
|
|
||||||
for (var record : dbRecords) {
|
for (var record : dbRecords) {
|
||||||
responseObserver.onNext(convert(record).build());
|
responseObserver.onNext(convertFullSample(record).build());
|
||||||
}
|
}
|
||||||
|
|
||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
@@ -98,12 +130,14 @@ public class DomSampleGrpcService
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private RpcDomainSample.Builder convert(DomSampleDb.Sample dbSample) {
|
private RpcDomainSample.Builder convertFullSample(DomSampleDb.Sample dbSample) {
|
||||||
|
|
||||||
|
ByteString htmlZstd = ByteString.copyFrom(Zstd.compress(dbSample.sample().getBytes(StandardCharsets.UTF_8)));
|
||||||
|
|
||||||
var sampleBuilder = RpcDomainSample.newBuilder()
|
var sampleBuilder = RpcDomainSample.newBuilder()
|
||||||
.setDomainName(dbSample.domain())
|
.setDomainName(dbSample.domain())
|
||||||
.setAcceptedPopover(dbSample.acceptedPopover())
|
.setAcceptedPopover(dbSample.acceptedPopover())
|
||||||
.setHtmlSample(dbSample.sample());
|
.setHtmlSampleZstd(htmlZstd);
|
||||||
|
|
||||||
for (var req : dbSample.parseRequests()) {
|
for (var req : dbSample.parseRequests()) {
|
||||||
sampleBuilder.addOutgoingRequestsBuilder()
|
sampleBuilder.addOutgoingRequestsBuilder()
|
||||||
@@ -120,4 +154,23 @@ public class DomSampleGrpcService
|
|||||||
return sampleBuilder;
|
return sampleBuilder;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private RpcDomainSampleRequests.Builder convertRequestData(DomSampleDb.Sample dbSample) {
|
||||||
|
|
||||||
|
var sampleBuilder = RpcDomainSampleRequests.newBuilder()
|
||||||
|
.setDomainName(dbSample.domain());
|
||||||
|
|
||||||
|
for (var req : dbSample.parseRequests()) {
|
||||||
|
sampleBuilder.addOutgoingRequestsBuilder()
|
||||||
|
.setUrl(req.uri().toString())
|
||||||
|
.setMethod(switch (req.method().toUpperCase())
|
||||||
|
{
|
||||||
|
case "GET" -> RpcOutgoingRequest.RequestMethod.GET;
|
||||||
|
case "POST" -> RpcOutgoingRequest.RequestMethod.POST;
|
||||||
|
default -> RpcOutgoingRequest.RequestMethod.OTHER;
|
||||||
|
})
|
||||||
|
.setTimestamp(req.timestamp());
|
||||||
|
}
|
||||||
|
|
||||||
|
return sampleBuilder;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@@ -22,6 +22,7 @@ dependencies {
|
|||||||
|
|
||||||
implementation libs.bundles.slf4j
|
implementation libs.bundles.slf4j
|
||||||
implementation libs.guava
|
implementation libs.guava
|
||||||
|
implementation libs.zstd
|
||||||
implementation dependencies.create(libs.guice.get()) {
|
implementation dependencies.create(libs.guice.get()) {
|
||||||
exclude group: 'com.google.guava'
|
exclude group: 'com.google.guava'
|
||||||
}
|
}
|
||||||
|
@@ -1,5 +1,6 @@
|
|||||||
package nu.marginalia.domclassifier;
|
package nu.marginalia.domclassifier;
|
||||||
|
|
||||||
|
import com.github.luben.zstd.ZstdInputStream;
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.api.domsample.RpcDomainSample;
|
import nu.marginalia.api.domsample.RpcDomainSample;
|
||||||
@@ -19,6 +20,7 @@ import javax.xml.parsers.ParserConfigurationException;
|
|||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.function.Predicate;
|
import java.util.function.Predicate;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
@@ -84,8 +86,9 @@ public class DomSampleClassifier {
|
|||||||
|
|
||||||
EdgeDomain sampleDomain = new EdgeDomain(sample.getDomainName());
|
EdgeDomain sampleDomain = new EdgeDomain(sample.getDomainName());
|
||||||
|
|
||||||
try {
|
try (var compressedStream = new ZstdInputStream(sample.getHtmlSampleZstd().newInput())) {
|
||||||
var parsedDoc = Jsoup.parse(sample.getHtmlSample());
|
String html = new String(compressedStream.readAllBytes(), StandardCharsets.UTF_8);
|
||||||
|
var parsedDoc = Jsoup.parse(html);
|
||||||
var fixedElements = parsedDoc.select("*[data-position=fixed]");
|
var fixedElements = parsedDoc.select("*[data-position=fixed]");
|
||||||
|
|
||||||
if (sample.getAcceptedPopover()) {
|
if (sample.getAcceptedPopover()) {
|
||||||
@@ -104,7 +107,7 @@ public class DomSampleClassifier {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
logger.warn("Error when parsing DOM HTML sample");
|
logger.warn("Error when parsing DOM HTML sample", ex);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Classify outgoing requests
|
// Classify outgoing requests
|
||||||
|
@@ -102,7 +102,8 @@ public class DomainProcessor {
|
|||||||
.thenApply(domSampleClassifier::classifySample)
|
.thenApply(domSampleClassifier::classifySample)
|
||||||
.handle((a,b) -> {
|
.handle((a,b) -> {
|
||||||
if (b != null) {
|
if (b != null) {
|
||||||
if (!(b instanceof StatusRuntimeException sre && sre.getStatus() != Status.NOT_FOUND)) {
|
var cause = b.getCause();
|
||||||
|
if (!(cause instanceof StatusRuntimeException sre && sre.getStatus() != Status.NOT_FOUND)) {
|
||||||
logger.warn("Exception when fetching sample data", b);
|
logger.warn("Exception when fetching sample data", b);
|
||||||
}
|
}
|
||||||
return EnumSet.of(DomSampleClassification.UNCLASSIFIED);
|
return EnumSet.of(DomSampleClassification.UNCLASSIFIED);
|
||||||
|
@@ -11,7 +11,7 @@ import nu.marginalia.api.domains.DomainInfoClient;
|
|||||||
import nu.marginalia.api.domains.model.DomainInformation;
|
import nu.marginalia.api.domains.model.DomainInformation;
|
||||||
import nu.marginalia.api.domains.model.SimilarDomain;
|
import nu.marginalia.api.domains.model.SimilarDomain;
|
||||||
import nu.marginalia.api.domsample.DomSampleClient;
|
import nu.marginalia.api.domsample.DomSampleClient;
|
||||||
import nu.marginalia.api.domsample.RpcDomainSample;
|
import nu.marginalia.api.domsample.RpcDomainSampleRequests;
|
||||||
import nu.marginalia.api.domsample.RpcOutgoingRequest;
|
import nu.marginalia.api.domsample.RpcOutgoingRequest;
|
||||||
import nu.marginalia.api.feeds.FeedsClient;
|
import nu.marginalia.api.feeds.FeedsClient;
|
||||||
import nu.marginalia.api.feeds.RpcFeed;
|
import nu.marginalia.api.feeds.RpcFeed;
|
||||||
@@ -399,7 +399,7 @@ public class SearchSiteInfoService {
|
|||||||
return forServiceUnavailable(domainName);
|
return forServiceUnavailable(domainName);
|
||||||
}
|
}
|
||||||
|
|
||||||
Optional<RpcDomainSample> sample = domSampleClient.getSample(domainName.toLowerCase());
|
Optional<RpcDomainSampleRequests> sample = domSampleClient.getSampleRequests(domainName.toLowerCase());
|
||||||
if (sample.isEmpty()) {
|
if (sample.isEmpty()) {
|
||||||
return forNoData(domainName);
|
return forNoData(domainName);
|
||||||
}
|
}
|
||||||
|
@@ -38,8 +38,8 @@
|
|||||||
<a href="https://old-search.marginalia.nu/" class="underline text-liteblue dark:text-blue-200">here</a>.
|
<a href="https://old-search.marginalia.nu/" class="underline text-liteblue dark:text-blue-200">here</a>.
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="mx-auto flex flex-col sm:flex-row my-4 sm:space-x-2 space-y-2 sm:space-y-0 w-full md:w-auto px-2 items-center sm:items-stretch">
|
<div class="mx-auto px-8 flex flex-col sm:flex-row my-4 sm:space-x-2 space-y-2 sm:space-y-0 w-full md:w-auto items-center sm:items-stretch">
|
||||||
<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-8 sm:p-4 space-y-3 w-96 sm:w-64">
|
<div class="flex flex-col items-center border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-8 sm:p-4 space-y-3 w-[300px] sm:w-64">
|
||||||
<div><i class="fas fa-sailboat mx-2 text-margeblue dark:text-slate-200"></i>Explore the Web</div>
|
<div><i class="fas fa-sailboat mx-2 text-margeblue dark:text-slate-200"></i>Explore the Web</div>
|
||||||
<ul class="list-disc ml-8 sm:ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
<ul class="list-disc ml-8 sm:ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
||||||
<li>Prioritizes non-commercial content</li>
|
<li>Prioritizes non-commercial content</li>
|
||||||
@@ -48,14 +48,14 @@
|
|||||||
</ul>
|
</ul>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-8 sm:p-4 space-y-3 w-96 sm:w-64">
|
<div class="flex flex-col items-center border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-8 sm:p-4 space-y-3 w-[300px] sm:w-64">
|
||||||
<div><i class="fas fa-hand-holding-hand mx-2 text-margeblue dark:text-slate-200"></i>Open Source</div>
|
<div><i class="fas fa-hand-holding-hand mx-2 text-margeblue dark:text-slate-200"></i>Open Source</div>
|
||||||
<ul class="list-disc ml-8 sm:ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
<ul class="list-disc ml-8 sm:ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
||||||
<li>Custom index and crawler software</li>
|
<li>Custom index and crawler software</li>
|
||||||
<li>Simple technology, no AI</li>
|
<li>Simple technology, no AI</li>
|
||||||
<li>AGPL license</li>
|
<li>AGPL license</li>
|
||||||
</ul>
|
</ul>
|
||||||
<div class="flex pt-4 gap-2">
|
<div class="flex pt-4 gap-2 flex-col md:flex-row">
|
||||||
<div class="text-xs text-liteblue dark:text-blue-200">
|
<div class="text-xs text-liteblue dark:text-blue-200">
|
||||||
<i class="fa-brands fa-github"></i>
|
<i class="fa-brands fa-github"></i>
|
||||||
<a href="https://git.marginalia.nu/" class="underline">Git Repository</a>
|
<a href="https://git.marginalia.nu/" class="underline">Git Repository</a>
|
||||||
@@ -67,7 +67,7 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-8 sm:p-4 space-y-3 w-96 sm:w-64">
|
<div class="flex flex-col items-center border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-8 sm:p-4 space-y-3 w-[300px] sm:w-64">
|
||||||
<div><i class="fas fa-lock mx-2 text-margeblue dark:text-slate-200"></i> Privacy by default</div>
|
<div><i class="fas fa-lock mx-2 text-margeblue dark:text-slate-200"></i> Privacy by default</div>
|
||||||
<ul class="list-disc ml-8 sm:ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
<ul class="list-disc ml-8 sm:ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
||||||
<li>Filter out tracking </li>
|
<li>Filter out tracking </li>
|
||||||
|
@@ -11,4 +11,5 @@
|
|||||||
2025-05-17: Redeploy all.
|
2025-05-17: Redeploy all.
|
||||||
2025-05-28: Deploy assistant and browserless.
|
2025-05-28: Deploy assistant and browserless.
|
||||||
2025-06-06: Deploy assistant and browserless.
|
2025-06-06: Deploy assistant and browserless.
|
||||||
2025-07-21: Deploy executor partition 1.
|
2025-07-21: Deploy executor partition 1.
|
||||||
|
2025-07-21: Deploy search.
|
Reference in New Issue
Block a user