1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

7 Commits

Author SHA1 Message Date
Viktor Lofgren
f1a900f383 (search) Clean up front page mobile design a bit 2025-07-23 12:20:40 +02:00
Viktor Lofgren
700364b86d (sample) Remove debug logging
The problem sat in the desk chair all along
2025-07-21 15:08:20 +02:00
Viktor Lofgren
7e725ddaed (sample) Remove debug logging
The problem sat in the desk chair all along
2025-07-21 14:41:59 +02:00
Viktor Lofgren
120209e138 (sample) Diagnosing compression errors 2025-07-21 14:34:08 +02:00
Viktor Lofgren
a771a5b6ce (sample) Test different approach to decoding 2025-07-21 14:19:01 +02:00
Viktor Lofgren
dac5b54128 (sample) Better logging for sample errors 2025-07-21 14:03:58 +02:00
Viktor Lofgren
6cfb143c15 (sample) Compress sample HTML data and introduce new API for only getting requests 2025-07-21 13:55:25 +02:00
10 changed files with 99 additions and 17 deletions

View File

@@ -47,6 +47,21 @@ public class DomSampleClient {
} }
} }
/**
 * Fetches the outgoing-request data of a domain's DOM sample, without the HTML payload.
 *
 * @param domainName the domain name to send in the {@code RpcDomainName} request
 * @return the service response, or an empty Optional if the call failed with a
 *         {@link StatusRuntimeException} (logged as an error unless the status code is NOT_FOUND)
 */
public Optional<RpcDomainSampleRequests> getSampleRequests(String domainName) {
    try {
        var val = channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::getSampleRequests)
                .run(RpcDomainName.newBuilder().setDomainName(domainName).build());
        return Optional.of(val);
    }
    catch (StatusRuntimeException sre) {
        // Compare status codes rather than Status instances: a Status that carries a
        // description is a different object than the Status.NOT_FOUND constant, so an
        // identity comparison would treat every "not found" reply as an error.
        if (sre.getStatus().getCode() != Status.Code.NOT_FOUND) {
            logger.error("Failed to fetch DOM sample", sre);
        }
        return Optional.empty();
    }
}
public boolean hasSample(String domainName) { public boolean hasSample(String domainName) {
try { try {
return channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::hasSample) return channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::hasSample)

View File

@@ -7,6 +7,7 @@ option java_multiple_files=true;
service DomSampleApi { service DomSampleApi {
rpc getSample(RpcDomainName) returns (RpcDomainSample) {} rpc getSample(RpcDomainName) returns (RpcDomainSample) {}
rpc getSampleRequests(RpcDomainName) returns (RpcDomainSampleRequests) {}
rpc hasSample(RpcDomainName) returns (RpcBooleanRsp) {} rpc hasSample(RpcDomainName) returns (RpcBooleanRsp) {}
rpc getAllSamples(RpcDomainName) returns (stream RpcDomainSample) {} rpc getAllSamples(RpcDomainName) returns (stream RpcDomainSample) {}
} }
@@ -19,10 +20,16 @@ message RpcBooleanRsp {
bool answer = 1; bool answer = 1;
} }
// Response of the getSampleRequests rpc: carries the domain name, url and outgoing requests
// of a sample, but no HTML sample data and no accepted_popover flag.
message RpcDomainSampleRequests {
string domainName = 1;
string url = 2;
// Field numbers 3 and 4 are not used in this message; outgoingRequests has the
// same field number (5) as the outgoingRequests field of RpcDomainSample.
repeated RpcOutgoingRequest outgoingRequests = 5;
}
message RpcDomainSample { message RpcDomainSample {
string domainName = 1; string domainName = 1;
string url = 2; string url = 2;
string htmlSample = 3; bytes htmlSampleZstd = 3;
bool accepted_popover = 4; bool accepted_popover = 4;
repeated RpcOutgoingRequest outgoingRequests = 5; repeated RpcOutgoingRequest outgoingRequests = 5;
} }

View File

@@ -31,6 +31,7 @@ dependencies {
implementation libs.jsoup implementation libs.jsoup
implementation libs.opencsv implementation libs.opencsv
implementation libs.slop implementation libs.slop
implementation libs.zstd
implementation libs.sqlite implementation libs.sqlite
implementation libs.bundles.slf4j implementation libs.bundles.slf4j
implementation libs.commons.lang3 implementation libs.commons.lang3

View File

@@ -1,6 +1,8 @@
package nu.marginalia.domsample; package nu.marginalia.domsample;
import com.github.luben.zstd.Zstd;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.protobuf.ByteString;
import io.grpc.Status; import io.grpc.Status;
import io.grpc.stub.StreamObserver; import io.grpc.stub.StreamObserver;
import nu.marginalia.api.domsample.*; import nu.marginalia.api.domsample.*;
@@ -9,6 +11,7 @@ import nu.marginalia.service.server.DiscoverableService;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.nio.charset.StandardCharsets;
import java.util.List; import java.util.List;
public class DomSampleGrpcService public class DomSampleGrpcService
@@ -42,7 +45,36 @@ public class DomSampleGrpcService
} }
// Grab the first sample // Grab the first sample
RpcDomainSample.Builder response = convert(dbRecords.getFirst()); RpcDomainSample.Builder response = convertFullSample(dbRecords.getFirst());
responseObserver.onNext(response.build());
responseObserver.onCompleted();
}
catch (Exception e) {
logger.error("Error in getSample()", e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@Override
public void getSampleRequests(RpcDomainName request, StreamObserver<RpcDomainSampleRequests> responseObserver) {
String domainName = request.getDomainName();
if (domainName.isBlank()) {
responseObserver.onError(Status.INVALID_ARGUMENT
.withDescription("Invalid domain name")
.asRuntimeException());
return;
}
try {
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
if (dbRecords.isEmpty()) {
responseObserver.onError(Status.NOT_FOUND.withDescription("No sample found").asRuntimeException());
return;
}
// Grab the first sample
RpcDomainSampleRequests.Builder response = convertRequestData(dbRecords.getFirst());
responseObserver.onNext(response.build()); responseObserver.onNext(response.build());
responseObserver.onCompleted(); responseObserver.onCompleted();
@@ -87,7 +119,7 @@ public class DomSampleGrpcService
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName); List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
for (var record : dbRecords) { for (var record : dbRecords) {
responseObserver.onNext(convert(record).build()); responseObserver.onNext(convertFullSample(record).build());
} }
responseObserver.onCompleted(); responseObserver.onCompleted();
@@ -98,12 +130,14 @@ public class DomSampleGrpcService
} }
} }
private RpcDomainSample.Builder convert(DomSampleDb.Sample dbSample) { private RpcDomainSample.Builder convertFullSample(DomSampleDb.Sample dbSample) {
ByteString htmlZstd = ByteString.copyFrom(Zstd.compress(dbSample.sample().getBytes(StandardCharsets.UTF_8)));
var sampleBuilder = RpcDomainSample.newBuilder() var sampleBuilder = RpcDomainSample.newBuilder()
.setDomainName(dbSample.domain()) .setDomainName(dbSample.domain())
.setAcceptedPopover(dbSample.acceptedPopover()) .setAcceptedPopover(dbSample.acceptedPopover())
.setHtmlSample(dbSample.sample()); .setHtmlSampleZstd(htmlZstd);
for (var req : dbSample.parseRequests()) { for (var req : dbSample.parseRequests()) {
sampleBuilder.addOutgoingRequestsBuilder() sampleBuilder.addOutgoingRequestsBuilder()
@@ -120,4 +154,23 @@ public class DomSampleGrpcService
return sampleBuilder; return sampleBuilder;
} }
/**
 * Builds the requests-only response for a stored sample: the domain name plus one
 * outgoing-request entry (url, method, timestamp) per request parsed from the sample.
 * HTTP methods other than GET and POST (compared after upper-casing) are mapped to OTHER.
 *
 * @param dbSample the database sample to convert
 * @return a builder populated with the domain name and the outgoing requests
 */
private RpcDomainSampleRequests.Builder convertRequestData(DomSampleDb.Sample dbSample) {
    RpcDomainSampleRequests.Builder response = RpcDomainSampleRequests.newBuilder();
    response.setDomainName(dbSample.domain());

    for (var outgoing : dbSample.parseRequests()) {
        var entry = response.addOutgoingRequestsBuilder();

        entry.setUrl(outgoing.uri().toString());

        RpcOutgoingRequest.RequestMethod rpcMethod = switch (outgoing.method().toUpperCase()) {
            case "GET" -> RpcOutgoingRequest.RequestMethod.GET;
            case "POST" -> RpcOutgoingRequest.RequestMethod.POST;
            default -> RpcOutgoingRequest.RequestMethod.OTHER;
        };
        entry.setMethod(rpcMethod);

        entry.setTimestamp(outgoing.timestamp());
    }

    return response;
}
} }

View File

@@ -22,6 +22,7 @@ dependencies {
implementation libs.bundles.slf4j implementation libs.bundles.slf4j
implementation libs.guava implementation libs.guava
implementation libs.zstd
implementation dependencies.create(libs.guice.get()) { implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava' exclude group: 'com.google.guava'
} }

View File

@@ -1,5 +1,6 @@
package nu.marginalia.domclassifier; package nu.marginalia.domclassifier;
import com.github.luben.zstd.ZstdInputStream;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import nu.marginalia.api.domsample.RpcDomainSample; import nu.marginalia.api.domsample.RpcDomainSample;
@@ -19,6 +20,7 @@ import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.*; import java.util.*;
import java.util.function.Predicate; import java.util.function.Predicate;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@@ -84,8 +86,9 @@ public class DomSampleClassifier {
EdgeDomain sampleDomain = new EdgeDomain(sample.getDomainName()); EdgeDomain sampleDomain = new EdgeDomain(sample.getDomainName());
try { try (var compressedStream = new ZstdInputStream(sample.getHtmlSampleZstd().newInput())) {
var parsedDoc = Jsoup.parse(sample.getHtmlSample()); String html = new String(compressedStream.readAllBytes(), StandardCharsets.UTF_8);
var parsedDoc = Jsoup.parse(html);
var fixedElements = parsedDoc.select("*[data-position=fixed]"); var fixedElements = parsedDoc.select("*[data-position=fixed]");
if (sample.getAcceptedPopover()) { if (sample.getAcceptedPopover()) {
@@ -104,7 +107,7 @@ public class DomSampleClassifier {
} }
} }
catch (Exception ex) { catch (Exception ex) {
logger.warn("Error when parsing DOM HTML sample"); logger.warn("Error when parsing DOM HTML sample", ex);
} }
// Classify outgoing requests // Classify outgoing requests

View File

@@ -102,7 +102,8 @@ public class DomainProcessor {
.thenApply(domSampleClassifier::classifySample) .thenApply(domSampleClassifier::classifySample)
.handle((a,b) -> { .handle((a,b) -> {
if (b != null) { if (b != null) {
if (!(b instanceof StatusRuntimeException sre && sre.getStatus() != Status.NOT_FOUND)) { var cause = b.getCause();
if (!(cause instanceof StatusRuntimeException sre && sre.getStatus() != Status.NOT_FOUND)) {
logger.warn("Exception when fetching sample data", b); logger.warn("Exception when fetching sample data", b);
} }
return EnumSet.of(DomSampleClassification.UNCLASSIFIED); return EnumSet.of(DomSampleClassification.UNCLASSIFIED);

View File

@@ -11,7 +11,7 @@ import nu.marginalia.api.domains.DomainInfoClient;
import nu.marginalia.api.domains.model.DomainInformation; import nu.marginalia.api.domains.model.DomainInformation;
import nu.marginalia.api.domains.model.SimilarDomain; import nu.marginalia.api.domains.model.SimilarDomain;
import nu.marginalia.api.domsample.DomSampleClient; import nu.marginalia.api.domsample.DomSampleClient;
import nu.marginalia.api.domsample.RpcDomainSample; import nu.marginalia.api.domsample.RpcDomainSampleRequests;
import nu.marginalia.api.domsample.RpcOutgoingRequest; import nu.marginalia.api.domsample.RpcOutgoingRequest;
import nu.marginalia.api.feeds.FeedsClient; import nu.marginalia.api.feeds.FeedsClient;
import nu.marginalia.api.feeds.RpcFeed; import nu.marginalia.api.feeds.RpcFeed;
@@ -399,7 +399,7 @@ public class SearchSiteInfoService {
return forServiceUnavailable(domainName); return forServiceUnavailable(domainName);
} }
Optional<RpcDomainSample> sample = domSampleClient.getSample(domainName.toLowerCase()); Optional<RpcDomainSampleRequests> sample = domSampleClient.getSampleRequests(domainName.toLowerCase());
if (sample.isEmpty()) { if (sample.isEmpty()) {
return forNoData(domainName); return forNoData(domainName);
} }

View File

@@ -38,8 +38,8 @@
<a href="https://old-search.marginalia.nu/" class="underline text-liteblue dark:text-blue-200">here</a>. <a href="https://old-search.marginalia.nu/" class="underline text-liteblue dark:text-blue-200">here</a>.
</div> </div>
</div> </div>
<div class="mx-auto flex flex-col sm:flex-row my-4 sm:space-x-2 space-y-2 sm:space-y-0 w-full md:w-auto px-2 items-center sm:items-stretch"> <div class="mx-auto px-8 flex flex-col sm:flex-row my-4 sm:space-x-2 space-y-2 sm:space-y-0 w-full md:w-auto items-center sm:items-stretch">
<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-8 sm:p-4 space-y-3 w-96 sm:w-64"> <div class="flex flex-col items-center border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-8 sm:p-4 space-y-3 w-[300px] sm:w-64">
<div><i class="fas fa-sailboat mx-2 text-margeblue dark:text-slate-200"></i>Explore the Web</div> <div><i class="fas fa-sailboat mx-2 text-margeblue dark:text-slate-200"></i>Explore the Web</div>
<ul class="list-disc ml-8 sm:ml-6 text-slate-700 dark:text-white text-xs leading-5"> <ul class="list-disc ml-8 sm:ml-6 text-slate-700 dark:text-white text-xs leading-5">
<li>Prioritizes non-commercial content</li> <li>Prioritizes non-commercial content</li>
@@ -48,14 +48,14 @@
</ul> </ul>
</div> </div>
<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-8 sm:p-4 space-y-3 w-96 sm:w-64"> <div class="flex flex-col items-center border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-8 sm:p-4 space-y-3 w-[300px] sm:w-64">
<div><i class="fas fa-hand-holding-hand mx-2 text-margeblue dark:text-slate-200"></i>Open Source</div> <div><i class="fas fa-hand-holding-hand mx-2 text-margeblue dark:text-slate-200"></i>Open Source</div>
<ul class="list-disc ml-8 sm:ml-6 text-slate-700 dark:text-white text-xs leading-5"> <ul class="list-disc ml-8 sm:ml-6 text-slate-700 dark:text-white text-xs leading-5">
<li>Custom index and crawler software</li> <li>Custom index and crawler software</li>
<li>Simple technology, no AI</li> <li>Simple technology, no AI</li>
<li>AGPL license</li> <li>AGPL license</li>
</ul> </ul>
<div class="flex pt-4 gap-2"> <div class="flex pt-4 gap-2 flex-col md:flex-row">
<div class="text-xs text-liteblue dark:text-blue-200"> <div class="text-xs text-liteblue dark:text-blue-200">
<i class="fa-brands fa-github"></i> <i class="fa-brands fa-github"></i>
<a href="https://git.marginalia.nu/" class="underline">Git Repository</a> <a href="https://git.marginalia.nu/" class="underline">Git Repository</a>
@@ -67,7 +67,7 @@
</div> </div>
</div> </div>
<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-8 sm:p-4 space-y-3 w-96 sm:w-64"> <div class="flex flex-col items-center border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-8 sm:p-4 space-y-3 w-[300px] sm:w-64">
<div><i class="fas fa-lock mx-2 text-margeblue dark:text-slate-200"></i> Privacy by default</div> <div><i class="fas fa-lock mx-2 text-margeblue dark:text-slate-200"></i> Privacy by default</div>
<ul class="list-disc ml-8 sm:ml-6 text-slate-700 dark:text-white text-xs leading-5"> <ul class="list-disc ml-8 sm:ml-6 text-slate-700 dark:text-white text-xs leading-5">
<li>Filter out tracking </li> <li>Filter out tracking </li>

View File

@@ -11,4 +11,5 @@
2025-05-17: Redeploy all. 2025-05-17: Redeploy all.
2025-05-28: Deploy assistant and browserless. 2025-05-28: Deploy assistant and browserless.
2025-06-06: Deploy assistant and browserless. 2025-06-06: Deploy assistant and browserless.
2025-07-21: Deploy executor partition 1. 2025-07-21: Deploy executor partition 1.
2025-07-21: Deploy search.