1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

6 Commits

Author SHA1 Message Date
Viktor Lofgren
8aad253cf6 (converter) Add more logging around dom sample data retrieval errors 2025-07-21 13:26:38 +02:00
Viktor Lofgren
556d7af9dc Reapply "(grpc) Use grpc-netty instead of grpc-netty-shaded"
This reverts commit b7a5219ed3.
2025-07-21 13:23:32 +02:00
Viktor Lofgren
b7a5219ed3 Revert "(grpc) Use grpc-netty instead of grpc-netty-shaded"
Reverting this change to see if it's the cause of some instability issues observed.
2025-07-21 13:10:41 +02:00
Viktor Lofgren
a23ec521fe (converter) Ensure features is mutable on DetailsWithWords as this is assumed later 2025-07-21 12:50:04 +02:00
Viktor Lofgren
fff3babc6d (classifier) Add rule for */pixel.gif as likely tracking pixels 2025-07-21 12:35:57 +02:00
Viktor Lofgren
b2bfb8217c (special) Trigger CD run 2025-07-21 12:28:24 +02:00
6 changed files with 14 additions and 8 deletions

View File

@@ -41,7 +41,7 @@ public class DomSampleClient {
}
catch (StatusRuntimeException sre) {
if (sre.getStatus() != Status.NOT_FOUND) {
logger.error("Failed to fetch DOM sample");
logger.error("Failed to fetch DOM sample", sre);
}
return Optional.empty();
}

View File

@@ -15,6 +15,7 @@
<classifier target="url-regex" rule="tracking">/ccm/collect$</classifier>
<classifier target="url-regex" rule="tracking">^/[0-9]+\.js$</classifier>
<classifier target="url-regex" rule="tracking">^/[a-z0-9]\.gif$</classifier>
<classifier target="url-regex" rule="tracking">^/pixel\.gif$</classifier>
<classifier target="url-regex" rule="ads">/pagead/</classifier>
<classifier target="url-regex" rule="ads">/google-ads/</classifier>

View File

@@ -98,10 +98,13 @@ public class DomainProcessor {
return domSampleClient
.getSampleAsync(domainName, domSampleExecutor)
.thenApply(domSampleClassifier::classifySample)
.handle((a,b) ->
Objects.requireNonNullElseGet(a,
() -> EnumSet.of(DomSampleClassification.UNCLASSIFIED)))
.get();
.handle((a,b) -> {
if (b != null) {
logger.warn("Exception when fetching sample data", b);
return EnumSet.of(DomSampleClassification.UNCLASSIFIED);
}
return a;
}).get();
}
@Nullable

View File

@@ -161,7 +161,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, documentHeaders, dld);
if (!documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier())) {
features.add(HtmlFeature.SHORT_DOCUMENT);
}

View File

@@ -115,7 +115,9 @@ public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
ret.quality = -5;
ret.features = Set.of(HtmlFeature.PDF);
ret.features = new HashSet<>(); // must be mutable!
ret.features.add(HtmlFeature.PDF);
ret.description = getDescription(doc);
ret.hashCode = dld.localitySensitiveHashCode();

View File

@@ -10,4 +10,5 @@
2025-05-08: Deploy assistant.
2025-05-17: Redeploy all.
2025-05-28: Deploy assistant and browserless.
2025-06-06: Deploy assistant and browserless.
2025-06-06: Deploy assistant and browserless.
2025-07-21: Deploy executor partition 1.