1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

7 Commits

Author SHA1 Message Date
Viktor Lofgren
23c818281b (converter) Reduce DomSample logging for NOT_FOUND 2025-07-21 13:37:55 +02:00
Viktor Lofgren
8aad253cf6 (converter) Add more logging around dom sample data retrieval errors 2025-07-21 13:26:38 +02:00
Viktor Lofgren
556d7af9dc Reapply "(grpc) Use grpc-netty instead of grpc-netty-shaded"
This reverts commit b7a5219ed3.
2025-07-21 13:23:32 +02:00
Viktor Lofgren
b7a5219ed3 Revert "(grpc) Use grpc-netty instead of grpc-netty-shaded"
Reverting this change to see if it's the cause of some instability issues observed.
2025-07-21 13:10:41 +02:00
Viktor Lofgren
a23ec521fe (converter) Ensure features is mutable on DetailsWithWords as this is assumed later 2025-07-21 12:50:04 +02:00
Viktor Lofgren
fff3babc6d (classifier) Add rule for */pixel.gif as likely tracking pixels 2025-07-21 12:35:57 +02:00
Viktor Lofgren
b2bfb8217c (special) Trigger CD run 2025-07-21 12:28:24 +02:00
7 changed files with 19 additions and 8 deletions

View File

@@ -41,7 +41,7 @@ public class DomSampleClient {
} }
catch (StatusRuntimeException sre) { catch (StatusRuntimeException sre) {
if (sre.getStatus() != Status.NOT_FOUND) { if (sre.getStatus() != Status.NOT_FOUND) {
logger.error("Failed to fetch DOM sample"); logger.error("Failed to fetch DOM sample", sre);
} }
return Optional.empty(); return Optional.empty();
} }

View File

@@ -90,6 +90,7 @@ dependencies {
implementation libs.commons.lang3 implementation libs.commons.lang3
implementation libs.commons.compress implementation libs.commons.compress
implementation libs.sqlite implementation libs.sqlite
implementation libs.bundles.grpc
implementation libs.bundles.httpcomponents implementation libs.bundles.httpcomponents

View File

@@ -15,6 +15,7 @@
<classifier target="url-regex" rule="tracking">/ccm/collect$</classifier> <classifier target="url-regex" rule="tracking">/ccm/collect$</classifier>
<classifier target="url-regex" rule="tracking">^/[0-9]+\.js$</classifier> <classifier target="url-regex" rule="tracking">^/[0-9]+\.js$</classifier>
<classifier target="url-regex" rule="tracking">^/[a-z0-9]\.gif$</classifier> <classifier target="url-regex" rule="tracking">^/[a-z0-9]\.gif$</classifier>
<classifier target="url-regex" rule="tracking">^/pixel\.gif$</classifier>
<classifier target="url-regex" rule="ads">/pagead/</classifier> <classifier target="url-regex" rule="ads">/pagead/</classifier>
<classifier target="url-regex" rule="ads">/google-ads/</classifier> <classifier target="url-regex" rule="ads">/google-ads/</classifier>

View File

@@ -1,6 +1,8 @@
package nu.marginalia.converting.processor; package nu.marginalia.converting.processor;
import com.google.inject.Inject; import com.google.inject.Inject;
import io.grpc.Status;
import io.grpc.StatusRuntimeException;
import nu.marginalia.api.domsample.DomSampleClient; import nu.marginalia.api.domsample.DomSampleClient;
import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.atags.source.AnchorTagsSource; import nu.marginalia.atags.source.AnchorTagsSource;
@@ -98,10 +100,15 @@ public class DomainProcessor {
return domSampleClient return domSampleClient
.getSampleAsync(domainName, domSampleExecutor) .getSampleAsync(domainName, domSampleExecutor)
.thenApply(domSampleClassifier::classifySample) .thenApply(domSampleClassifier::classifySample)
.handle((a,b) -> .handle((a,b) -> {
Objects.requireNonNullElseGet(a, if (b != null) {
() -> EnumSet.of(DomSampleClassification.UNCLASSIFIED))) if (!(b instanceof StatusRuntimeException sre && sre.getStatus() != Status.NOT_FOUND)) {
.get(); logger.warn("Exception when fetching sample data", b);
}
return EnumSet.of(DomSampleClassification.UNCLASSIFIED);
}
return a;
}).get();
} }
@Nullable @Nullable

View File

@@ -161,7 +161,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, documentHeaders, dld); final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, documentHeaders, dld);
if (!documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier())) { if (!documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier())) {
features.add(HtmlFeature.SHORT_DOCUMENT); features.add(HtmlFeature.SHORT_DOCUMENT);
} }

View File

@@ -115,7 +115,9 @@ public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
ret.quality = -5; ret.quality = -5;
ret.features = Set.of(HtmlFeature.PDF); ret.features = new HashSet<>(); // must be mutable!
ret.features.add(HtmlFeature.PDF);
ret.description = getDescription(doc); ret.description = getDescription(doc);
ret.hashCode = dld.localitySensitiveHashCode(); ret.hashCode = dld.localitySensitiveHashCode();

View File

@@ -10,4 +10,5 @@
2025-05-08: Deploy assistant. 2025-05-08: Deploy assistant.
2025-05-17: Redeploy all. 2025-05-17: Redeploy all.
2025-05-28: Deploy assistant and browserless. 2025-05-28: Deploy assistant and browserless.
2025-06-06: Deploy assistant and browserless. 2025-06-06: Deploy assistant and browserless.
2025-07-21: Deploy executor partition 1.