mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
6 Commits
deploy-028
...
deploy-029
| Author | SHA1 | Date |
|---|---|---|
| | 8aad253cf6 | |
| | 556d7af9dc | |
| | b7a5219ed3 | |
| | a23ec521fe | |
| | fff3babc6d | |
| | b2bfb8217c | |
@@ -41,7 +41,7 @@ public class DomSampleClient {
|
||||
}
|
||||
catch (StatusRuntimeException sre) {
|
||||
if (sre.getStatus() != Status.NOT_FOUND) {
|
||||
logger.error("Failed to fetch DOM sample");
|
||||
logger.error("Failed to fetch DOM sample", sre);
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
|
@@ -15,6 +15,7 @@
|
||||
<classifier target="url-regex" rule="tracking">/ccm/collect$</classifier>
|
||||
<classifier target="url-regex" rule="tracking">^/[0-9]+\.js$</classifier>
|
||||
<classifier target="url-regex" rule="tracking">^/[a-z0-9]\.gif$</classifier>
|
||||
<classifier target="url-regex" rule="tracking">^/pixel\.gif$</classifier>
|
||||
<classifier target="url-regex" rule="ads">/pagead/</classifier>
|
||||
<classifier target="url-regex" rule="ads">/google-ads/</classifier>
|
||||
|
||||
|
@@ -98,10 +98,13 @@ public class DomainProcessor {
|
||||
return domSampleClient
|
||||
.getSampleAsync(domainName, domSampleExecutor)
|
||||
.thenApply(domSampleClassifier::classifySample)
|
||||
.handle((a,b) ->
|
||||
Objects.requireNonNullElseGet(a,
|
||||
() -> EnumSet.of(DomSampleClassification.UNCLASSIFIED)))
|
||||
.get();
|
||||
.handle((a,b) -> {
|
||||
if (b != null) {
|
||||
logger.warn("Exception when fetching sample data", b);
|
||||
return EnumSet.of(DomSampleClassification.UNCLASSIFIED);
|
||||
}
|
||||
return a;
|
||||
}).get();
|
||||
}
|
||||
|
||||
@Nullable
|
||||
|
@@ -161,7 +161,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
|
||||
final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, documentHeaders, dld);
|
||||
|
||||
|
||||
if (!documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier())) {
|
||||
features.add(HtmlFeature.SHORT_DOCUMENT);
|
||||
}
|
||||
|
@@ -115,7 +115,9 @@ public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
|
||||
ret.quality = -5;
|
||||
|
||||
ret.features = Set.of(HtmlFeature.PDF);
|
||||
ret.features = new HashSet<>(); // must be mutable!
|
||||
ret.features.add(HtmlFeature.PDF);
|
||||
|
||||
ret.description = getDescription(doc);
|
||||
ret.hashCode = dld.localitySensitiveHashCode();
|
||||
|
||||
|
@@ -11,3 +11,4 @@
|
||||
2025-05-17: Redeploy all.
|
||||
2025-05-28: Deploy assistant and browserless.
|
||||
2025-06-06: Deploy assistant and browserless.
|
||||
2025-07-21: Deploy executor partition 1.
|
Reference in New Issue
Block a user