1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

(deps) Upgrade crawler commons to fix robots.txt-parser bug

This commit is contained in:
Viktor Lofgren
2025-08-15 00:11:44 +02:00
parent 2fd2710355
commit 291ff0c4de
2 changed files with 3 additions and 3 deletions

View File

@@ -1,11 +1,11 @@
package nu.marginalia.converting.processor.logic;
-import crawlercommons.utils.Strings;
import nu.marginalia.converting.model.DisqualifiedException;
import nu.marginalia.domclassifier.DomSampleClassification;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawldata.CrawledDocument;
+import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.NotNull;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@@ -169,7 +169,7 @@ public class DocumentValuator {
if (srcAttr.contains("wp-content") || srcAttr.contains("wp-includes") || srcAttr.contains("jquery")) {
penalty += 0.49;
-} else if (!Strings.isBlank(srcAttr)) {
+} else if (!StringUtils.isBlank(srcAttr)) {
penalty += 1;
} else {
var wt = el.wholeText();

View File

@@ -198,7 +198,7 @@ dependencyResolutionManagement {
library('ffi','com.github.jnr','jnr-ffi').version('2.2.12')
library('databind','com.fasterxml.jackson.core','jackson-databind').version('2.13.5')
-library('crawlercommons', 'com.github.crawler-commons', 'crawler-commons').version('1.3')
+library('crawlercommons', 'com.github.crawler-commons', 'crawler-commons').version('1.5')
library('stanford.corenlp','edu.stanford.nlp','stanford-corenlp').version('4.5.5')
library('opennlp','org.apache.opennlp','opennlp-tools').version('2.3.3')