mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
(deps) Upgrade crawler commons to fix robots.txt-parser bug
This commit is contained in:
@@ -1,11 +1,11 @@
|
||||
package nu.marginalia.converting.processor.logic;
|
||||
|
||||
import crawlercommons.utils.Strings;
|
||||
import nu.marginalia.converting.model.DisqualifiedException;
|
||||
import nu.marginalia.domclassifier.DomSampleClassification;
|
||||
import nu.marginalia.model.DocumentFormat;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
@@ -169,7 +169,7 @@ public class DocumentValuator {
|
||||
|
||||
if (srcAttr.contains("wp-content") || srcAttr.contains("wp-includes") || srcAttr.contains("jquery")) {
|
||||
penalty += 0.49;
|
||||
} else if (!Strings.isBlank(srcAttr)) {
|
||||
} else if (!StringUtils.isBlank(srcAttr)) {
|
||||
penalty += 1;
|
||||
} else {
|
||||
var wt = el.wholeText();
|
||||
|
@@ -198,7 +198,7 @@ dependencyResolutionManagement {
|
||||
library('ffi','com.github.jnr','jnr-ffi').version('2.2.12')
|
||||
library('databind','com.fasterxml.jackson.core','jackson-databind').version('2.13.5')
|
||||
|
||||
library('crawlercommons', 'com.github.crawler-commons', 'crawler-commons').version('1.3')
|
||||
library('crawlercommons', 'com.github.crawler-commons', 'crawler-commons').version('1.5')
|
||||
|
||||
library('stanford.corenlp','edu.stanford.nlp','stanford-corenlp').version('4.5.5')
|
||||
library('opennlp','org.apache.opennlp','opennlp-tools').version('2.3.3')
|
||||
|
Reference in New Issue
Block a user