Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
Synced 2025-10-06 07:32:38 +02:00

Comparing deploy-016...deploy-018 (46 commits)
Commits:
a7a3d85be9, 306232fb54, 5aef844f0d, d56b5c828a, ab58a4636f, 00be269238,
879e6a9424, fba3455732, 14283da7f5, 93df4d1fc0, b12a0b998c, 3b6f4e321b,
8428111771, e9fd4415ef, 4c95c3dcad, c5281536fb, 4431dae7ac, 4df4d0a7a8,
9f05083b94, fc92e9b9c0, 328fb5d927, 36889950e8, c96a94878b, 1c57d7d73a,
a443d22356, aa59d4afa4, df0f18d0e7, 0819d46f97, 5e2b63473e, f9590703f1,
f12fc11337, c309030184, fd5af01629, d4c43c7a79, 18700e1919, 120b431998,
71dad99326, c1e8afdf86, fa32dddc24, a266fcbf30, 6e47e58e0e, 9dc43d8b4a,
83967e3305, 4db980a291, 089b177868, 9c8e9a68d5
DocumentFormat.java (new file; all lines added):
@@ -0,0 +1,24 @@
package nu.marginalia.model;

public enum DocumentFormat {
    PLAIN(0, 1, "text"),
    PDF(0, 1, "pdf"),
    UNKNOWN(0, 1, "???"),
    HTML123(0, 1, "html"),
    HTML4(-0.1, 1.05, "html"),
    XHTML(-0.1, 1.05, "html"),
    HTML5(0.5, 1.1, "html");

    /** Used to tune quality score */
    public final double offset;
    /** Used to tune quality score */
    public final double scale;
    public final String shortFormat;

    DocumentFormat(double offset, double scale, String shortFormat) {
        this.offset = offset;
        this.scale = scale;
        this.shortFormat = shortFormat;
    }
}
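The offset and scale fields bias the ranking-time quality score per format, and shortFormat is the name exposed to search results. The actual quality formula lives in DocumentValuator rather than in this diff, so the linear combination below is only a plausible sketch of how the two knobs could be applied:

    // Hypothetical illustration; the real logic is in DocumentValuator,
    // which this diff does not show.
    static double biasQuality(double rawQuality, DocumentFormat format) {
        return format.scale * rawQuality + format.offset;
    }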
HtmlFeature.java:
@@ -28,6 +28,8 @@ public enum HtmlFeature {
 
         GA_SPAM("special:gaspam"),
 
+        PDF("format:pdf"),
+
         /** For fingerprinting and ranking */
         OPENGRAPH("special:opengraph"),
         OPENGRAPH_IMAGE("special:opengraph:image"),
HtmlStandard.java (deleted; superseded by DocumentFormat):
@@ -1,22 +0,0 @@
-package nu.marginalia.model.html;
-
-// This class really doesn't belong anywhere, but will squat here for now
-public enum HtmlStandard {
-    PLAIN(0, 1),
-    UNKNOWN(0, 1),
-    HTML123(0, 1),
-    HTML4(-0.1, 1.05),
-    XHTML(-0.1, 1.05),
-    HTML5(0.5, 1.1);
-
-    /** Used to tune quality score */
-    public final double offset;
-    /** Used to tune quality score */
-    public final double scale;
-
-    HtmlStandard(double offset, double scale) {
-        this.offset = offset;
-        this.scale = scale;
-    }
-
-}
DocumentFlags.java:
@@ -9,7 +9,7 @@ public enum DocumentFlags {
     GeneratorForum,
     GeneratorWiki,
     Sideloaded,
-    Unused7,
+    PdfFile,
     Unused8,
     ;
 
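Note that Unused7 is renamed to PdfFile rather than a new constant being appended: the Unused placeholders suggest each flag is persisted by its ordinal bit position, so renaming in place keeps the stored layout stable. A sketch of that encoding, under that assumption:

    // Assumption: each flag occupies bit (1 << ordinal) in the stored document
    // metadata, which would explain the Unused placeholder names.
    static int encodeFlags(java.util.EnumSet<DocumentFlags> flags) {
        int bits = 0;
        for (DocumentFlags flag : flags) {
            bits |= 1 << flag.ordinal();
        }
        return bits;
    }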
JoobyService.java:
@@ -122,6 +122,11 @@ public class JoobyService {
         // single digit percentage difference since HTML already compresses very well with level = 1.
         options.setCompressionLevel(1);
 
+        // Set a cap on the number of worker threads, as Jooby's default value does not seem to consider
+        // multi-tenant servers with high thread counts, and spins up an exorbitant number of threads in that
+        // scenario
+        options.setWorkerThreads(Math.min(128, options.getWorkerThreads()));
+
         jooby.setServerOptions(options);
 
A log4j2 configuration (JSON file logging; file name not preserved):
@@ -3,11 +3,18 @@
         <Console name="Console" target="SYSTEM_OUT">
             <PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
             <Filters>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
                 <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
                 <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
                 <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
             </Filters>
         </Console>
+        <Console name="ProcessConsole" target="SYSTEM_OUT">
+            <PatternLayout pattern="%style{P}{FG_Cyan} %msg%n"/>
+            <Filters>
+                <MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
+            </Filters>
+        </Console>
         <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
                      ignoreExceptions="false">
             <JSONLayout compact="true" eventEol="true" properties="true" stacktraceAsString="true" includeTimeMillis="true"/>
@@ -15,6 +22,7 @@
                 <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
                 <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
                 <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
             </Filters>
             <SizeBasedTriggeringPolicy size="10MB" />
         </RollingFile>
@@ -31,9 +39,11 @@
     </Appenders>
     <Loggers>
         <Logger name="org.apache.zookeeper" level="WARN" />
+        <Logger name="org.apache.pdfbox" level="ERROR" />
+        <Logger name="org.apache.fontbox.ttf" level="ERROR" />
         <Root level="info">
             <AppenderRef ref="Console"/>
+            <AppenderRef ref="ProcessConsole"/>
             <AppenderRef ref="LogToFile"/>
         </Root>
     </Loggers>
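The new ProcessConsole appender accepts only messages carrying the PROCESS marker, which the other appenders now deny, giving subprocess output its own compact cyan-prefixed channel. A sketch of how a caller would route a line there with the stock log4j2 marker API (class and message are illustrative):

    import org.apache.logging.log4j.LogManager;
    import org.apache.logging.log4j.Logger;
    import org.apache.logging.log4j.Marker;
    import org.apache.logging.log4j.MarkerManager;

    class ProcessLogExample {
        private static final Logger logger = LogManager.getLogger(ProcessLogExample.class);
        private static final Marker PROCESS = MarkerManager.getMarker("PROCESS");

        void report(String line) {
            // Accepted by ProcessConsole's MarkerFilter; denied by Console and LogToFile.
            logger.info(PROCESS, line);
        }
    }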
Another log4j2 configuration (plain-text file logging; file name not preserved):
@@ -1,13 +1,51 @@
 <Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
     <Appenders>
-        <Console name="Console" target="SYSTEM_OUT">
-            <PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
+        <Console name="ConsoleInfo" target="SYSTEM_OUT">
+            <PatternLayout pattern="- %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
             <Filters>
+                <LevelMatchFilter level="INFO" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
                 <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
                 <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
                 <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
             </Filters>
         </Console>
+        <Console name="ConsoleWarn" target="SYSTEM_OUT">
+            <PatternLayout pattern="⚠ %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="WARN" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleError" target="SYSTEM_OUT">
+            <PatternLayout pattern="🔥 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="ERROR" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleFatal" target="SYSTEM_OUT">
+            <PatternLayout pattern="💀 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="FATAL" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ProcessConsole" target="SYSTEM_OUT">
+            <PatternLayout pattern="%style{%msg%n}{FG_Cyan}"/>
+            <Filters>
+                <MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
+            </Filters>
+        </Console>
         <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
                      ignoreExceptions="false">
             <PatternLayout>
@@ -34,9 +72,14 @@
     </Appenders>
     <Loggers>
         <Logger name="org.apache.zookeeper" level="WARN" />
+        <Logger name="org.apache.pdfbox" level="ERROR" />
+        <Logger name="org.apache.fontbox.ttf" level="ERROR" />
         <Root level="info">
-            <AppenderRef ref="Console"/>
+            <AppenderRef ref="ConsoleInfo"/>
+            <AppenderRef ref="ConsoleWarn"/>
+            <AppenderRef ref="ConsoleError"/>
+            <AppenderRef ref="ConsoleFatal"/>
+            <AppenderRef ref="ProcessConsole"/>
             <AppenderRef ref="LogToFile"/>
         </Root>
     </Loggers>
A third log4j2 configuration (console appenders only; file name not preserved):
@@ -1,15 +1,50 @@
 <Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
     <Appenders>
-        <Console name="Console" target="SYSTEM_OUT">
-            <PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
+        <Console name="ConsoleInfo" target="SYSTEM_OUT">
+            <PatternLayout pattern="- %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="INFO" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleWarn" target="SYSTEM_OUT">
+            <PatternLayout pattern="⚠ %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="WARN" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleError" target="SYSTEM_OUT">
+            <PatternLayout pattern="🔥 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="ERROR" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleFatal" target="SYSTEM_OUT">
+            <PatternLayout pattern="💀 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="FATAL" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ProcessConsole" target="SYSTEM_OUT">
+            <PatternLayout pattern="%style{%msg%n}{FG_Cyan}"/>
+            <Filters>
+                <MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
+            </Filters>
         </Console>
     </Appenders>
     <Loggers>
         <Logger name="org.apache.zookeeper" level="WARN" />
+        <Logger name="org.apache.pdfbox" level="ERROR" />
+        <Logger name="org.apache.fontbox.ttf" level="ERROR" />
         <Root level="info">
-            <AppenderRef ref="Console"/>
-            <AppenderRef ref="LogToFile"/>
+            <AppenderRef ref="ConsoleInfo"/>
+            <AppenderRef ref="ConsoleWarn"/>
+            <AppenderRef ref="ConsoleError"/>
+            <AppenderRef ref="ConsoleFatal"/>
+            <AppenderRef ref="ProcessConsole"/>
         </Root>
     </Loggers>
 </Configuration>
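Splitting the console into per-level appenders gated by LevelMatchFilter trades the single %highlight pattern for a distinct prefix per severity. Roughly what this produces (timestamps and logger names illustrative):

    // logger.info("starting up");    ->  - 12:00:01,123 MainService  -- starting up
    // logger.warn("disk is slow");   ->  ⚠ 12:00:02,456 MainService  -- disk is slow
    // logger.error("fetch failed");  ->  🔥 12:00:03,789 MainService  -- fetch failed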
SimpleFeedParser.java:
@@ -79,9 +79,17 @@ public class SimpleFeedParser {
                 if (!link.isBlank())
                     break;
                 var tag = element.getElementsByTag(attr).first();
+
                 if (tag != null) {
-                    link = tag.text();
+                    String linkText = tag.text();
+
+                    if (linkText.isBlank()) {
+                        linkText = tag.attr("href");
+                    }
+
+                    link = linkText;
                 }
             }
 
             ret.add(new ItemData(title, description, link, pubDate));
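The fallback covers the two common shapes of a feed link element: RSS-style feeds carry the URL as element text, while Atom-style feeds use an empty <link> element with an href attribute, so tag.text() comes back blank. A small Jsoup illustration of both cases:

    import org.jsoup.Jsoup;
    import org.jsoup.parser.Parser;

    class FeedLinkExample {
        public static void main(String[] args) {
            // RSS style: the URL is the element's text content.
            var rss = Jsoup.parse("<item><link>https://example.com/post</link></item>",
                    "", Parser.xmlParser());
            // Atom style: text() is blank; the URL lives in the href attribute.
            var atom = Jsoup.parse("<entry><link href=\"https://example.com/post\"/></entry>",
                    "", Parser.xmlParser());

            System.out.println(rss.getElementsByTag("link").first().text());        // https://example.com/post
            System.out.println(atom.getElementsByTag("link").first().text());       // (blank)
            System.out.println(atom.getElementsByTag("link").first().attr("href")); // https://example.com/post
        }
    }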
DecoratedSearchResultItem.java:
@@ -1,6 +1,7 @@
 package nu.marginalia.api.searchquery.model.results;
 
 import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import org.jetbrains.annotations.NotNull;
 
@@ -161,4 +162,14 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResultItem> {
     public String toString() {
         return "DecoratedSearchResultItem(rawIndexResult=" + this.getRawIndexResult() + ", url=" + this.getUrl() + ", title=" + this.getTitle() + ", description=" + this.getDescription() + ", urlQuality=" + this.getUrlQuality() + ", format=" + this.getFormat() + ", features=" + this.getFeatures() + ", pubYear=" + this.getPubYear() + ", dataHash=" + this.getDataHash() + ", wordsTotal=" + this.getWordsTotal() + ", bestPositions=" + this.getBestPositions() + ", rankingScore=" + this.getRankingScore() + ", resultsFromDomain=" + this.getResultsFromDomain() + ", rankingDetails=" + this.getRankingDetails() + ")";
     }
+
+    public String getShortFormat() {
+        try {
+            var df = DocumentFormat.valueOf(format);
+            return df.shortFormat;
+        }
+        catch (IllegalArgumentException e) {
+            return DocumentFormat.UNKNOWN.shortFormat;
+        }
+    }
 }
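getShortFormat maps the stored format name back through the enum and collapses anything unrecognized to UNKNOWN's "???" rather than throwing. For example:

    // "HTML5" -> DocumentFormat.HTML5.shortFormat -> "html"
    // "PDF"   -> DocumentFormat.PDF.shortFormat   -> "pdf"
    // "bogus" -> IllegalArgumentException caught  -> "???"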
build.gradle:
@@ -62,6 +62,7 @@ dependencies {
     implementation libs.jwarc
 
     implementation libs.jsoup
+    implementation libs.pdfbox
 
     implementation libs.guava
     implementation dependencies.create(libs.guice.get()) {
ProcessedDocumentDetails.java:
@@ -1,8 +1,8 @@
 package nu.marginalia.converting.model;
 
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.HtmlFeature;
-import nu.marginalia.model.html.HtmlStandard;
 import nu.marginalia.model.idx.DocumentMetadata;
 
 import javax.annotation.Nullable;
@@ -21,7 +21,7 @@ public class ProcessedDocumentDetails {
     public long hashCode;
 
     public Set<HtmlFeature> features;
-    public HtmlStandard standard;
+    public DocumentFormat format;
 
     public List<EdgeUrl> linksInternal;
     public List<EdgeUrl> linksExternal;
@@ -30,6 +30,6 @@ public class ProcessedDocumentDetails {
     public GeneratorType generator;
 
     public String toString() {
-        return "ProcessedDocumentDetails(title=" + this.title + ", description=" + this.description + ", pubYear=" + this.pubYear + ", length=" + this.length + ", quality=" + this.quality + ", hashCode=" + this.hashCode + ", features=" + this.features + ", standard=" + this.standard + ", linksInternal=" + this.linksInternal + ", linksExternal=" + this.linksExternal + ", metadata=" + this.metadata + ", generator=" + this.generator + ")";
+        return "ProcessedDocumentDetails(title=" + this.title + ", description=" + this.description + ", pubYear=" + this.pubYear + ", length=" + this.length + ", quality=" + this.quality + ", hashCode=" + this.hashCode + ", features=" + this.features + ", standard=" + this.format + ", linksInternal=" + this.linksInternal + ", linksExternal=" + this.linksExternal + ", metadata=" + this.metadata + ", generator=" + this.generator + ")";
     }
 }
DocumentProcessor.java:
@@ -7,6 +7,7 @@ import nu.marginalia.converting.model.DisqualifiedException;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.processor.plugin.AbstractDocumentProcessorPlugin;
 import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
+import nu.marginalia.converting.processor.plugin.PdfDocumentProcessorPlugin;
 import nu.marginalia.converting.processor.plugin.PlainTextDocumentProcessorPlugin;
 import nu.marginalia.keyword.LinkTexts;
 import nu.marginalia.model.EdgeDomain;
@@ -33,7 +34,8 @@ public class DocumentProcessor {
     private static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
             "application/xhtml",
             "text/html",
-            "text/plain");
+            "text/plain",
+            "application/pdf");
 
 
     private final List<AbstractDocumentProcessorPlugin> processorPlugins = new ArrayList<>();
@@ -42,12 +44,14 @@ public class DocumentProcessor {
     @Inject
     public DocumentProcessor(HtmlDocumentProcessorPlugin htmlDocumentProcessorPlugin,
                              PlainTextDocumentProcessorPlugin plainTextDocumentProcessorPlugin,
+                             PdfDocumentProcessorPlugin pdfDocumentProcessorPlugin,
                              AnchorTextKeywords anchorTextKeywords)
     {
         this.anchorTextKeywords = anchorTextKeywords;
 
         processorPlugins.add(htmlDocumentProcessorPlugin);
         processorPlugins.add(plainTextDocumentProcessorPlugin);
+        processorPlugins.add(pdfDocumentProcessorPlugin);
     }
 
     public ProcessedDocument process(CrawledDocument crawledDocument,
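With the PDF plugin registered alongside the HTML and plain-text plugins, dispatch presumably remains content-type driven: the first plugin whose isApplicable accepts the document handles it. A sketch of that selection, which this diff implies but does not show:

    // Sketch only; the real selection code in DocumentProcessor is not part
    // of this diff, and the disqualification reason below reuses ERROR because
    // only LANGUAGE, QUALITY and ERROR appear elsewhere in the diff.
    AbstractDocumentProcessorPlugin findPlugin(CrawledDocument doc) throws DisqualifiedException {
        for (var plugin : processorPlugins) {
            if (plugin.isApplicable(doc))
                return plugin;
        }
        throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.ERROR);
    }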
DocumentValuator.java:
@@ -2,9 +2,9 @@ package nu.marginalia.converting.processor.logic;
 
 import crawlercommons.utils.Strings;
 import nu.marginalia.converting.model.DisqualifiedException;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jetbrains.annotations.NotNull;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
@@ -17,7 +17,7 @@ import java.util.Set;
 public class DocumentValuator {
 
     public double getQuality(CrawledDocument crawledDocument,
-                             HtmlStandard htmlStandard,
+                             DocumentFormat htmlStandard,
                              Document parsedDocument,
                              int textLength) throws DisqualifiedException {
 
HtmlStandardExtractor.java:
@@ -1,7 +1,7 @@
 package nu.marginalia.converting.processor.logic;
 
 import com.google.common.base.Strings;
-import nu.marginalia.model.html.HtmlStandard;
+import nu.marginalia.model.DocumentFormat;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.DocumentType;
 import org.slf4j.Logger;
@@ -12,54 +12,54 @@ public class HtmlStandardExtractor {
 
     private static final Logger logger = LoggerFactory.getLogger(HtmlStandardExtractor.class);
 
-    public static HtmlStandard parseDocType(DocumentType docType) {
+    public static DocumentFormat parseDocType(DocumentType docType) {
         if (null == docType) {
-            return HtmlStandard.UNKNOWN;
+            return DocumentFormat.UNKNOWN;
         }
 
         String publicId = docType.publicId();
         if (Strings.isNullOrEmpty(publicId))
-            return HtmlStandard.HTML5;
+            return DocumentFormat.HTML5;
 
         publicId = publicId.toUpperCase();
         if (publicId.startsWith("-//SOFTQUAD SOFTWARE//DTD") && publicId.contains("HTML 4")) {
-            return HtmlStandard.HTML4;
+            return DocumentFormat.HTML4;
         }
         if (publicId.startsWith("-//SOFTQUAD SOFTWARE//DTD") && publicId.contains("HTML 3")) {
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         }
         if (publicId.startsWith("-//INTERNET/RFC XXXX//EN"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-//NETSCAPE COMM. CORP"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
        if (publicId.startsWith("-//SQ//DTD HTML 2"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-//SOFTQUAD//DTD HTML 2"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-//W3O//DTD W3 HTML 2"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-//IETF//DTD HTML 2"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-//IETF//DTD HTML//EN"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-/W3C//DTD HTML 3"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-/W3C/DTD HTML 3"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-//IETF//DTD HTML 3"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-//W3C//DTD XHTML"))
-            return HtmlStandard.XHTML;
+            return DocumentFormat.XHTML;
         if (publicId.startsWith("ISO/IEC 15445:2000//DTD"))
-            return HtmlStandard.XHTML;
+            return DocumentFormat.XHTML;
         if (publicId.startsWith("-//W3C//DTD HTML"))
-            return HtmlStandard.HTML4;
+            return DocumentFormat.HTML4;
 
         logger.debug("Unknown publicID standard {}", publicId);
-        return HtmlStandard.UNKNOWN;
+        return DocumentFormat.UNKNOWN;
     }
 
-    public static HtmlStandard sniffHtmlStandard(Document parsed) {
+    public static DocumentFormat sniffHtmlStandard(Document parsed) {
         int html4Attributes = 0;
         int html5Attributes = 0;
 
@@ -73,11 +73,11 @@ public class HtmlStandardExtractor {
             html4Attributes++;
         }
         if (html5Attributes > 0) {
-            return HtmlStandard.HTML5;
+            return DocumentFormat.HTML5;
         }
         if (html4Attributes > 0) {
-            return HtmlStandard.HTML4;
+            return DocumentFormat.HTML4;
         }
-        return HtmlStandard.HTML123;
+        return DocumentFormat.HTML123;
     }
 }
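The extractor trusts the doctype's public identifier first and falls back to attribute sniffing only when the doctype is missing or unrecognized. A small Jsoup-based illustration of the doctype path:

    import nu.marginalia.converting.processor.logic.HtmlStandardExtractor;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.DocumentType;

    class DocTypeExample {
        public static void main(String[] args) {
            var html5 = Jsoup.parse("<!DOCTYPE html><html></html>");
            var html4 = Jsoup.parse(
                    "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\"><html></html>");

            // An absent publicId classifies as HTML5; the W3C HTML 4 publicId as HTML4.
            for (var node : html5.childNodes()) {
                if (node instanceof DocumentType dt)
                    System.out.println(HtmlStandardExtractor.parseDocType(dt)); // HTML5
            }
            for (var node : html4.childNodes()) {
                if (node instanceof DocumentType dt)
                    System.out.println(HtmlStandardExtractor.parseDocType(dt)); // HTML4
            }
        }
    }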
AbstractDocumentProcessorPlugin.java:
@@ -7,11 +7,11 @@ import nu.marginalia.keyword.LinkTexts;
 import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
 import nu.marginalia.language.filter.LanguageFilter;
 import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.html.HtmlStandard;
 
 import javax.annotation.Nullable;
 import java.io.IOException;
@@ -73,7 +73,7 @@ public abstract class AbstractDocumentProcessorPlugin {
             return this;
         }
 
-        public MetaTagsBuilder addFormat(HtmlStandard standard) {
+        public MetaTagsBuilder addFormat(DocumentFormat standard) {
 
             add("format", standard);
 
HtmlDocumentProcessorPlugin.java:
@@ -25,12 +25,12 @@ import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
 import nu.marginalia.link_parser.FeedExtractor;
 import nu.marginalia.link_parser.LinkParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.html.HtmlStandard;
 import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import org.jsoup.nodes.Document;
@@ -137,8 +137,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {
 
         final int length = getLength(doc);
-        final HtmlStandard standard = getHtmlStandard(doc);
-        final double quality = documentValuator.getQuality(crawledDocument, standard, doc, length);
+        final DocumentFormat format = getDocumentFormat(doc);
+        final double quality = documentValuator.getQuality(crawledDocument, format, doc, length);
 
         if (isDisqualified(documentClass, url, quality, doc.title())) {
             throw new DisqualifiedException(DisqualificationReason.QUALITY);
@@ -152,7 +152,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {
         var ret = new ProcessedDocumentDetails();
 
         ret.length = length;
-        ret.standard = standard;
+        ret.format = format;
         ret.title = specialization.getTitle(doc, dld, crawledDocument.url);
 
         final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, documentHeaders, dld);
@@ -161,7 +161,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {
         ret.quality = documentValuator.adjustQuality(quality, features);
         ret.hashCode = dld.localitySensitiveHashCode();
 
-        PubDate pubDate = pubDateSniffer.getPubDate(documentHeaders, url, doc, standard, true);
+        PubDate pubDate = pubDateSniffer.getPubDate(documentHeaders, url, doc, format, true);
 
         EnumSet<DocumentFlags> documentFlags = documentFlags(features, generatorParts.type());
 
@@ -180,7 +180,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {
                 .addPubDate(pubDate)
                 .addUrl(url)
                 .addFeatures(features)
-                .addFormat(standard)
+                .addFormat(format)
                 .addGenerator(generatorParts.keywords())
                 .build();
 
@@ -316,12 +316,12 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {
         return linkTerms;
     }
 
-    private HtmlStandard getHtmlStandard(Document doc) {
-        HtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(doc.documentType());
-        if (HtmlStandard.UNKNOWN.equals(htmlStandard)) {
+    private DocumentFormat getDocumentFormat(Document doc) {
+        DocumentFormat format = HtmlStandardExtractor.parseDocType(doc.documentType());
+        if (DocumentFormat.UNKNOWN.equals(format)) {
             return HtmlStandardExtractor.sniffHtmlStandard(doc);
         }
-        return htmlStandard;
+        return format;
     }
 
     private int getLength(Document doc) {
PdfDocumentProcessorPlugin.java (new file; all lines added):
@@ -0,0 +1,286 @@
package nu.marginalia.converting.processor.plugin;

import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.converting.model.DisqualifiedException;
import nu.marginalia.converting.model.ProcessedDocumentDetails;
import nu.marginalia.converting.processor.DocumentClass;
import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
import nu.marginalia.converting.processor.plugin.specialization.DefaultSpecialization;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.filter.LanguageFilter;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import org.apache.commons.lang3.StringUtils;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.text.HeadingAwarePDFTextStripper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.URISyntaxException;
import java.time.LocalDate;
import java.util.*;


public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {

    private final int maxTitleLength;
    private final DocumentKeywordExtractor keywordExtractor;
    private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
    private final DocumentLengthLogic documentLengthLogic;
    private final DefaultSpecialization defaultSpecialization;

    private static final Logger logger = LoggerFactory.getLogger(PdfDocumentProcessorPlugin.class);

    @Inject
    public PdfDocumentProcessorPlugin(@Named("max-title-length") Integer maxTitleLength,
                                      LanguageFilter languageFilter,
                                      ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
                                      DocumentKeywordExtractor keywordExtractor,
                                      DocumentLengthLogic documentLengthLogic,
                                      DefaultSpecialization defaultSpecialization)
    {
        super(languageFilter);
        this.sentenceExtractorProvider = sentenceExtractorProvider;
        this.documentLengthLogic = documentLengthLogic;
        this.maxTitleLength = maxTitleLength;
        this.keywordExtractor = keywordExtractor;
        this.defaultSpecialization = defaultSpecialization;
    }

    @Override
    public boolean isApplicable(CrawledDocument doc) {
        String contentType = doc.contentType.toLowerCase();

        if (contentType.equals("application/pdf"))
            return true;
        if (contentType.startsWith("application/pdf;")) // charset=blabla
            return true;

        return false;
    }

    @Override
    public DetailsWithWords createDetails(CrawledDocument crawledDocument,
                                          LinkTexts linkTexts,
                                          DocumentClass documentClass)
            throws DisqualifiedException, URISyntaxException, IOException {

        String documentBody = crawledDocument.documentBody();

        if (languageFilter.isBlockedUnicodeRange(documentBody)) {
            throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE);
        }

        final EdgeUrl url = new EdgeUrl(crawledDocument.url);

        Document doc;
        try {
            doc = convertPdfToHtml(crawledDocument.documentBodyBytes);
        } catch (IOException e) {
            logger.error("Failed to convert PDF file {} - {}", url, e.getMessage());
            throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.ERROR);
        }

        DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(doc);

        checkDocumentLanguage(dld);

        documentLengthLogic.validateLength(dld, 1.0);

        var ret = new ProcessedDocumentDetails();

        ret.length = documentBody.length();

        ret.format = DocumentFormat.PDF;
        ret.title = StringUtils.truncate(defaultSpecialization.getTitle(doc, dld, url.toString()), maxTitleLength);

        ret.quality = -5;

        ret.features = Set.of(HtmlFeature.PDF);
        ret.description = getDescription(doc);
        ret.hashCode = dld.localitySensitiveHashCode();

        final PubDate pubDate = new PubDate(LocalDate.ofYearDay(1993, 1));

        EnumSet<DocumentFlags> documentFlags = EnumSet.of(DocumentFlags.PdfFile);

        ret.metadata = new DocumentMetadata(
                documentLengthLogic.getEncodedAverageLength(dld),
                pubDate.yearByte(),
                (int) -ret.quality,
                documentFlags);

        DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, linkTexts, url);

        var tagWords = new MetaTagsBuilder()
                .addPubDate(pubDate)
                .addUrl(url)
                .addFeatures(ret.features)
                .addFormat(ret.format)
                .build();

        words.addAllSyntheticTerms(tagWords);

        if (pubDate.hasYear()) {
            ret.pubYear = pubDate.year();
        }

        /* These are assumed to be populated */
        ret.linksInternal = new ArrayList<>();
        ret.linksExternal = new ArrayList<>();

        return new DetailsWithWords(ret, words);
    }

    private String getDescription(Document doc) {
        int cnt = 0;
        boolean useNext = false;
        for (var ptag : doc.getElementsByTag("p")) {
            String text = ptag.text();

            // Many academic documents have an abstract at the start of the document,
            // which makes a nice summary.  Though they tend to bleed into the text,
            // so we check for the word "Abstract" at the start of the paragraph.

            if (text.startsWith("Abstract ")) {
                return StringUtils.abbreviate(text.substring("Abstract ".length()), "...", 255);
            }
            else if (text.equals("Abstract")) {
                useNext = true;
            }
            else if (useNext) {
                return StringUtils.abbreviate(text, "...", 255);
            }

            if (++cnt > 15) { // Don't scan the entire document
                break;
            }
        }

        // Fall back to the default specialization
        return defaultSpecialization.getSummary(doc, Set.of());
    }

    /** Convert the provided PDF bytes into an HTML rendering that can be fed
     * to the HTML processor.
     */
    Document convertPdfToHtml(byte[] pdfBytes) throws IOException {
        try (var doc = Loader.loadPDF(pdfBytes)) {
            String docMetaTitle = Objects.requireNonNullElse(doc.getDocumentInformation().getTitle(), "");

            var stripper = new HeadingAwarePDFTextStripper();
            stripper.setStartPage(1);
            stripper.setSortByPosition(true);
            stripper.setWordSeparator(" ");

            // Increase the tolerance for line spacing to deal better with paragraphs.
            stripper.setDropThreshold(5f);

            stripper.setPageStart("<div>");
            stripper.setParagraphStart("<p>");
            stripper.setParagraphEnd("</p>\n");
            stripper.setPageEnd("</div>\n");
            stripper.setHeadingStart("<h1>");
            stripper.setHeadingEnd("</h1>\n");
            stripper.setLineSeparator("\n");

            String text = stripper.getText(doc);

            StringBuilder htmlBuilder = new StringBuilder(text.length() + 1024);
            htmlBuilder.append("<html><body>")
                    .append(text)
                    .append("</body></html>");

            var parsed = Jsoup.parse(htmlBuilder.toString());

            repairDOM(parsed);

            for (var heading : parsed.getElementsByTag("h1")) {
                String headingText = heading.text();
                if (headingText.length() > 2) {
                    parsed.title(headingText);
                    break;
                }
            }

            if (parsed.title().isEmpty()) {
                // Prefer setting the title to the first paragraph in the
                // document, as this is almost always correct.  Otherwise,
                // we fall back on the metadata title, which is almost always
                // useless

                var firstP = parsed.getElementsByTag("p").first();
                if (firstP != null) parsed.title(firstP.text());
                else parsed.title(docMetaTitle);
            }
            return parsed;
        }
    }

    /** Repair the DOM to remove some common issues with PDF conversion,
     * including empty paragraphs, and multiline headers that are split into multiple
     * consecutive h1 tags.
     */
    private void repairDOM(Document parsed) {

        // <p><h1>...</h1></p> -> <h1>...</h1>
        parsed.getElementsByTag("h1").forEach(h1 -> {
            var parent = h1.parent();
            if (parent == null || !"p".equals(parent.tagName())) {
                return;
            }

            if (parent.childrenSize() == 1) {
                parent.replaceWith(h1);
            }
        });

        // Remove empty <p> tags
        parsed.getElementsByTag("p").forEach(p -> {
            if (p.childrenSize() == 0 && !p.hasText()) {
                p.remove();
            }
        });

        // <h1>...</h1><h1>...</h1> -> <h1>...</h1>
        parsed.getElementsByTag("h1").forEach(h1 -> {
            var nextSibling = h1.nextElementSibling();
            if (nextSibling == null || !"h1".equals(nextSibling.tagName())) {
                return; // Short-circuit to avoid unnecessary work
            }

            StringJoiner joiner = new StringJoiner(" ");
            joiner.add(h1.text());

            for (var sibling : h1.nextElementSiblings()) {
                if (!"h1".equals(sibling.tagName()))
                    break;
                joiner.add(sibling.text());
                sibling.remove();
            }

            h1.text(joiner.toString());
        });
    }
}
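convertPdfToHtml is package-visible and touches none of the injected collaborators, so it can be exercised on its own. A plausible test-style sketch (the file path is hypothetical, and constructing the plugin without Guice is an assumption of this sketch):

    import java.nio.file.Files;
    import java.nio.file.Path;

    class PdfConversionSketch {
        public static void main(String[] args) throws Exception {
            byte[] pdfBytes = Files.readAllBytes(Path.of("sample.pdf")); // hypothetical file

            // Dependencies are normally provided by Guice; convertPdfToHtml
            // does not use them, so nulls suffice for this sketch.
            var plugin = new PdfDocumentProcessorPlugin(128, null, null, null, null, null);
            var doc = plugin.convertPdfToHtml(pdfBytes);

            System.out.println(doc.title());
            System.out.println(doc.getElementsByTag("p").size() + " paragraphs");
        }
    }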
PlainTextDocumentProcessorPlugin.java:
@@ -13,10 +13,10 @@ import nu.marginalia.keyword.LinkTexts;
 import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
 import nu.marginalia.language.filter.LanguageFilter;
 import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.html.HtmlStandard;
 import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import org.apache.commons.lang3.StringUtils;
@@ -91,7 +91,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {
 
         ret.length = documentBody.length();
 
-        ret.standard = HtmlStandard.PLAIN;
+        ret.format = DocumentFormat.PLAIN;
         ret.title = StringUtils.truncate(plainTextLogic.getTitle(url, firstFewLines), maxTitleLength);
 
         ret.quality = -1;
@@ -113,7 +113,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {
                 .addPubDate(pubDate)
                 .addUrl(url)
                 .addFeatures(ret.features)
-                .addFormat(ret.standard)
+                .addFormat(ret.format)
                 .build();
 
         words.addAllSyntheticTerms(tagWords);
PubDateFromHtmlStandard.java:
@@ -1,12 +1,13 @@
 package nu.marginalia.converting.processor.pubdate;
 
-import nu.marginalia.model.html.HtmlStandard;
+import nu.marginalia.model.DocumentFormat;
 
 public class PubDateFromHtmlStandard {
     /** Used to bias pub date heuristics */
-    public static int blindGuess(HtmlStandard standard) {
-        return switch (standard) {
+    public static int blindGuess(DocumentFormat format) {
+        return switch (format) {
             case PLAIN -> 1993;
+            case PDF -> 2010;
             case HTML123 -> 1997;
             case HTML4, XHTML -> 2006;
             case HTML5 -> 2018;
@@ -21,8 +22,8 @@ public class PubDateFromHtmlStandard {
      * Discovering publication year involves a lot of guesswork, this helps
      * keep the guesses relatively sane.
      */
-    public static boolean isGuessPlausible(HtmlStandard standard, int year) {
-        switch (standard) {
+    public static boolean isGuessPlausible(DocumentFormat format, int year) {
+        switch (format) {
             case HTML123:
                 return year <= 2000;
             case XHTML:
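The new PDF arm slots the blind guess between the HTML4/XHTML era and HTML5: a PDF with no stronger date signal is assumed to be from around 2010. Concretely:

    // blindGuess(DocumentFormat.PLAIN) == 1993
    // blindGuess(DocumentFormat.PDF)   == 2010
    // blindGuess(DocumentFormat.HTML5) == 2018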
PubDateHeuristic.java:
@@ -1,14 +1,14 @@
 package nu.marginalia.converting.processor.pubdate;
 
 import nu.marginalia.converting.model.DocumentHeaders;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
 
 public interface PubDateHeuristic {
 
-    Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard);
+    Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard);
 }
PubDateParser.java:
@@ -1,7 +1,7 @@
 package nu.marginalia.converting.processor.pubdate;
 
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 
 import java.time.DateTimeException;
 import java.time.LocalDate;
@@ -26,7 +26,7 @@ public class PubDateParser {
                 .filter(PubDateParser::validateDate);
     }
 
-    public static Optional<PubDate> attemptParseDate(String date, HtmlStandard standard) {
+    public static Optional<PubDate> attemptParseDate(String date, DocumentFormat standard) {
         return Optional.ofNullable(date)
                 .filter(str -> str.length() >= 4 && str.length() < 32)
                 .flatMap(str ->
@@ -81,7 +81,7 @@ public class PubDateParser {
     }
 
 
-    public static Optional<PubDate> dateFromHighestYearLookingSubstringWithGuess(String maybe, HtmlStandard standard) {
+    public static Optional<PubDate> dateFromHighestYearLookingSubstringWithGuess(String maybe, DocumentFormat standard) {
         int guess = PubDateFromHtmlStandard.blindGuess(standard);
 
         var matcher = yearPattern.matcher(maybe);
@@ -135,7 +135,7 @@ public class PubDateParser {
         return (max + min) / 2;
     }
 
-    public static int guessYear(HtmlStandard standard) {
+    public static int guessYear(DocumentFormat standard) {
         // Create some jitter to avoid having documents piling up in the same four years
         // as this would make searching in those years disproportionately useless
 
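The comment on guessYear explains the jitter: without it, every undated document of a given format would collapse onto the same blind-guess year, making that year useless as a search facet. A hedged sketch of the pattern — the width of the jitter window is an assumption, not something this diff shows:

    // Sketch only; the real bounds are not part of this diff.
    static int guessYearSketch(DocumentFormat format) {
        int base = PubDateFromHtmlStandard.blindGuess(format);
        // Spread guesses over a small window around the blind guess.
        return base + java.util.concurrent.ThreadLocalRandom.current().nextInt(-2, 3);
    }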
PubDateSniffer.java:
@@ -2,9 +2,9 @@ package nu.marginalia.converting.processor.pubdate;
 
 import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.heuristic.*;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.ArrayList;
@@ -38,7 +38,7 @@ public class PubDateSniffer {
         heuristics.add(new PubDateHeuristicGuessFromHtmlStandard());
     }
 
-    public PubDate getPubDate(DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard, boolean runExpensive) {
+    public PubDate getPubDate(DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard, boolean runExpensive) {
         final PubDateEffortLevel effortLevel = runExpensive ? PubDateEffortLevel.HIGH : PubDateEffortLevel.LOW;
 
         for (var heuristic : heuristics) {
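The heuristics run in registration order, and the loop presumably returns the first date any of them yields, with the blind format-based guess registered last as the catch-all. A sketch of the remainder of the method, which the hunk cuts off; the early-return shape and the no-argument PubDate fallback are assumptions consistent with the Optional-returning interface:

    for (var heuristic : heuristics) {
        var maybeDate = heuristic.apply(effortLevel, headers, url, document, htmlStandard);
        if (maybeDate.isPresent())
            return maybeDate.get();
    }
    return new PubDate(); // assumed empty-date fallback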
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
|
|||||||
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
|
||||||
import org.jetbrains.annotations.NotNull;
|
import org.jetbrains.annotations.NotNull;
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
import org.jsoup.nodes.Element;
|
import org.jsoup.nodes.Element;
|
||||||
@@ -19,7 +19,7 @@ import java.util.Optional;
|
|||||||
public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
|
public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
|
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
|
||||||
if (effortLevel == PubDateEffortLevel.LOW)
|
if (effortLevel == PubDateEffortLevel.LOW)
|
||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
|
|
||||||
@@ -33,9 +33,9 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
|
|||||||
|
|
||||||
private static class DateExtractingNodeVisitorPass implements NodeFilter {
|
private static class DateExtractingNodeVisitorPass implements NodeFilter {
|
||||||
public PubDate pubDate;
|
public PubDate pubDate;
|
||||||
private final HtmlStandard htmlStandard;
|
private final DocumentFormat htmlStandard;
|
||||||
|
|
||||||
private DateExtractingNodeVisitorPass(HtmlStandard htmlStandard) {
|
private DateExtractingNodeVisitorPass(DocumentFormat htmlStandard) {
|
||||||
this.htmlStandard = htmlStandard;
|
this.htmlStandard = htmlStandard;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -135,7 +135,7 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private void parse(String text) {
|
private void parse(String text) {
|
||||||
if (htmlStandard == HtmlStandard.UNKNOWN) {
|
if (htmlStandard == DocumentFormat.UNKNOWN) {
|
||||||
PubDateParser
|
PubDateParser
|
||||||
.dateFromHighestYearLookingSubstring(text)
|
.dateFromHighestYearLookingSubstring(text)
|
||||||
.ifPresent(this::setPubDate);
|
.ifPresent(this::setPubDate);
|
||||||
@@ -5,9 +5,9 @@ import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateFromHtmlStandard;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jetbrains.annotations.NotNull;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
@@ -19,7 +19,7 @@ import java.util.Optional;
public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {

    @Override
-   public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+   public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        if (effortLevel == PubDateEffortLevel.LOW)
            return Optional.empty();

@@ -33,9 +33,9 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {

    private static class DateExtractingNodeVisitor implements NodeFilter {
        public PubDate pubDate;
-       private final HtmlStandard htmlStandard;
+       private final DocumentFormat htmlStandard;

-       private DateExtractingNodeVisitor(HtmlStandard htmlStandard) {
+       private DateExtractingNodeVisitor(DocumentFormat htmlStandard) {
            this.htmlStandard = htmlStandard;
        }

@@ -73,7 +73,7 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
        }

        private void parse(String text) {
-           if (htmlStandard == HtmlStandard.UNKNOWN) {
+           if (htmlStandard == DocumentFormat.UNKNOWN) {
                PubDateParser
                        .dateFromHighestYearLookingSubstring(text)
                        .ifPresent(this::setPubDate);
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -14,8 +14,8 @@ import java.util.Optional;
public class PubDateHeuristicGuessFromHtmlStandard implements PubDateHeuristic {

    @Override
-   public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
-       if (htmlStandard == HtmlStandard.UNKNOWN)
+   public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
+       if (htmlStandard == DocumentFormat.UNKNOWN)
            return Optional.empty();

        return Optional.of(new PubDate(null, PubDateParser.guessYear(htmlStandard)));
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
public class PubDateHeuristicHtml5AnyTimeTag implements PubDateHeuristic {

    @Override
-   public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+   public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        // HTML5, alternative approach
        for (var tag : document.select("time")) {
            var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));
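This heuristic visits every <time> element and tries its datetime attribute, so it succeeds on markup like the test case further down even without a pubdate or itemprop annotation. A minimal sketch:

    // Any <time> tag carrying a machine-readable datetime attribute is enough:
    Document doc = Jsoup.parse("<article><time datetime=\"2022-08-24\">time</time></article>");
    // document.select("time") finds the tag; attemptParseDate("2022-08-24") yields the date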
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
public class PubDateHeuristicHtml5ArticleDateTag implements PubDateHeuristic {

    @Override
-   public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+   public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        // HTML5
        for (var tag : document.select("time[pubdate=\"pubdate\"]")) {
            var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
public class PubDateHeuristicHtml5ItempropDateTag implements PubDateHeuristic {

    @Override
-   public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+   public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        for (var tag : document.select("time[itemprop=\"datePublished\"]")) {
            var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
            if (maybeDate.isPresent()) {
@@ -8,9 +8,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Collections;
@@ -21,7 +21,7 @@ import java.util.Optional;
public class PubDateHeuristicJSONLD implements PubDateHeuristic {

    @Override
-   public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+   public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        for (var tag : document.select("script[type=\"application/ld+json\"]")) {
            var maybeDate = parseLdJson(tag.data())
                    .flatMap(PubDateParser::attemptParseDate);
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.List;
@@ -15,7 +15,7 @@ import java.util.Optional;
public class PubDateHeuristicLastModified implements PubDateHeuristic {

    @Override
-   public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+   public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        List<String> lastModified = headers.get("last-modified");
        if (lastModified.isEmpty())
            return Optional.empty();
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
public class PubDateHeuristicMicrodata implements PubDateHeuristic {

    @Override
-   public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+   public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {

        for (var tag : document.select("meta[itemprop=\"datePublished\"]")) {
            var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
public class PubDateHeuristicOpenGraph implements PubDateHeuristic {

    @Override
-   public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+   public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        // OG
        for (var tag : document.select("meta[property=\"article:published_time\"]")) {
            var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
public class PubDateHeuristicRDFaTag implements PubDateHeuristic {

    @Override
-   public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+   public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        for (var tag : document.select("meta[property=\"datePublished\"]")) {
            var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
            if (maybeDate.isPresent()) {
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -21,7 +21,7 @@ public class PubDateHeuristicUrlPatternPass1 implements PubDateHeuristic {
    private static final int MIN_URL_PATTERN_YEAR = 2000;

    @Override
-   public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+   public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        final String urlString = url.path;

        var matcher = yearUrlPattern.matcher(urlString);
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -19,7 +19,7 @@ public class PubDateHeuristicUrlPatternPass2 implements PubDateHeuristic {

    @Override
    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url,
-                                  Document document, HtmlStandard htmlStandard) {
+                                  Document document, DocumentFormat htmlStandard) {
        final String urlString = url.path;

        var matcher = yearUrlPattern.matcher(urlString);
@@ -8,12 +8,12 @@ import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.processor.DocumentClass;
import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
import nu.marginalia.keyword.LinkTexts;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.crawl.UrlIndexingState;
import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags;
@@ -83,7 +83,7 @@ public class SideloaderProcessing {
        // that we can't get from the sideloaded data since it's
        // so stripped down

-       ret.details.standard = HtmlStandard.HTML5;
+       ret.details.format = DocumentFormat.HTML5;
        ret.details.pubYear = pubYear;
        ret.details.features.add(HtmlFeature.JS);
        ret.details.features.add(HtmlFeature.TRACKING);
@@ -9,13 +9,13 @@ import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.crawl.UrlIndexingState;
-import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags;
@@ -165,7 +165,7 @@ public class StackexchangeSideloader implements SideloadSource {
        ret.details.description = StringUtils.truncate(doc.body().text(), 255);
        ret.details.length = 128;

-       ret.details.standard = HtmlStandard.HTML5;
+       ret.details.format = DocumentFormat.HTML5;
        ret.details.linksExternal = List.of();
        ret.details.linksInternal = List.of();
        ret.state = UrlIndexingState.OK;
@@ -124,7 +124,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
        document.details.title,
        document.details.description,
        HtmlFeature.encode(document.details.features),
-       document.details.standard.name(),
+       document.details.format.name(),
        document.details.length,
        document.details.hashCode,
        (float) document.details.quality,
File diff suppressed because it is too large
@@ -6,6 +6,7 @@ import com.google.inject.Injector;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.io.SerializableCrawlDataStream;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.crawl.PubDate;
@@ -13,7 +14,6 @@ import nu.marginalia.model.crawl.UrlIndexingState;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.crawldata.CrawledDomain;
import nu.marginalia.model.crawldata.SerializableCrawlData;
-import nu.marginalia.model.html.HtmlStandard;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
@@ -91,7 +91,7 @@ public class ConvertingIntegrationTest {

            assertTrue(details.title.length() > 4);
            assertTrue(details.description.length() > 4);
-           assertEquals(HtmlStandard.HTML5, details.standard);
+           assertEquals(DocumentFormat.HTML5, details.format);

        }
    }
@@ -125,7 +125,7 @@ public class ConvertingIntegrationTest {
            assertTrue(details.metadata.size() > 0);
            assertTrue(details.title.length() > 4);
            assertTrue(details.description.length() > 4);
-           assertEquals(HtmlStandard.HTML5, details.standard);
+           assertEquals(DocumentFormat.HTML5, details.format);
        }
    }

@@ -0,0 +1,95 @@
package nu.marginalia.converting.processor.plugin;

import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.DocumentClass;
import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
import nu.marginalia.converting.processor.logic.TitleExtractor;
import nu.marginalia.converting.processor.plugin.specialization.DefaultSpecialization;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.converting.processor.summary.heuristic.*;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.language.filter.LanguageFilter;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Instant;

@Tag("flaky")
class PdfDocumentProcessorPluginTest {
    static PdfDocumentProcessorPlugin plugin;

    @BeforeAll
    static void setUpBeforeClass() throws Exception {
        var lm = WmsaHome.getLanguageModels();
        plugin = new PdfDocumentProcessorPlugin(255,
                new LanguageFilter(lm),
                new ThreadLocalSentenceExtractorProvider(lm),
                new DocumentKeywordExtractor(new TermFrequencyDict(lm)),
                new DocumentLengthLogic(100),
                new DefaultSpecialization(new SummaryExtractor(
                        255,
                        new DomFilterHeuristic(255),
                        new TagDensityHeuristic(255),
                        new OpenGraphDescriptionHeuristic(),
                        new MetaDescriptionHeuristic(),
                        new FallbackHeuristic()
                ),
                new TitleExtractor(255)
                ));
    }
    public AbstractDocumentProcessorPlugin.DetailsWithWords testPdfFile(byte[] pdfBytes) throws Exception {
        var doc = new CrawledDocument("test", "https://www.example.com/sample.pdf", "application/pdf", Instant.now().toString(), 200, "OK", "OK", "", pdfBytes, false, null, null);
        return plugin.createDetails(doc, new LinkTexts(), DocumentClass.NORMAL);
    }

    public AbstractDocumentProcessorPlugin.DetailsWithWords testPdfFile(Path file) throws Exception {
        return testPdfFile(Files.readAllBytes(file));
    }

    private byte[] downloadPDF(String url) throws IOException, URISyntaxException {
        HttpURLConnection conn = (HttpURLConnection) new URI(url).toURL().openConnection();
        try {
            return conn.getInputStream().readAllBytes();
        } catch (IOException e) {
            throw new RuntimeException(e);
        } finally {
            conn.disconnect();
        }
    }


    @Disabled
    @Test
    void testingTool() throws Exception {
        System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample.pdf")).details().title);
        System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample2.pdf")).details().title);
        System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample3.pdf")).details().title);
        System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample4.pdf")).details().title);
        System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample5.pdf")).details().title);
        System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample6.pdf")).details().title);
    }

    @Disabled
    @Test
    void testingTool2() throws Exception {
        System.out.println(plugin.convertPdfToHtml(Files.readAllBytes(Path.of("/home/st_work/Work/sample6.pdf"))));
    }

    @Test
    void testMarginaliaSample() throws Exception {
        var doc = plugin.convertPdfToHtml(downloadPDF("https://www.marginalia.nu/junk/test.pdf"));
        System.out.println(doc.html());
    }
}
@@ -3,8 +3,8 @@ package nu.marginalia.converting.processor.pubdate;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.heuristic.PubDateHeuristicDOMParsingPass2;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Test;

@@ -74,7 +74,7 @@ class PubDateSnifferTest {
                <time pubdate="pubdate" datetime="2022-08-24">time</time>
                Wow, sure lor 'em boss
                </article>
-               """), HtmlStandard.UNKNOWN, true);
+               """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertEquals("2022-08-24", ret.dateIso8601());
@@ -90,7 +90,7 @@ class PubDateSnifferTest {
                <time>2022-08-24</time>
                Wow, sure lor 'em boss
                </article>
-               """), HtmlStandard.UNKNOWN, true);
+               """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertEquals("2022-08-24", ret.dateIso8601());
@@ -106,7 +106,7 @@ class PubDateSnifferTest {
                <time class="published" datetime="July 13, 2006">July 13, 2006</time>
                Wow, sure lor 'em boss
                </article>
-               """), HtmlStandard.UNKNOWN, true);
+               """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertEquals(2006, ret.year());
@@ -116,14 +116,14 @@ class PubDateSnifferTest {
    public void testProblemCases() throws IOException, URISyntaxException {
        var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
                new EdgeUrl("https://www.example.com/"),
-               Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/The Switch to Linux Begins .html"))), HtmlStandard.HTML5, true);
+               Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/The Switch to Linux Begins .html"))), DocumentFormat.HTML5, true);

        assertFalse(ret.isEmpty());
        assertEquals(2006, ret.year());

        ret = dateSniffer.getPubDate(new DocumentHeaders(""),
                new EdgeUrl("https://www.example.com/"),
-               Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), HtmlStandard.XHTML, true);
+               Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), DocumentFormat.XHTML, true);

        assertFalse(ret.isEmpty());
        assertEquals(2010, ret.year());
@@ -146,7 +146,7 @@ class PubDateSnifferTest {
                <!doctype html>
                <html>
                <meta itemprop="datePublished" content="2022-08-24" />
-               """), HtmlStandard.UNKNOWN, true);
+               """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertEquals("2022-08-24", ret.dateIso8601());
@@ -160,7 +160,7 @@ class PubDateSnifferTest {
                <!doctype html>
                <html>
                <meta property="datePublished" content="2022-08-24" />
-               """), HtmlStandard.UNKNOWN, true);
+               """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertEquals("2022-08-24", ret.dateIso8601());
@@ -174,7 +174,7 @@ class PubDateSnifferTest {
                <!doctype html>
                <html>
                <script type="application/ld+json">{"@context":"https:\\/\\/schema.org","@type":"Article","name":"In the Year 2525","url":"https:\\/\\/en.wikipedia.org\\/wiki\\/In_the_Year_2525","sameAs":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","mainEntity":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2004-08-24T14:39:14Z","dateModified":"2022-10-20T11:54:37Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/4\\/4a\\/In_the_Year_2525_by_Zager_and_Evans_US_vinyl_Side-A_RCA_release.png","headline":"song written and compsoed by Rick Evans, originally recorded by Zager and Evans and released in 1969"}</script><script type="application/ld+json">{"@context":"https:\\/\\/schema.org","@type":"Article","name":"In the Year 2525","url":"https:\\/\\/en.wikipedia.org\\/wiki\\/In_the_Year_2525","sameAs":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","mainEntity":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2004-08-24T14:39:14Z","dateModified":"2022-10-20T11:54:37Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/4\\/4a\\/In_the_Year_2525_by_Zager_and_Evans_US_vinyl_Side-A_RCA_release.png","headline":"song written and compsoed by Rick Evans, originally recorded by Zager and Evans and released in 1969"}</script>
-               """), HtmlStandard.UNKNOWN, true);
+               """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertEquals("2004-08-24", ret.dateIso8601());
@@ -188,7 +188,7 @@ class PubDateSnifferTest {
                <!doctype html>
                <html>
                <script type="application/ld+json" class="aioseop-schema">{"@context":"https://schema.org","@graph":[{"@type":"Organization","@id":"https://socialnomics.net/#organization","url":"https://socialnomics.net/","name":"Socialnomics","sameAs":[]},{"@type":"WebSite","@id":"https://socialnomics.net/#website","url":"https://socialnomics.net/","name":"Socialnomics","publisher":{"@id":"https://socialnomics.net/#organization"}},{"@type":"WebPage","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage","url":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","inLanguage":"en-US","name":"3 Reasons Why You Should Adopt Java-based Technology For Your Business","isPartOf":{"@id":"https://socialnomics.net/#website"},"breadcrumb":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#breadcrumblist"},"datePublished":"2016-12-27T21:01:36-06:00","dateModified":"2016-12-22T21:02:32-06:00"},{"@type":"Article","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#article","isPartOf":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage"},"author":{"@id":"https://socialnomics.net/author/rahis-saifi/#author"},"headline":"3 Reasons Why You Should Adopt Java-based Technology For Your Business","datePublished":"2016-12-27T21:01:36-06:00","dateModified":"2016-12-22T21:02:32-06:00","commentCount":0,"mainEntityOfPage":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage"},"publisher":{"@id":"https://socialnomics.net/#organization"},"articleSection":"Business, business, java, Java Developers, programming languages"},{"@type":"Person","@id":"https://socialnomics.net/author/rahis-saifi/#author","name":"Rahis Saifi","sameAs":["https://www.facebook.com/RahisSaifiOfficial","https://www.twitter.com/57rahis"],"image":{"@type":"ImageObject","@id":"https://socialnomics.net/#personlogo","url":"https://secure.gravatar.com/avatar/e67f630f0b8bc87e59e111d5e955961d?s=96&d=mm&r=g","width":96,"height":96,"caption":"Rahis Saifi"}},{"@type":"BreadcrumbList","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#breadcrumblist","itemListElement":[{"@type":"ListItem","position":1,"item":{"@type":"WebPage","@id":"https://socialnomics.net/","url":"https://socialnomics.net/","name":"Socialnomics Blog"}},{"@type":"ListItem","position":2,"item":{"@type":"WebPage","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","url":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","name":"3 Reasons Why You Should Adopt Java-based Technology For Your Business"}}]}]}</script>
-               """), HtmlStandard.UNKNOWN, true);
+               """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertEquals("2016-12-27", ret.dateIso8601());
@@ -202,7 +202,7 @@ class PubDateSnifferTest {
                <!doctype html>
                <html>
                <title>No date in the HTML</title>
-               """), HtmlStandard.UNKNOWN, true);
+               """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertNull(ret.dateIso8601());
@@ -217,7 +217,7 @@ class PubDateSnifferTest {
                <!doctype html>
                <html>
                <title>No date in the HTML</title>
-               """), HtmlStandard.UNKNOWN, true);
+               """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertEquals("2022-02-03", ret.dateIso8601());
@@ -232,7 +232,7 @@ class PubDateSnifferTest {
                <!doctype html>
                <html>
                <p>Published 2003, updated 2022</p>
-               """), HtmlStandard.HTML5, true);
+               """), DocumentFormat.HTML5, true);

        assertFalse(ret.isEmpty());
        assertNull(ret.dateIso8601());
@@ -258,7 +258,7 @@ class PubDateSnifferTest {
                <!doctype html>
                <html>
                <div style="float: left;"> <b>Post subject:</b> Keyboards.</div><div style="float: right;"><span class="postdetails"><b><img src="./styles/subsilver2/imageset/icon_post_target.gif" width="12" height="9" alt="Post" title="Post" /> <a href="./viewtopic.php?p=34580&sid=cf0c13dedebb4fea1f03fa73e510cd9f#p34580">#1</a></b></span> <b>Posted:</b> Sun Oct 03, 2010 5:37 pm </div>
-               """), HtmlStandard.UNKNOWN, true);
+               """), DocumentFormat.UNKNOWN, true);

        assertFalse(ret.isEmpty());
        assertNull(ret.dateIso8601());
@@ -67,8 +67,6 @@ dependencies {
    testImplementation libs.mockito
    testImplementation libs.wiremock

-
-
    testImplementation project(':code:processes:test-data')
}

@@ -448,13 +448,7 @@ public class CrawlerMain extends ProcessMainClass {
                // We don't have a lock, so we can't run this task
                // we return to avoid blocking the pool for too long
                if (lock.isEmpty()) {
-                   if (retryQueue.remainingCapacity() > 0) {
-                       // Sleep a moment to avoid busy looping via the retry queue
-                       // in the case when few tasks remain and almost all are ineligible for
-                       // immediate restart
-                       Thread.sleep(5);
-                   }
+                   pendingCrawlTasks.remove(domain);

                    retryQueue.put(this);
                    return;
                }
|
@@ -74,7 +74,7 @@ public class CrawlerRevisitor {
 
             // If the reference document is empty or the HTTP status is not 200, we'll skip it since it's
             // unlikely to produce anything meaningful for us.
-            if (doc.httpStatus != 200)
+            if (doc.httpStatus != 200 && doc.httpStatus != 206)
                 continue;
             if (!doc.hasBody())
                 continue;
@@ -58,7 +58,7 @@ public record DocumentWithReference(
         if (null == doc)
             return ContentTags.empty();
 
-        if (doc.documentBodyBytes.length == 0 || doc.httpStatus != 200)
+        if (doc.documentBodyBytes.length == 0 || (doc.httpStatus != 200 && doc.httpStatus != 206))
            return ContentTags.empty();
 
         String lastmod = doc.getLastModified();
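Note on the recurring status check: several hunks in this changeset widen `httpStatus == 200` guards to also accept 206 (Partial Content), the status a server returns for range-limited fetches. A minimal sketch of the condition these sites now express inline (the helper name is illustrative, not from the commit):

    // Illustrative helper; the commit inlines this condition at each call site.
    static boolean isUsableStatus(int httpStatus) {
        // 200 OK, or 206 Partial Content (e.g. a size-capped range fetch)
        return httpStatus == 200 || httpStatus == 206;
    }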
@@ -1,19 +1,23 @@
 package nu.marginalia;
 
+import org.apache.commons.lang3.StringUtils;
+
 import java.util.Set;
 
 public class ContentTypes {
     public static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
             "application/xhtml",
             "text/html",
+            "text/markdown",
+            "text/x-markdown",
             "application/pdf",
             "image/x-icon",
             "text/plain");
 
     public static boolean isAccepted(String contentTypeHeader) {
-        String lcHeader = contentTypeHeader.toLowerCase();
+        String lcHeader = StringUtils.substringBefore(contentTypeHeader.toLowerCase(), ';');
         for (var type : acceptedContentTypes) {
-            if (lcHeader.startsWith(type)) {
+            if (lcHeader.equals(type)) {
                 return true;
             }
         }
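For readers skimming the hunk above: `isAccepted` now strips MIME parameters before comparing and requires an exact type match instead of a prefix match. A small sketch of the resulting behaviour, assuming the `ContentTypes` class from this diff is on the classpath (the demo class itself is illustrative):

    public class ContentTypesDemo {
        public static void main(String[] args) {
            System.out.println(ContentTypes.isAccepted("text/html; charset=utf-8")); // true: "; charset=utf-8" is stripped first
            System.out.println(ContentTypes.isAccepted("text/markdown"));            // true: newly accepted type
            System.out.println(ContentTypes.isAccepted("text/html2"));               // false: prefix matches no longer pass
        }
    }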
@@ -21,7 +25,7 @@ public class ContentTypes {
     }
 
     public static boolean isBinary(String contentTypeHeader) {
-        String lcHeader = contentTypeHeader.toLowerCase();
+        String lcHeader = StringUtils.substringBefore(contentTypeHeader.toLowerCase(), ';');
         return lcHeader.startsWith("application/pdf");
     }
 
@@ -158,11 +158,12 @@ public record SlopCrawlDataRecord(String domain,
                     // and is used to store old responses from previous crawls; in this part of the logic
                     // we treat them the same as a normal response
 
-                    if (!filterResponse(uaString, response)) {
+                    var filterStatus = filterResponse(uaString, response);
+                    if (filterStatus.isRejected()) {
                         continue;
                     }
 
-                    slopWriter.write(domain, response);
+                    slopWriter.write(domain, filterStatus, response);
                 } else if (record instanceof WarcXEntityRefused refused) {
                     slopWriter.write(domain, refused);
                 } else if (record instanceof Warcinfo warcinfo) {
@@ -187,25 +188,35 @@ public record SlopCrawlDataRecord(String domain,
         }
     }
 
+    sealed interface ResponseFilterResult {
+        default boolean isRejected() { return false; }
+        record Accept() implements ResponseFilterResult {}
+        record AcceptWithContentType(String contentType) implements ResponseFilterResult {}
+        record AcceptIfPlainText(String contentType) implements ResponseFilterResult {}
+        record Reject() implements ResponseFilterResult {
+            @Override
+            public boolean isRejected() { return true; }
+        }
+    }
 
     /** Return true if the WarcResponse should be excluded from conversion */
-    private static boolean filterResponse(String uaString, WarcResponse response) throws IOException {
+    private static ResponseFilterResult filterResponse(String uaString, WarcResponse response) throws IOException {
 
         // We don't want to store robots.txt files, as they are not
         // interesting for the analysis we want to do. This is important
         // since txt-files in general are interesting, and we don't want to
         // exclude them as a class.
 
-        if (response.targetURI().getPath().equals("/robots.txt")) {
-            return false;
+        String uriPath = response.targetURI().getPath();
+        if (uriPath.equals("/robots.txt")) {
+            return new ResponseFilterResult.Reject();
         }
 
         var headers = response.http().headers();
         var robotsTags = headers.all("X-Robots-Tag");
 
         if (!isXRobotsTagsPermitted(robotsTags, uaString)) {
-            return false;
+            return new ResponseFilterResult.Reject();
         }
 
         // Strip out responses with content types we aren't interested in
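The sealed `ResponseFilterResult` hierarchy above lets callers branch exhaustively on the filter verdict with record patterns. A minimal sketch of such a consumer; the method name is illustrative, and the actual consumption appears in the writer hunk further down:

    // Sketch: exhaustive handling of a filter verdict over the sealed interface.
    static String effectiveContentType(ResponseFilterResult status, String original) {
        return switch (status) {
            case ResponseFilterResult.AcceptWithContentType(String ct) -> ct; // override the header outright
            case ResponseFilterResult.AcceptIfPlainText(String ct) -> ct;     // caller must still check the body decodes as text
            case ResponseFilterResult.Accept() -> original;                   // keep the header as-is
            case ResponseFilterResult.Reject() -> throw new IllegalArgumentException("rejected records are skipped earlier");
        };
    }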
@@ -213,15 +224,29 @@ public record SlopCrawlDataRecord(String domain,
         String contentType = headers.first("Content-Type").orElse("text/plain").toLowerCase();
 
         if (!ContentTypes.isAccepted(contentType)) {
-            return false;
+            String contentTypeWithoutParams = StringUtils.substringBefore(contentType, ";");
+
+            // Some servers don't understand what a markdown file is
+            if (contentTypeWithoutParams.equals("application/octet-stream")) {
+                if (uriPath.endsWith(".md")) {
+                    // This is a markdown file, which we want to keep
+                    return new ResponseFilterResult.AcceptIfPlainText("text/markdown");
+                }
+                else if (uriPath.endsWith(".pdf")) {
+                    // This is a text file, which we want to keep
+                    return new ResponseFilterResult.AcceptWithContentType("application/pdf");
+                }
+            }
+
+            return new ResponseFilterResult.Reject();
         }
 
         // If the format is binary, we don't want to translate it if the response is truncated
         if (response.truncated() != WarcTruncationReason.NOT_TRUNCATED && ContentTypes.isBinary(contentType)) {
-            return false;
+            return new ResponseFilterResult.Reject();
         }
 
-        return true;
+        return new ResponseFilterResult.Accept();
     }
 
     /** Check X-Robots-Tag header tag to see if we are allowed to index this page.
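Some servers mislabel markdown and PDF payloads as application/octet-stream, which is why the hunk above falls back to the URL path extension. Condensed into a standalone helper for reference (illustrative; the commit inlines this logic):

    // Sketch of the extension-based salvage for application/octet-stream responses.
    static ResponseFilterResult classifyOctetStream(String uriPath) {
        if (uriPath.endsWith(".md"))  return new ResponseFilterResult.AcceptIfPlainText("text/markdown");       // keep if the body is valid text
        if (uriPath.endsWith(".pdf")) return new ResponseFilterResult.AcceptWithContentType("application/pdf"); // keep outright
        return new ResponseFilterResult.Reject();
    }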
@@ -277,7 +302,8 @@ public record SlopCrawlDataRecord(String domain,
         try (var table = new SlopTable(path)) {
             ShortColumn.Reader statusReader = statusColumn.open(table);
             while (statusReader.hasRemaining()) {
-                if (statusReader.get() == 200) {
+                int status = statusReader.get();
+                if (status == 200 || status == 206) {
                     cnt++;
                 }
             }
@@ -323,7 +349,7 @@ public record SlopCrawlDataRecord(String domain,
             headerColumnWriter.put(record.headers);
         }
 
-        public void write(String domain, WarcResponse response) throws IOException {
+        public void write(String domain, ResponseFilterResult filterStatus, WarcResponse response) throws IOException {
 
             HttpFetchResult result = HttpFetchResult.importWarc(response);
             if (!(result instanceof HttpFetchResult.ResultOk fetchOk)) {
@@ -346,6 +372,21 @@ public record SlopCrawlDataRecord(
                 contentType = "";
             }
 
+            switch (filterStatus) {
+                case ResponseFilterResult.AcceptWithContentType(String ct) -> contentType = ct;
+                case ResponseFilterResult.AcceptIfPlainText(String ct) -> {
+                    try {
+                        // Parse the body as UTF-8
+                        new String(bodyBytes, StandardCharsets.UTF_8);
+                        contentType = ct;
+                    }
+                    catch (RuntimeException ex) { // UTF-8 decoding failed
+                        return;
+                    }
+                }
+                default -> {}
+            }
+
             boolean hasCookies = false;
 
             String headersStr;
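One hedged aside on the `AcceptIfPlainText` branch above: `new String(bodyBytes, StandardCharsets.UTF_8)` substitutes U+FFFD for malformed byte sequences rather than throwing, so the `catch (RuntimeException ex)` arm acts as a belt-and-suspenders guard rather than a strict validity check. A decoder configured to REPORT errors rejects invalid input explicitly; a self-contained sketch, not part of the commit:

    import java.nio.ByteBuffer;
    import java.nio.charset.CharacterCodingException;
    import java.nio.charset.CodingErrorAction;
    import java.nio.charset.StandardCharsets;

    class Utf8Check {
        // Returns true only if the bytes are well-formed UTF-8.
        static boolean isValidUtf8(byte[] bytes) {
            try {
                StandardCharsets.UTF_8.newDecoder()
                        .onMalformedInput(CodingErrorAction.REPORT)
                        .onUnmappableCharacter(CodingErrorAction.REPORT)
                        .decode(ByteBuffer.wrap(bytes));
                return true;
            } catch (CharacterCodingException e) {
                return false;
            }
        }
    }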
@@ -117,6 +117,100 @@ class CrawlerRetreiverTest {
         }
     }
 
+
+    @Test
+    public void verifyFileFormatSupport() throws IOException {
+        List<String> urls = List.of(
+                "https://www.marginalia.nu/junk/test.pdf",
+                "https://www.marginalia.nu/junk/test.md"
+        );
+
+        var specs = CrawlerMain.CrawlSpecRecord
+                .builder()
+                .crawlDepth(5)
+                .domain("www.marginalia.nu")
+                .urls(urls)
+                .build();
+        Path tempFile = null;
+        Path slopFile = null;
+        try {
+            tempFile = Files.createTempFile("crawling-process", "warc");
+            slopFile = Files.createTempFile("crawling-process", ".slop.zip");
+
+            doCrawl(tempFile, specs);
+
+            Set<String> requests = new HashSet<>();
+            Set<String> responses = new HashSet<>();
+
+            // Inspect the WARC file
+            try (var reader = new WarcReader(tempFile)) {
+                reader.forEach(record -> {
+                    if (record instanceof WarcRequest req) {
+                        requests.add(req.target());
+                        System.out.println(req.type() + ":" + req.target());
+                    }
+                    else if (record instanceof WarcResponse rsp) {
+                        responses.add(rsp.target());
+                        try {
+                            System.out.println(rsp.type() + ":" + rsp.target() + ":" + rsp.http().contentType());
+                        } catch (IOException e) {
+                            throw new RuntimeException(e);
+                        }
+                    }
+                    else {
+                        System.out.println(record.type());
+                    }
+                });
+            }
+
+            for (var url : urls) {
+                assertTrue(requests.contains(url), "Should have requested " + url);
+            }
+            assertEquals(requests, responses);
+
+            // Convert the WARC file to a Slop file
+            SlopCrawlDataRecord
+                    .convertWarc("www.marginalia.nu", new UserAgent("test.marginalia.nu", "test.marginalia.nu"), tempFile, slopFile);
+
+            CrawledDomain domain = null;
+            Map<String, CrawledDocument> documents = new HashMap<>();
+
+            // Extract the contents of the Slop file
+            try (var stream = SerializableCrawlDataStream.openDataStream(slopFile)) {
+                while (stream.hasNext()) {
+                    var doc = stream.next();
+                    if (doc instanceof CrawledDomain dr) {
+                        assertNull(domain);
+                        domain = dr;
+                    }
+                    else if (doc instanceof CrawledDocument dc) {
+                        System.out.println(dc.url + "\t" + dc.crawlerStatus + "\t" + dc.httpStatus);
+                        documents.put(dc.url, dc);
+                    }
+                }
+            } catch (Exception e) {
+                throw new RuntimeException(e);
+            }
+
+            for (var url : urls) {
+                // Verify we have the downloaded files in the Slop file
+                assertNotNull(domain);
+                var fetchedDoc = documents.get(url);
+                assertNotNull(fetchedDoc, "Should have a document for " + url);
+                assertEquals(url, fetchedDoc.url);
+                assertTrue(fetchedDoc.httpStatus == 200 || fetchedDoc.httpStatus == 206, "Should be 200 or 206 for " + url);
+                assertTrue(fetchedDoc.documentBodyBytes.length > 32, "Should have a body for " + url);
+            }
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        } finally {
+            if (tempFile != null)
+                Files.deleteIfExists(tempFile);
+            if (slopFile != null)
+                Files.deleteIfExists(slopFile);
+        }
+    }
+
     @Test
     public void testWarcOutputNoKnownUrls() throws IOException {
         var specs = CrawlerMain.CrawlSpecRecord
@@ -1,6 +1,7 @@
 package nu.marginalia.extractor;
 
 import com.google.inject.Inject;
+import nu.marginalia.process.control.ProcessHeartbeat;
 import nu.marginalia.process.log.WorkLog;
 import nu.marginalia.process.log.WorkLogEntry;
 import nu.marginalia.slop.SlopCrawlDataRecord;
@@ -20,17 +21,18 @@ import java.nio.file.StandardCopyOption;
 import java.nio.file.StandardOpenOption;
 import java.nio.file.attribute.PosixFilePermissions;
 import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
+import java.util.*;
 
 public class SampleDataExporter {
     private final FileStorageService storageService;
+    private final ProcessHeartbeat processHeartbeat;
+
     @Inject
-    public SampleDataExporter(FileStorageService storageService) {
+    public SampleDataExporter(FileStorageService storageService, ProcessHeartbeat processHeartbeat) {
         this.storageService = storageService;
+        this.processHeartbeat = processHeartbeat;
     }
 
     public void export(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name) throws SQLException, IOException {
         FileStorage destStorage = storageService.getStorage(destId);
         Path inputDir = storageService.getStorage(crawlId).asPath();
@@ -59,12 +61,6 @@ public class SampleDataExporter {
         Path newCrawlerLogFile = Files.createTempFile(destStorage.asPath(), "crawler", ".log",
                 PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
 
-        try (var bw = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
-            for (var item : entriesAll) {
-                bw.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
-            }
-        }
-
         Path newManifestJsonFile = Files.createTempFile(destStorage.asPath(), "manifest", ".json",
                 PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
         Files.writeString(newManifestJsonFile, " { \"description\": \"" + name.replace("[\"\\]", "_") + "\",\n \"type\": \"CRAWL_DATA\" }\n");
@@ -72,29 +68,38 @@ public class SampleDataExporter {
         var tmpTarFile = Files.createTempFile(destStorage.asPath(), "data", ".tar",
                 PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
 
-        try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))) {
-            for (var item : entriesAll) {
+        try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING));
+             var logWriter = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
+             var hb = processHeartbeat.createAdHocTaskHeartbeat("Generating Sample")
+        ) {
+            for (var item : hb.wrap("Scanning", entriesAll)) {
                 Path crawlDataPath = inputDir.resolve(item.relPath());
                 if (!Files.exists(crawlDataPath)) continue;
 
                 if (StringUtils.isBlank(ctFilter)) {
                     addFileToTar(stream, crawlDataPath, item.relPath());
+                    logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
                 }
                 else /* filter != null */ {
-                    boolean didFilterData = false;
+                    Path filteredData = null;
                     try {
-                        crawlDataPath = filterEntries(crawlDataPath, ctFilter);
-                        didFilterData = true;
-                        addFileToTar(stream, crawlDataPath, item.relPath());
+                        filteredData = filterEntries(crawlDataPath, ctFilter);
+                        addFileToTar(stream, filteredData, item.relPath());
+                        logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
+                    }
+                    catch (NoSuchElementException ex) {
+                        // Ignore
                     }
                     finally {
-                        if (didFilterData) {
-                            Files.deleteIfExists(crawlDataPath);
+                        if (filteredData != null) {
+                            Files.deleteIfExists(filteredData);
                         }
                     }
                 }
             }
 
+            logWriter.flush();
+
             addFileToTar(stream, newCrawlerLogFile, "crawler.log");
             addFileToTar(stream, newManifestJsonFile, "marginalia-manifest.json");
         }
@@ -106,34 +111,44 @@ public class SampleDataExporter {
         Files.move(tmpTarFile, destStorage.asPath().resolve("crawl-data.tar"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
     }
 
-    /** Filters the entries in the crawl data file based on the content type.
-     * @param crawlDataPath The path to the crawl data file.
-     * @param contentTypeFilter The content type to filter by.
-     * @return The path to the filtered crawl data file, or null if an error occurred.
-     */
-    private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException {
+    /** Filters the entries in the crawl data file based on the content type. */
+    private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException, NoSuchElementException {
         Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered");
         Path tempFile = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered.slop.zip");
 
+        // We may have debris from a previous run, so let's clean it up
+        if (Files.isDirectory(tempDir)) {
+            FileUtils.deleteDirectory(tempDir.toFile());
+        }
         Files.createDirectory(tempDir);
 
+        boolean wroteEntry = false;
+
         try (var writer = new SlopCrawlDataRecord.Writer(tempDir);
             var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) {
                 @Override
                 public boolean filter(String url, int status, String contentType) {
-                    if (contentTypeFilter.equals(contentType))
-                        return true;
-                    else if (contentType.startsWith("x-marginalia/"))
-                        // This is a metadata entry, typically domain or redirect information
-                        // let's keep those to not confuse the consumer of the data, which might
-                        // expect at least the domain summary
-                        return true;
-                    return false;
+                    return Objects.equals(StringUtils.substringBefore(contentType, ';'), contentTypeFilter)
+                        || contentType.startsWith("x-marginalia/"); // metadata records
                 }
             }
         ) {
 
             while (reader.hasRemaining()) {
-                writer.write(reader.get());
+                var entry = reader.get();
+                writer.write(entry);
+
+                wroteEntry = wroteEntry || Objects.equals(StringUtils.substringBefore(entry.contentType(), ';'), contentTypeFilter);
+            }
+        }
+        catch (Exception ex) {
+            FileUtils.deleteDirectory(tempDir.toFile());
+            throw ex;
+        }
+
+        try {
+            if (!wroteEntry) {
+                throw new NoSuchElementException("No relevant entries");
             }
 
             SlopTablePacker.packToSlopZip(tempDir, tempFile);
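As a quick reference for the rewrite above: the reader-side predicate now strips MIME parameters before comparing, always lets `x-marginalia/*` metadata records (domain summaries, redirects) through, and the method signals an empty result by throwing `NoSuchElementException`, which the caller in `export` swallows to skip that crawl set. A compact restatement of the predicate, assuming commons-lang3 `StringUtils` as imported above (the wrapper class is illustrative):

    import java.util.Objects;
    import org.apache.commons.lang3.StringUtils;

    class FilterPredicate {
        // Equivalent of the new filter() body, for reference; url and status
        // mirror the filter(String, int, String) signature but are unused here.
        static boolean keep(String url, int status, String contentType, String contentTypeFilter) {
            return Objects.equals(StringUtils.substringBefore(contentType, ";"), contentTypeFilter)
                    || contentType.startsWith("x-marginalia/"); // metadata records (domain/redirect entries)
        }
    }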
@@ -90,6 +90,7 @@ public class ApiSearchOperator {
                 url.getTitle(),
                 url.getDescription(),
                 sanitizeNaN(url.rankingScore, -100),
+                url.getShortFormat(),
                 details
         );
     }
@@ -8,14 +8,16 @@ public class ApiSearchResult {
     public String title;
     public String description;
     public double quality;
+    public String format; // "pdf", "html", "text", etc.
 
     public List<List<ApiSearchResultQueryDetails>> details = new ArrayList<>();
 
-    public ApiSearchResult(String url, String title, String description, double quality, List<List<ApiSearchResultQueryDetails>> details) {
+    public ApiSearchResult(String url, String title, String description, double quality, String format, List<List<ApiSearchResultQueryDetails>> details) {
         this.url = url;
         this.title = title;
         this.description = description;
         this.quality = quality;
+        this.format = format;
         this.details = details;
     }
 
@@ -73,6 +73,8 @@ public class UrlDetails implements Comparable<UrlDetails> {
                 return "HTML 5";
             case "PLAIN":
                 return "Plain Text";
+            case "PDF":
+                return "PDF";
             default:
                 return "?";
         }
@@ -17,7 +17,7 @@ public class SearchQueryParamFactory {
     static final RpcQueryLimits defaultLimits = RpcQueryLimits.newBuilder()
             .setResultsTotal(100)
             .setResultsByDomain(5)
-            .setTimeoutMs(200)
+            .setTimeoutMs(250)
             .setFetchSize(8192)
             .build();
 
@@ -78,6 +78,8 @@ public class UrlDetails implements Comparable<UrlDetails> {
                 return "HTML 5";
             case "PLAIN":
                 return "Plain Text";
+            case "PDF":
+                return "PDF";
             default:
                 return "?";
         }
@@ -92,13 +94,24 @@ public class UrlDetails implements Comparable<UrlDetails> {
     public String displayTitle() {
         StringBuilder sb = new StringBuilder();
 
+        buildDisplayTitle(sb, title);
+
+        if (sb.isEmpty()) {
+            buildDisplayTitle(sb, url.toDisplayString());
+        }
+
+        return sb.toString();
+    }
+
+    private void buildDisplayTitle(StringBuilder sb, String str) {
+
         int distSinceBreak = 0;
 
         char c = ' ';
         int prevC = ' ';
-        for (int i = 0; i < title.length(); i++) {
+        for (int i = 0; i < str.length(); i++) {
             prevC = c;
-            c = title.charAt(i);
+            c = str.charAt(i);
 
             if (Character.isSpaceChar(c)) {
                 distSinceBreak = 0;
@@ -135,8 +148,6 @@ public class UrlDetails implements Comparable<UrlDetails> {
                 sb.append(c);
             }
         }
-
-        return sb.toString();
     }
 
     /** Helper that inserts hyphenation hints and escapes
@@ -21,6 +21,9 @@
         </h2>
 
         <div class="text-sm mt-1">
+            @if ("PDF".equals(result.first.format))
+                <i title="PDF" class="fas fa-file-pdf text-red-500"></i>
+            @endif
             <a class="text-liteblue dark:text-blue-200 underline break-all" href="${result.first.url.toString()}"
                rel="noopener noreferrer" tabindex="-1">$unsafe{result.first.displayUrl()}</a>
         </div>
@@ -53,10 +56,13 @@
     <div class="flex mt-2 text-sm flex flex-col space-y-2">
         <p class="text-black dark:text-white ${result.colorScheme.backgroundColor} p-1 rounded break-words hyphens-auto">Also from ${result.getDomain().toString()}:</p>
 
-        <ul class="pl-2 mt-2 underline text-liteblue dark:text-blue-200">
+        <ul class="pl-2 mt-2 text-liteblue dark:text-blue-200">
             @for(UrlDetails item : result.rest)
                 <li class="-indent-4 pl-4 mb-1 break-words hyphens-auto">
-                    <a href="${item.url.toString()}" rel="noopener noreferrer">$unsafe{item.displayTitle()}</a>
+                    @if ("PDF".equals(item.format))
+                        <i title="PDF" class="fas fa-file-pdf text-red-500"></i>
+                    @endif
+                    <a href="${item.url.toString()}" class="underline" rel="noopener noreferrer">$unsafe{item.displayTitle()}</a>
                 </li>
             @endfor
         </ul>
@@ -74,6 +80,9 @@
         @if (DocumentFlags.PlainText.isPresent(result.getFirst().resultItem.encodedDocMetadata))
             <span class="px-1 bg-blue-100 text-blue-700 dark:border dark:border-blue-600 dark:text-blue-400 dark:bg-black rounded">Plain text</span>
         @endif
+        @if (DocumentFlags.PdfFile.isPresent(result.getFirst().resultItem.encodedDocMetadata))
+            <span class="px-1 bg-blue-100 text-blue-700 dark:border dark:border-blue-600 dark:text-blue-400 dark:bg-black rounded">PDF File</span>
+        @endif
         @if (DocumentFlags.GeneratorForum.isPresent(result.getFirst().resultItem.encodedDocMetadata))
             <span class="px-1 bg-blue-100 text-blue-700 dark:border dark:border-blue-600 dark:text-blue-400 dark:bg-black rounded">Forum</span>
         @endif
@@ -4,4 +4,7 @@
 2025-01-07: Deploy executor.
 2025-04-24: Deploy executor.
 2025-04-24: Deploy assistant.
-2025-04-24: Deploy qs, search and api-services.
+2025-05-04: Deploy qs, search and api-services.
+2025-05-05: Deploy executor partition 4.
+2025-05-05: Deploy control.
+2025-05-08: Deploy assistant.
@@ -236,7 +236,7 @@ dependencyResolutionManagement {
         library('jetty-util','org.eclipse.jetty','jetty-util').version('9.4.54.v20240208')
         library('jetty-servlet','org.eclipse.jetty','jetty-servlet').version('9.4.54.v20240208')
 
-        library('slop', 'nu.marginalia', 'slop').version('0.0.10-SNAPSHOT')
+        library('slop', 'nu.marginalia', 'slop').version('0.0.11-SNAPSHOT')
         library('jooby-netty','io.jooby','jooby-netty').version(joobyVersion)
         library('jooby-jte','io.jooby','jooby-jte').version(joobyVersion)
         library('jooby-apt','io.jooby','jooby-apt').version(joobyVersion)
@@ -244,6 +244,7 @@ dependencyResolutionManagement {
         library('wiremock', 'org.wiremock','wiremock').version('3.11.0')
         library('jte','gg.jte','jte').version('3.1.15')
 
+        library('pdfbox', 'org.apache.pdfbox', 'pdfbox').version('3.0.5')
         bundle('jetty', ['jetty-server', 'jetty-util', 'jetty-servlet'])
 
         bundle('slf4j', ['slf4j.api', 'log4j.api', 'log4j.core', 'log4j.slf4j'])