Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-10-06 07:32:38 +02:00

Compare commits: deploy-017 ... deploy-020 (72 commits)
Commits (SHA1):

c34ff6d6c3, 32780967d8, 7330bc489d, ea23f33738, 4a8a028118, a25bc647be,
a720dba3a2, 284f382867, a80717f138, d6da715fa4, c1ec7aa491, 3daf37e283,
44a774d3a8, 597aeaf496, 06df7892c2, dc26854268, 9f16326cba, ed66d0b3a7,
c3afc82dad, 08e25e539e, 4946044dd0, edf382e1c5, 644cba32e4, 34b76390b2,
43cd507971, cc40e99fdc, 8a944cf4c6, 1c128e6d82, be039d1a8c, 4edc0d3267,
890f521d0d, b1814a30f7, f59a9eb025, 599534806b, 7e8253dac7, 97a6780ea3,
eb634beec8, 269ebd1654, 39ce40bfeb, c187b2e1c1, 42eaa4588b, 4f40a5fbeb,
3f3d42bc01, 61c8d53e1b, a7a3d85be9, 306232fb54, 5aef844f0d, d56b5c828a,
ab58a4636f, 00be269238, 879e6a9424, fba3455732, 14283da7f5, 93df4d1fc0,
b12a0b998c, 3b6f4e321b, 8428111771, e9fd4415ef, 4c95c3dcad, c5281536fb,
4431dae7ac, 4df4d0a7a8, 9f05083b94, fc92e9b9c0, 328fb5d927, 36889950e8,
c96a94878b, 1c57d7d73a, a443d22356, aa59d4afa4, df0f18d0e7, 0819d46f97
```diff
@@ -0,0 +1,24 @@
+package nu.marginalia.model;
+
+public enum DocumentFormat {
+    PLAIN(0, 1, "text"),
+    PDF(0, 1, "pdf"),
+    UNKNOWN(0, 1, "???"),
+    HTML123(0, 1, "html"),
+    HTML4(-0.1, 1.05, "html"),
+    XHTML(-0.1, 1.05, "html"),
+    HTML5(0.5, 1.1, "html");
+
+    /** Used to tune quality score */
+    public final double offset;
+    /** Used to tune quality score */
+    public final double scale;
+    public final String shortFormat;
+
+    DocumentFormat(double offset, double scale, String shortFormat) {
+        this.offset = offset;
+        this.scale = scale;
+        this.shortFormat = shortFormat;
+    }
+
+}
```
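Note: the offset and scale fields feed the quality valuation (see the DocumentValuator signature change further down), but the combining formula itself is not part of this diff. Conceptually the per-format tuning behaves like this hypothetical sketch:

```java
// Hypothetical illustration only; the real combination lives in
// DocumentValuator, which this diff does not show.
static double tunedQuality(DocumentFormat format, double baseQuality) {
    return baseQuality * format.scale + format.offset;
}
```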
```diff
@@ -112,14 +112,6 @@ public class EdgeDomain implements Serializable {
         return topDomain;
     }
 
-    public String getDomainKey() {
-        int cutPoint = topDomain.indexOf('.');
-        if (cutPoint < 0) {
-            return topDomain;
-        }
-        return topDomain.substring(0, cutPoint).toLowerCase();
-    }
-
     /** If possible, try to provide an alias domain,
      * i.e. a domain name that is very likely to link to this one
      * */
```
```diff
@@ -28,6 +28,8 @@ public enum HtmlFeature {
 
     GA_SPAM("special:gaspam"),
 
+    PDF("format:pdf"),
+
     /** For fingerprinting and ranking */
     OPENGRAPH("special:opengraph"),
     OPENGRAPH_IMAGE("special:opengraph:image"),
```
```diff
@@ -1,22 +0,0 @@
-package nu.marginalia.model.html;
-
-// This class really doesn't belong anywhere, but will squat here for now
-public enum HtmlStandard {
-    PLAIN(0, 1),
-    UNKNOWN(0, 1),
-    HTML123(0, 1),
-    HTML4(-0.1, 1.05),
-    XHTML(-0.1, 1.05),
-    HTML5(0.5, 1.1);
-
-    /** Used to tune quality score */
-    public final double offset;
-    /** Used to tune quality score */
-    public final double scale;
-
-    HtmlStandard(double offset, double scale) {
-        this.offset = offset;
-        this.scale = scale;
-    }
-
-}
```
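Note: the deleted HtmlStandard enum is superseded by the DocumentFormat enum added at the top of this diff, which keeps the same offset/scale tuning values per HTML flavor and adds a PDF constant plus a shortFormat label. Most of the remaining hunks are the mechanical HtmlStandard to DocumentFormat rename.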
```diff
@@ -9,7 +9,7 @@ public enum DocumentFlags {
     GeneratorForum,
     GeneratorWiki,
     Sideloaded,
-    Unused7,
+    PdfFile,
     Unused8,
     ;
 
```
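Note: renaming the Unused7 slot to PdfFile, rather than appending a new constant, presumably keeps the flag's ordinal (and thus its bit position in already-encoded document metadata) stable for documents indexed before this change.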
```diff
@@ -8,14 +8,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
 
 class EdgeDomainTest {
 
-    @Test
-    public void testSkepdic() throws URISyntaxException {
-        var domain = new EdgeUrl("http://www.skepdic.com/astrology.html");
-        assertEquals("skepdic", domain.getDomain().getDomainKey());
-        var domain2 = new EdgeUrl("http://skepdic.com/astrology.html");
-        assertEquals("skepdic", domain2.getDomain().getDomainKey());
-    }
-
     @Test
     public void testHkDomain() throws URISyntaxException {
         var domain = new EdgeUrl("http://l7072i3.l7c.net");
```
```diff
@@ -39,7 +39,8 @@
         </Appenders>
         <Loggers>
             <Logger name="org.apache.zookeeper" level="WARN" />
-
+            <Logger name="org.apache.pdfbox" level="ERROR" />
+            <Logger name="org.apache.fontbox.ttf" level="ERROR" />
             <Root level="info">
                 <AppenderRef ref="Console"/>
                 <AppenderRef ref="ProcessConsole"/>
```
```diff
@@ -72,7 +72,8 @@
         </Appenders>
         <Loggers>
             <Logger name="org.apache.zookeeper" level="WARN" />
-
+            <Logger name="org.apache.pdfbox" level="ERROR" />
+            <Logger name="org.apache.fontbox.ttf" level="ERROR" />
             <Root level="info">
                 <AppenderRef ref="ConsoleInfo"/>
                 <AppenderRef ref="ConsoleWarn"/>
```
```diff
@@ -37,7 +37,8 @@
         </Appenders>
         <Loggers>
             <Logger name="org.apache.zookeeper" level="WARN" />
-
+            <Logger name="org.apache.pdfbox" level="ERROR" />
+            <Logger name="org.apache.fontbox.ttf" level="ERROR" />
             <Root level="info">
                 <AppenderRef ref="ConsoleInfo"/>
                 <AppenderRef ref="ConsoleWarn"/>
```
```diff
@@ -25,9 +25,9 @@ dependencies {
 
     implementation project(':code:execution:api')
     implementation project(':code:processes:crawling-process:ft-content-type')
+    implementation project(':third-party:rssreader')
 
     implementation libs.jsoup
-    implementation project(':third-party:rssreader')
     implementation libs.opencsv
     implementation libs.slop
     implementation libs.sqlite
```
```diff
@@ -57,8 +57,6 @@ dependencies {
     implementation libs.bundles.gson
     implementation libs.bundles.mariadb
 
-
-
     testImplementation libs.bundles.slf4j.test
     testImplementation libs.bundles.junit
     testImplementation libs.mockito
```
```diff
@@ -0,0 +1,126 @@
+package nu.marginalia.domsample;
+
+import com.google.inject.Inject;
+import com.zaxxer.hikari.HikariDataSource;
+import jakarta.inject.Named;
+import nu.marginalia.domsample.db.DomSampleDb;
+import nu.marginalia.livecapture.BrowserlessClient;
+import nu.marginalia.service.module.ServiceConfiguration;
+import org.apache.commons.lang3.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+public class DomSampleService {
+    private final DomSampleDb db;
+    private final HikariDataSource mariadbDataSource;
+    private final URI browserlessURI;
+
+    private static final Logger logger = LoggerFactory.getLogger(DomSampleService.class);
+
+    @Inject
+    public DomSampleService(DomSampleDb db,
+                            HikariDataSource mariadbDataSource,
+                            @Named("browserless-uri") String browserlessAddress,
+                            ServiceConfiguration serviceConfiguration)
+            throws URISyntaxException
+    {
+        this.db = db;
+        this.mariadbDataSource = mariadbDataSource;
+
+        if (StringUtils.isEmpty(browserlessAddress) || serviceConfiguration.node() > 1) {
+            logger.warn("Live capture service will not run");
+            browserlessURI = null;
+        }
+        else {
+            browserlessURI = new URI(browserlessAddress);
+        }
+    }
+
+    public void start() {
+        if (browserlessURI == null) {
+            logger.warn("DomSampleService is not enabled due to missing browserless URI or multi-node configuration");
+            return;
+        }
+
+        Thread.ofPlatform().daemon().start(this::run);
+    }
+
+    public void syncDomains() {
+        Set<String> dbDomains = new HashSet<>();
+
+        logger.info("Fetching domains from database...");
+
+        try (var conn = mariadbDataSource.getConnection();
+             var stmt = conn.prepareStatement("""
+                     SELECT DOMAIN_NAME
+                     FROM EC_DOMAIN
+                     WHERE NODE_AFFINITY>0
+                     """)
+        ) {
+            var rs = stmt.executeQuery();
+            while (rs.next()) {
+                dbDomains.add(rs.getString("DOMAIN_NAME"));
+            }
+        } catch (Exception e) {
+            throw new RuntimeException("Failed to sync domains", e);
+        }
+
+        logger.info("Found {} domains in database", dbDomains.size());
+
+        db.syncDomains(dbDomains);
+
+        logger.info("Synced domains to sqlite");
+    }
+
+    public void run() {
+
+        try (var client = new BrowserlessClient(browserlessURI)) {
+
+            while (!Thread.currentThread().isInterrupted()) {
+
+                try {
+                    // Grace sleep in case we're operating on an empty domain list
+                    TimeUnit.SECONDS.sleep(15);
+
+                    syncDomains();
+                    var domains = db.getScheduledDomains();
+
+                    for (var domain : domains) {
+                        updateDomain(client, domain);
+                    }
+                } catch (InterruptedException e) {
+                    Thread.currentThread().interrupt();
+                    logger.info("DomSampleService interrupted, stopping...");
+                    return;
+                } catch (Exception e) {
+                    logger.error("Error in DomSampleService run loop", e);
+                }
+            }
+
+        }
+    }
+
+    private void updateDomain(BrowserlessClient client, String domain) {
+        var rootUrl = "https://" + domain + "/";
+        try {
+            var content = client.annotatedContent(rootUrl,
+                    BrowserlessClient.GotoOptions.defaultValues());
+
+            if (content.isPresent()) {
+                db.saveSample(domain, rootUrl, content.get());
+            }
+        } catch (Exception e) {
+            logger.error("Failed to process domain: " + domain, e);
+        }
+        finally {
+            db.flagDomainAsFetched(domain);
+        }
+    }
+
+}
```
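The new service is a single daemon loop: re-sync the schedule from MariaDB (EC_DOMAIN rows with NODE_AFFINITY>0), then walk the SQLite-backed schedule and capture one annotated sample per domain. A minimal usage sketch, assuming the Guice wiring used elsewhere in the codebase (the bootstrap itself is not part of this diff, and `injector` here is assumed):

```java
// Hypothetical wiring sketch; assumes a configured Guice injector.
DomSampleService service = injector.getInstance(DomSampleService.class);
service.start(); // logs a warning and returns if no browserless URI is configured
```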
```diff
@@ -0,0 +1,176 @@
+package nu.marginalia.domsample.db;
+
+import nu.marginalia.WmsaHome;
+import org.jsoup.Jsoup;
+
+import java.nio.file.Path;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.SQLException;
+import java.util.*;
+
+public class DomSampleDb implements AutoCloseable {
+    private static final String dbFileName = "dom-sample.db";
+    private final Connection connection;
+
+    public DomSampleDb() throws SQLException{
+        this(WmsaHome.getDataPath().resolve(dbFileName));
+    }
+
+    public DomSampleDb(Path dbPath) throws SQLException {
+        String dbUrl = "jdbc:sqlite:" + dbPath.toAbsolutePath();
+
+        connection = DriverManager.getConnection(dbUrl);
+
+        try (var stmt = connection.createStatement()) {
+            stmt.executeUpdate("CREATE TABLE IF NOT EXISTS samples (url TEXT PRIMARY KEY, domain TEXT, sample BLOB, requests BLOB, accepted_popover BOOLEAN DEFAULT FALSE)");
+            stmt.executeUpdate("CREATE INDEX IF NOT EXISTS domain_index ON samples (domain)");
+            stmt.executeUpdate("CREATE TABLE IF NOT EXISTS schedule (domain TEXT PRIMARY KEY, last_fetch TIMESTAMP DEFAULT NULL)");
+            stmt.execute("PRAGMA journal_mode=WAL");
+        }
+
+    }
+
+    public void syncDomains(Set<String> domains) {
+        Set<String> currentDomains = new HashSet<>();
+        try (var stmt = connection.prepareStatement("SELECT domain FROM schedule")) {
+            var rs = stmt.executeQuery();
+            while (rs.next()) {
+                currentDomains.add(rs.getString("domain"));
+            }
+        } catch (SQLException e) {
+            throw new RuntimeException("Failed to sync domains", e);
+        }
+
+        Set<String> toRemove = new HashSet<>(currentDomains);
+        Set<String> toAdd = new HashSet<>(domains);
+
+        toRemove.removeAll(domains);
+        toAdd.removeAll(currentDomains);
+
+        try (var removeStmt = connection.prepareStatement("DELETE FROM schedule WHERE domain = ?");
+             var addStmt = connection.prepareStatement("INSERT OR IGNORE INTO schedule (domain) VALUES (?)")
+        ) {
+            for (String domain : toRemove) {
+                removeStmt.setString(1, domain);
+                removeStmt.executeUpdate();
+            }
+
+            for (String domain : toAdd) {
+                addStmt.setString(1, domain);
+                addStmt.executeUpdate();
+            }
+        } catch (SQLException e) {
+            throw new RuntimeException("Failed to remove domains", e);
+        }
+    }
+
+    public List<String> getScheduledDomains() {
+        List<String> domains = new ArrayList<>();
+        try (var stmt = connection.prepareStatement("SELECT domain FROM schedule ORDER BY last_fetch IS NULL DESC, last_fetch ASC")) {
+            var rs = stmt.executeQuery();
+            while (rs.next()) {
+                domains.add(rs.getString("domain"));
+            }
+        } catch (SQLException e) {
+            throw new RuntimeException("Failed to get scheduled domains", e);
+        }
+        return domains;
+    }
+
+    public void flagDomainAsFetched(String domain) {
+        try (var stmt = connection.prepareStatement("INSERT OR REPLACE INTO schedule (domain, last_fetch) VALUES (?, CURRENT_TIMESTAMP)")) {
+            stmt.setString(1, domain);
+            stmt.executeUpdate();
+        } catch (SQLException e) {
+            throw new RuntimeException("Failed to flag domain as fetched", e);
+        }
+    }
+
+
+    public record Sample(String url, String domain, String sample, String requests, boolean acceptedPopover) {}
+
+    public List<Sample> getSamples(String domain) throws SQLException {
+        List<Sample> samples = new ArrayList<>();
+
+        try (var stmt = connection.prepareStatement("""
+                SELECT url, sample, requests, accepted_popover
+                FROM samples
+                WHERE domain = ?
+                """))
+        {
+            stmt.setString(1, domain);
+            var rs = stmt.executeQuery();
+            while (rs.next()) {
+                samples.add(
+                        new Sample(
+                                rs.getString("url"),
+                                domain,
+                                rs.getString("sample"),
+                                rs.getString("requests"),
+                                rs.getBoolean("accepted_popover")
+                        )
+                );
+            }
+        }
+        return samples;
+    }
+
+    public void saveSample(String domain, String url, String rawContent) throws SQLException {
+        var doc = Jsoup.parse(rawContent);
+
+        var networkRequests = doc.getElementById("marginalia-network-requests");
+
+        boolean acceptedPopover = false;
+
+        StringBuilder requestTsv = new StringBuilder();
+        if (networkRequests != null) {
+
+            acceptedPopover = !networkRequests.getElementsByClass("marginalia-agreed-cookies").isEmpty();
+
+            for (var request : networkRequests.getElementsByClass("network-request")) {
+                String method = request.attr("data-method");
+                String urlAttr = request.attr("data-url");
+                String timestamp = request.attr("data-timestamp");
+
+                requestTsv
+                        .append(method)
+                        .append('\t')
+                        .append(timestamp)
+                        .append('\t')
+                        .append(urlAttr.replace('\n', ' '))
+                        .append("\n");
+            }
+
+            networkRequests.remove();
+        }
+
+        doc.body().removeAttr("id");
+
+        String sample = doc.html();
+
+        saveSampleRaw(domain, url, sample, requestTsv.toString().trim(), acceptedPopover);
+
+    }
+
+    record Request(String url, String method, String timestamp, boolean acceptedPopover) {}
+
+    public void saveSampleRaw(String domain, String url, String sample, String requests, boolean acceptedPopover) throws SQLException {
+        try (var stmt = connection.prepareStatement("""
+                INSERT OR REPLACE
+                INTO samples (domain, url, sample, requests, accepted_popover)
+                VALUES (?, ?, ?, ?, ?)
+                """)) {
+            stmt.setString(1, domain);
+            stmt.setString(2, url);
+            stmt.setString(3, sample);
+            stmt.setString(4, requests);
+            stmt.setBoolean(5, acceptedPopover);
+            stmt.executeUpdate();
+        }
+    }
+
+    public void close() throws SQLException {
+        connection.close();
+    }
+}
```
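A usage sketch for the new store, based only on the API above (the path and domain are illustrative; the same calls appear in the unit tests added later in this diff):

```java
// Illustrative use of DomSampleDb.
try (var db = new DomSampleDb(Path.of("/tmp/dom-sample.db"))) {
    // Mirror the scheduled-domain set: new rows are inserted, stale rows deleted
    db.syncDomains(Set.of("example.com"));

    // Store a sample directly, then read it back
    db.saveSampleRaw("example.com", "https://example.com/", "<html>...</html>", "", false);
    for (var sample : db.getSamples("example.com")) {
        System.out.println(sample.url() + " acceptedPopover=" + sample.acceptedPopover());
    }
}
```

Note that getScheduledDomains() orders never-fetched domains first (last_fetch IS NULL DESC), then oldest fetch first, which is what makes flagDomainAsFetched() rotate a domain to the back of the queue.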
```diff
@@ -8,10 +8,13 @@ import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.net.URI;
+import java.net.URLEncoder;
 import java.net.http.HttpClient;
 import java.net.http.HttpRequest;
 import java.net.http.HttpResponse;
+import java.nio.charset.StandardCharsets;
 import java.time.Duration;
+import java.util.List;
 import java.util.Map;
 import java.util.Optional;
 
```
```diff
@@ -60,6 +63,42 @@ public class BrowserlessClient implements AutoCloseable {
         return Optional.of(rsp.body());
     }
 
+    /** Fetches content with a marginalia hack extension loaded that decorates the DOM with attributes for
+     * certain CSS attributes, to be able to easier identify popovers and other nuisance elements.
+     */
+    public Optional<String> annotatedContent(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
+        Map<String, Object> requestData = Map.of(
+                "url", url,
+                "userAgent", userAgent,
+                "gotoOptions", gotoOptions,
+                "waitForSelector", Map.of("selector", "#marginaliahack", "timeout", 15000)
+        );
+
+        // Launch parameters for the browserless instance to load the extension
+        Map<String, Object> launchParameters = Map.of(
+                "args", List.of("--load-extension=/dom-export")
+        );
+
+        String launchParametersStr = URLEncoder.encode(gson.toJson(launchParameters), StandardCharsets.UTF_8);
+
+        var request = HttpRequest.newBuilder()
+                .uri(browserlessURI.resolve("/content?token="+BROWSERLESS_TOKEN+"&launch="+launchParametersStr))
+                .method("POST", HttpRequest.BodyPublishers.ofString(
+                        gson.toJson(requestData)
+                ))
+                .header("Content-type", "application/json")
+                .build();
+
+        var rsp = httpClient.send(request, HttpResponse.BodyHandlers.ofString());
+
+        if (rsp.statusCode() >= 300) {
+            logger.info("Failed to fetch annotated content for {}, status {}", url, rsp.statusCode());
+            return Optional.empty();
+        }
+
+        return Optional.of(rsp.body());
+    }
+
     public byte[] screenshot(String url, GotoOptions gotoOptions, ScreenshotOptions screenshotOptions)
             throws IOException, InterruptedException {
 
```
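A call sketch for the new method (the URL is illustrative; the same pattern appears in the test added further down):

```java
try (var client = new BrowserlessClient(browserlessURI)) {
    Optional<String> content = client.annotatedContent(
            "https://marginalia.nu/",
            BrowserlessClient.GotoOptions.defaultValues());
    // Empty on HTTP status >= 300; otherwise the DOM as decorated by the
    // /dom-export extension, including the #marginalia-network-requests element
    // that DomSampleDb.saveSample() later strips out into a request log.
    content.ifPresent(System.out::println);
}
```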
```diff
@@ -102,7 +141,7 @@ public class BrowserlessClient implements AutoCloseable {
 
     public record GotoOptions(String waitUntil, long timeout) {
         public static GotoOptions defaultValues() {
-            return new GotoOptions("networkidle2", Duration.ofSeconds(10).toMillis());
+            return new GotoOptions("load", Duration.ofSeconds(10).toMillis());
         }
     }
 
```
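Note: in browserless/puppeteer terms, "load" resolves navigation at the page's load event, while "networkidle2" waits until at most two network connections have been idle for a while. The new default is faster and less prone to timeouts on chatty pages; the annotated-content path synchronizes on the #marginaliahack selector instead.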
```diff
@@ -79,9 +79,17 @@ public class SimpleFeedParser {
             if (!link.isBlank())
                 break;
             var tag = element.getElementsByTag(attr).first();
+
             if (tag != null) {
-                link = tag.text();
+                String linkText = tag.text();
+
+                if (linkText.isBlank()) {
+                    linkText = tag.attr("href");
+                }
+
+                link = linkText;
             }
+
         }
 
         ret.add(new ItemData(title, description, link, pubDate));
```
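The fallback covers feeds that carry the link in an attribute rather than in element text: RSS puts the URL in the element body (`<link>https://…</link>`), while Atom uses `<link href="https://…" rel="alternate"/>`, for which `tag.text()` is blank.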
```diff
@@ -0,0 +1,113 @@
+package nu.marginalia.domsample.db;
+
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.testcontainers.shaded.org.apache.commons.io.FileUtils;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.*;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class DomSampleDbTest {
+    Path tempDir;
+
+    @BeforeEach
+    void setUp() throws Exception {
+        tempDir = Files.createTempDirectory("test");
+    }
+
+    @AfterEach
+    void tearDown() throws IOException {
+        FileUtils.deleteDirectory(tempDir.toFile());
+    }
+
+    @Test
+    public void testSetUp() {
+        var dbPath = tempDir.resolve("test.db");
+        try (var db = new DomSampleDb(dbPath)) {
+        }
+        catch (Exception e) {
+            fail("Failed to set up database: " + e.getMessage());
+        }
+    }
+
+    @Test
+    public void testSyncDomains() {
+        var dbPath = tempDir.resolve("test.db");
+        try (var db = new DomSampleDb(dbPath)) {
+
+            db.syncDomains(Set.of("example.com", "test.com", "foobar.com"));
+            assertEquals(Set.of("example.com", "test.com", "foobar.com"), new HashSet<>(db.getScheduledDomains()));
+            db.syncDomains(Set.of("example.com", "test.com"));
+            assertEquals(Set.of("example.com", "test.com"), new HashSet<>(db.getScheduledDomains()));
+            db.syncDomains(Set.of("foobar.com", "test.com"));
+            assertEquals(Set.of("foobar.com", "test.com"), new HashSet<>(db.getScheduledDomains()));
+        }
+        catch (Exception e) {
+            fail("Failed to sync domains: " + e.getMessage());
+        }
+    }
+
+    @Test
+    public void testFetchDomains() {
+        var dbPath = tempDir.resolve("test.db");
+        try (var db = new DomSampleDb(dbPath)) {
+
+            db.syncDomains(Set.of("example.com", "test.com", "foobar.com"));
+            db.flagDomainAsFetched("example.com");
+            db.flagDomainAsFetched("test.com");
+            db.flagDomainAsFetched("foobar.com");
+            assertEquals(List.of("example.com", "test.com", "foobar.com"), db.getScheduledDomains());
+            db.flagDomainAsFetched("test.com");
+            assertEquals(List.of("example.com", "foobar.com", "test.com"), db.getScheduledDomains());
+        }
+        catch (Exception e) {
+            fail("Failed to sync domains: " + e.getMessage());
+        }
+    }
+
+    @Test
+    public void saveLoadSingle() {
+        var dbPath = tempDir.resolve("test.db");
+        try (var db = new DomSampleDb(dbPath)) {
+            db.saveSampleRaw("example.com", "http://example.com/sample", "sample data", "requests data", true);
+            var samples = db.getSamples("example.com");
+            assertEquals(1, samples.size());
+            var sample = samples.getFirst();
+            assertEquals("example.com", sample.domain());
+            assertEquals("http://example.com/sample", sample.url());
+            assertEquals("sample data", sample.sample());
+            assertEquals("requests data", sample.requests());
+            assertTrue(sample.acceptedPopover());
+        }
+        catch (Exception e) {
+            fail("Failed to save/load sample: " + e.getMessage());
+        }
+    }
+
+    @Test
+    public void saveLoadTwo() {
+        var dbPath = tempDir.resolve("test.db");
+        try (var db = new DomSampleDb(dbPath)) {
+            db.saveSampleRaw("example.com", "http://example.com/sample", "sample data", "r1", true);
+            db.saveSampleRaw("example.com", "http://example.com/sample2", "sample data2", "r2", false);
+            var samples = db.getSamples("example.com");
+            assertEquals(2, samples.size());
+
+            Map<String, String> samplesByUrl = new HashMap<>();
+            for (var sample : samples) {
+                samplesByUrl.put(sample.url(), sample.sample());
+            }
+
+            assertEquals("sample data", samplesByUrl.get("http://example.com/sample"));
+            assertEquals("sample data2", samplesByUrl.get("http://example.com/sample2"));
+        }
+        catch (Exception e) {
+            fail("Failed to save/load sample: " + e.getMessage());
+        }
+    }
+}
```
```diff
@@ -3,17 +3,21 @@ package nu.marginalia.livecapture;
 import com.github.tomakehurst.wiremock.WireMockServer;
 import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
 import nu.marginalia.WmsaHome;
+import nu.marginalia.domsample.db.DomSampleDb;
 import nu.marginalia.service.module.ServiceConfigurationModule;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Tag;
 import org.junit.jupiter.api.Test;
 import org.testcontainers.containers.GenericContainer;
+import org.testcontainers.images.PullPolicy;
 import org.testcontainers.junit.jupiter.Testcontainers;
 import org.testcontainers.utility.DockerImageName;
 
 import java.io.IOException;
 import java.net.URI;
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.util.Map;
 
 import static com.github.tomakehurst.wiremock.client.WireMock.*;
```
```diff
@@ -22,9 +26,14 @@ import static com.github.tomakehurst.wiremock.client.WireMock.*;
 @Testcontainers
 @Tag("slow")
 public class BrowserlessClientTest {
-    static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome"))
+    // Run gradle docker if this image is not available
+    static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("marginalia-browserless"))
             .withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
+            .withImagePullPolicy(PullPolicy.defaultPolicy())
             .withNetworkMode("bridge")
+            .withLogConsumer(frame -> {
+                System.out.print(frame.getUtf8String());
+            })
             .withExposedPorts(3000);
 
     static WireMockServer wireMockServer =
```
```diff
@@ -34,6 +43,7 @@ public class BrowserlessClientTest {
     static String localIp;
 
     static URI browserlessURI;
+    static URI browserlessWssURI;
 
     @BeforeAll
     public static void setup() throws IOException {
```
```diff
@@ -44,6 +54,12 @@ public class BrowserlessClientTest {
                 container.getMappedPort(3000))
         );
 
+        browserlessWssURI = URI.create(String.format("ws://%s:%d/?token=BROWSERLESS_TOKEN",
+                container.getHost(),
+                container.getMappedPort(3000))
+        );
+
+
         wireMockServer.start();
         wireMockServer.stubFor(get("/").willReturn(aResponse().withStatus(200).withBody("Ok")));
 
```
```diff
@@ -85,6 +101,30 @@ public class BrowserlessClientTest {
         }
     }
 
+    @Test
+    public void testAnnotatedContent() throws Exception {
+
+        try (var client = new BrowserlessClient(browserlessURI);
+             DomSampleDb dbop = new DomSampleDb(Path.of("/tmp/dom-sample.db"))
+        ) {
+            var content = client.annotatedContent("https://marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues()).orElseThrow();
+            dbop.saveSample("marginalia.nu", "https://www.thesodacanstove.com/alcohol-stove/how-to-build/", content);
+            System.out.println(content);
+            Assertions.assertFalse(content.isBlank(), "Content should not be empty");
+
+            dbop.getSamples("marginalia.nu").forEach(sample -> {
+                System.out.println("Sample URL: " + sample.url());
+                System.out.println("Sample Content: " + sample.sample());
+                System.out.println("Sample Requests: " + sample.requests());
+                System.out.println("Accepted Popover: " + sample.acceptedPopover());
+            });
+        }
+        finally {
+            Files.deleteIfExists(Path.of("/tmp/dom-sample.db"));
+        }
+
+    }
+
     @Test
     public void testScreenshot() throws Exception {
         try (var client = new BrowserlessClient(browserlessURI)) {
```
```diff
@@ -1,6 +1,7 @@
 package nu.marginalia.api.searchquery.model.results;
 
 import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import org.jetbrains.annotations.NotNull;
 
```
```diff
@@ -161,4 +162,14 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
     public String toString() {
         return "DecoratedSearchResultItem(rawIndexResult=" + this.getRawIndexResult() + ", url=" + this.getUrl() + ", title=" + this.getTitle() + ", description=" + this.getDescription() + ", urlQuality=" + this.getUrlQuality() + ", format=" + this.getFormat() + ", features=" + this.getFeatures() + ", pubYear=" + this.getPubYear() + ", dataHash=" + this.getDataHash() + ", wordsTotal=" + this.getWordsTotal() + ", bestPositions=" + this.getBestPositions() + ", rankingScore=" + this.getRankingScore() + ", resultsFromDomain=" + this.getResultsFromDomain() + ", rankingDetails=" + this.getRankingDetails() + ")";
     }
+
+    public String getShortFormat() {
+        try {
+            var df = DocumentFormat.valueOf(format);
+            return df.shortFormat;
+        }
+        catch (IllegalArgumentException e) {
+            return DocumentFormat.UNKNOWN.shortFormat;
+        }
+    }
 }
```
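Since Enum.valueOf() throws IllegalArgumentException for names it does not know, any legacy or unexpected format string stored in the index degrades to UNKNOWN's label: "HTML5" maps to "html", "PDF" to "pdf", and anything unrecognized to "???".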
```diff
@@ -84,7 +84,7 @@ public class ForwardIndexConverter {
 
         LongArray docFileData = LongArrayFactory.mmapForWritingConfined(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size());
 
-        ByteBuffer workArea = ByteBuffer.allocate(65536);
+        ByteBuffer workArea = ByteBuffer.allocate(1024*1024*100);
         for (var instance : journal.pages()) {
             try (var slopTable = new SlopTable(instance.baseDir(), instance.page()))
             {
```
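For scale: the work area grows from 65,536 bytes (64 KiB) to 1024*1024*100 = 104,857,600 bytes (about 100 MiB) per converter run, presumably to cut the number of fill/drain passes over large journal pages.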
```diff
@@ -62,6 +62,7 @@ dependencies {
     implementation libs.jwarc
 
     implementation libs.jsoup
+    implementation libs.pdfbox
 
     implementation libs.guava
     implementation dependencies.create(libs.guice.get()) {
```
```diff
@@ -1,8 +1,8 @@
 package nu.marginalia.converting.model;
 
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.HtmlFeature;
-import nu.marginalia.model.html.HtmlStandard;
 import nu.marginalia.model.idx.DocumentMetadata;
 
 import javax.annotation.Nullable;
```
```diff
@@ -21,7 +21,7 @@ public class ProcessedDocumentDetails {
     public long hashCode;
 
     public Set<HtmlFeature> features;
-    public HtmlStandard standard;
+    public DocumentFormat format;
 
     public List<EdgeUrl> linksInternal;
     public List<EdgeUrl> linksExternal;
```
```diff
@@ -30,6 +30,6 @@ public class ProcessedDocumentDetails {
     public GeneratorType generator;
 
     public String toString() {
-        return "ProcessedDocumentDetails(title=" + this.title + ", description=" + this.description + ", pubYear=" + this.pubYear + ", length=" + this.length + ", quality=" + this.quality + ", hashCode=" + this.hashCode + ", features=" + this.features + ", standard=" + this.standard + ", linksInternal=" + this.linksInternal + ", linksExternal=" + this.linksExternal + ", metadata=" + this.metadata + ", generator=" + this.generator + ")";
+        return "ProcessedDocumentDetails(title=" + this.title + ", description=" + this.description + ", pubYear=" + this.pubYear + ", length=" + this.length + ", quality=" + this.quality + ", hashCode=" + this.hashCode + ", features=" + this.features + ", standard=" + this.format + ", linksInternal=" + this.linksInternal + ", linksExternal=" + this.linksExternal + ", metadata=" + this.metadata + ", generator=" + this.generator + ")";
     }
 }
```
```diff
@@ -7,6 +7,7 @@ import nu.marginalia.converting.model.DisqualifiedException;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.processor.plugin.AbstractDocumentProcessorPlugin;
 import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
+import nu.marginalia.converting.processor.plugin.PdfDocumentProcessorPlugin;
 import nu.marginalia.converting.processor.plugin.PlainTextDocumentProcessorPlugin;
 import nu.marginalia.keyword.LinkTexts;
 import nu.marginalia.model.EdgeDomain;
```
```diff
@@ -33,7 +34,8 @@ public class DocumentProcessor {
     private static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
             "application/xhtml",
             "text/html",
-            "text/plain");
+            "text/plain",
+            "application/pdf");
 
 
     private final List<AbstractDocumentProcessorPlugin> processorPlugins = new ArrayList<>();
```
```diff
@@ -42,12 +44,14 @@ public class DocumentProcessor {
     @Inject
     public DocumentProcessor(HtmlDocumentProcessorPlugin htmlDocumentProcessorPlugin,
                              PlainTextDocumentProcessorPlugin plainTextDocumentProcessorPlugin,
+                             PdfDocumentProcessorPlugin pdfDocumentProcessorPlugin,
                              AnchorTextKeywords anchorTextKeywords)
     {
         this.anchorTextKeywords = anchorTextKeywords;
 
         processorPlugins.add(htmlDocumentProcessorPlugin);
         processorPlugins.add(plainTextDocumentProcessorPlugin);
+        processorPlugins.add(pdfDocumentProcessorPlugin);
     }
 
     public ProcessedDocument process(CrawledDocument crawledDocument,
```
```diff
@@ -2,9 +2,9 @@ package nu.marginalia.converting.processor.logic;
 
 import crawlercommons.utils.Strings;
 import nu.marginalia.converting.model.DisqualifiedException;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jetbrains.annotations.NotNull;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
```
```diff
@@ -17,7 +17,7 @@ import java.util.Set;
 public class DocumentValuator {
 
     public double getQuality(CrawledDocument crawledDocument,
-                             HtmlStandard htmlStandard,
+                             DocumentFormat htmlStandard,
                              Document parsedDocument,
                              int textLength) throws DisqualifiedException {
 
```
```diff
@@ -1,7 +1,7 @@
 package nu.marginalia.converting.processor.logic;
 
 import com.google.common.base.Strings;
-import nu.marginalia.model.html.HtmlStandard;
+import nu.marginalia.model.DocumentFormat;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.DocumentType;
 import org.slf4j.Logger;
```
```diff
@@ -12,54 +12,54 @@ public class HtmlStandardExtractor {
 
     private static final Logger logger = LoggerFactory.getLogger(HtmlStandardExtractor.class);
 
-    public static HtmlStandard parseDocType(DocumentType docType) {
+    public static DocumentFormat parseDocType(DocumentType docType) {
         if (null == docType) {
-            return HtmlStandard.UNKNOWN;
+            return DocumentFormat.UNKNOWN;
         }
 
         String publicId = docType.publicId();
         if (Strings.isNullOrEmpty(publicId))
-            return HtmlStandard.HTML5;
+            return DocumentFormat.HTML5;
 
         publicId = publicId.toUpperCase();
         if (publicId.startsWith("-//SOFTQUAD SOFTWARE//DTD") && publicId.contains("HTML 4")) {
-            return HtmlStandard.HTML4;
+            return DocumentFormat.HTML4;
         }
         if (publicId.startsWith("-//SOFTQUAD SOFTWARE//DTD") && publicId.contains("HTML 3")) {
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         }
         if (publicId.startsWith("-//INTERNET/RFC XXXX//EN"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-//NETSCAPE COMM. CORP"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-//SQ//DTD HTML 2"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-//SOFTQUAD//DTD HTML 2"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-//W3O//DTD W3 HTML 2"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-//IETF//DTD HTML 2"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-//IETF//DTD HTML//EN"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-/W3C//DTD HTML 3"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-/W3C/DTD HTML 3"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-//IETF//DTD HTML 3"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-//W3C//DTD XHTML"))
-            return HtmlStandard.XHTML;
+            return DocumentFormat.XHTML;
         if (publicId.startsWith("ISO/IEC 15445:2000//DTD"))
-            return HtmlStandard.XHTML;
+            return DocumentFormat.XHTML;
         if (publicId.startsWith("-//W3C//DTD HTML"))
-            return HtmlStandard.HTML4;
+            return DocumentFormat.HTML4;
 
         logger.debug("Unknown publicID standard {}", publicId);
-        return HtmlStandard.UNKNOWN;
+        return DocumentFormat.UNKNOWN;
     }
 
-    public static HtmlStandard sniffHtmlStandard(Document parsed) {
+    public static DocumentFormat sniffHtmlStandard(Document parsed) {
         int html4Attributes = 0;
         int html5Attributes = 0;
 
```
```diff
@@ -73,11 +73,11 @@
             html4Attributes++;
         }
         if (html5Attributes > 0) {
-            return HtmlStandard.HTML5;
+            return DocumentFormat.HTML5;
         }
         if (html4Attributes > 0) {
-            return HtmlStandard.HTML4;
+            return DocumentFormat.HTML4;
         }
-        return HtmlStandard.HTML123;
+        return DocumentFormat.HTML123;
     }
 }
```
```diff
@@ -7,11 +7,11 @@ import nu.marginalia.keyword.LinkTexts;
 import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
 import nu.marginalia.language.filter.LanguageFilter;
 import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.html.HtmlStandard;
 
 import javax.annotation.Nullable;
 import java.io.IOException;
```
```diff
@@ -73,7 +73,7 @@ public abstract class AbstractDocumentProcessorPlugin {
             return this;
         }
 
-        public MetaTagsBuilder addFormat(HtmlStandard standard) {
+        public MetaTagsBuilder addFormat(DocumentFormat standard) {
 
             add("format", standard);
 
```
```diff
@@ -25,12 +25,12 @@ import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
 import nu.marginalia.link_parser.FeedExtractor;
 import nu.marginalia.link_parser.LinkParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.html.HtmlStandard;
 import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import org.jsoup.nodes.Document;
```
```diff
@@ -137,8 +137,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
 
 
         final int length = getLength(doc);
-        final HtmlStandard standard = getHtmlStandard(doc);
-        final double quality = documentValuator.getQuality(crawledDocument, standard, doc, length);
+        final DocumentFormat format = getDocumentFormat(doc);
+        final double quality = documentValuator.getQuality(crawledDocument, format, doc, length);
 
         if (isDisqualified(documentClass, url, quality, doc.title())) {
             throw new DisqualifiedException(DisqualificationReason.QUALITY);
```
```diff
@@ -152,7 +152,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         var ret = new ProcessedDocumentDetails();
 
         ret.length = length;
-        ret.standard = standard;
+        ret.format = format;
         ret.title = specialization.getTitle(doc, dld, crawledDocument.url);
 
         final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, documentHeaders, dld);
```
```diff
@@ -161,7 +161,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         ret.quality = documentValuator.adjustQuality(quality, features);
         ret.hashCode = dld.localitySensitiveHashCode();
 
-        PubDate pubDate = pubDateSniffer.getPubDate(documentHeaders, url, doc, standard, true);
+        PubDate pubDate = pubDateSniffer.getPubDate(documentHeaders, url, doc, format, true);
 
         EnumSet<DocumentFlags> documentFlags = documentFlags(features, generatorParts.type());
 
```
```diff
@@ -180,7 +180,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
                 .addPubDate(pubDate)
                 .addUrl(url)
                 .addFeatures(features)
-                .addFormat(standard)
+                .addFormat(format)
                 .addGenerator(generatorParts.keywords())
                 .build();
 
```
```diff
@@ -316,12 +316,12 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         return linkTerms;
     }
 
-    private HtmlStandard getHtmlStandard(Document doc) {
-        HtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(doc.documentType());
-        if (HtmlStandard.UNKNOWN.equals(htmlStandard)) {
+    private DocumentFormat getDocumentFormat(Document doc) {
+        DocumentFormat format = HtmlStandardExtractor.parseDocType(doc.documentType());
+        if (DocumentFormat.UNKNOWN.equals(format)) {
             return HtmlStandardExtractor.sniffHtmlStandard(doc);
         }
-        return htmlStandard;
+        return format;
     }
 
     private int getLength(Document doc) {
```
@@ -0,0 +1,286 @@
package nu.marginalia.converting.processor.plugin;

import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.converting.model.DisqualifiedException;
import nu.marginalia.converting.model.ProcessedDocumentDetails;
import nu.marginalia.converting.processor.DocumentClass;
import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
import nu.marginalia.converting.processor.plugin.specialization.DefaultSpecialization;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.filter.LanguageFilter;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import org.apache.commons.lang3.StringUtils;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.text.HeadingAwarePDFTextStripper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.URISyntaxException;
import java.time.LocalDate;
import java.util.*;

public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {

private final int maxTitleLength;
private final DocumentKeywordExtractor keywordExtractor;
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
private final DocumentLengthLogic documentLengthLogic;
private final DefaultSpecialization defaultSpecialization;

private static final Logger logger = LoggerFactory.getLogger(PdfDocumentProcessorPlugin.class);

@Inject
public PdfDocumentProcessorPlugin(@Named("max-title-length") Integer maxTitleLength,
LanguageFilter languageFilter,
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
DocumentKeywordExtractor keywordExtractor,
DocumentLengthLogic documentLengthLogic,
DefaultSpecialization defaultSpecialization)
{
super(languageFilter);
this.sentenceExtractorProvider = sentenceExtractorProvider;
this.documentLengthLogic = documentLengthLogic;
this.maxTitleLength = maxTitleLength;
this.keywordExtractor = keywordExtractor;
this.defaultSpecialization = defaultSpecialization;
}

@Override
public boolean isApplicable(CrawledDocument doc) {
String contentType = doc.contentType.toLowerCase();

if (contentType.equals("application/pdf"))
return true;
if (contentType.startsWith("application/pdf;")) // charset=blabla
return true;

return false;
}

@Override
public DetailsWithWords createDetails(CrawledDocument crawledDocument,
LinkTexts linkTexts,
DocumentClass documentClass)
throws DisqualifiedException, URISyntaxException, IOException {

String documentBody = crawledDocument.documentBody();

if (languageFilter.isBlockedUnicodeRange(documentBody)) {
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE);
}

final EdgeUrl url = new EdgeUrl(crawledDocument.url);

Document doc;
try {
doc = convertPdfToHtml(crawledDocument.documentBodyBytes);
} catch (IOException e) {
logger.error("Failed to convert PDF file {} - {}", url, e.getMessage());
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.ERROR);
}

DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(doc);

checkDocumentLanguage(dld);

documentLengthLogic.validateLength(dld, 1.0);

var ret = new ProcessedDocumentDetails();

ret.length = documentBody.length();

ret.format = DocumentFormat.PDF;
ret.title = StringUtils.truncate(defaultSpecialization.getTitle(doc, dld, url.toString()), maxTitleLength);

ret.quality = -5;

ret.features = Set.of(HtmlFeature.PDF);
ret.description = getDescription(doc);
ret.hashCode = dld.localitySensitiveHashCode();

final PubDate pubDate = new PubDate(LocalDate.ofYearDay(1993, 1));

EnumSet<DocumentFlags> documentFlags = EnumSet.of(DocumentFlags.PdfFile);

ret.metadata = new DocumentMetadata(
documentLengthLogic.getEncodedAverageLength(dld),
pubDate.yearByte(),
(int) -ret.quality,
documentFlags);

DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, linkTexts, url);

var tagWords = new MetaTagsBuilder()
.addPubDate(pubDate)
.addUrl(url)
.addFeatures(ret.features)
.addFormat(ret.format)
.build();

words.addAllSyntheticTerms(tagWords);

if (pubDate.hasYear()) {
ret.pubYear = pubDate.year();
}

/* These are assumed to be populated */
ret.linksInternal = new ArrayList<>();
ret.linksExternal = new ArrayList<>();

return new DetailsWithWords(ret, words);
}

private String getDescription(Document doc) {
int cnt = 0;
boolean useNext = false;
for (var ptag : doc.getElementsByTag("p")) {
String text = ptag.text();

// Many academic documents have an abstract at the start of the document,
// which makes a nice summary. Though they tend to bleed into the text,
// so we check for the word "Abstract" at the start of the paragraph.

if (text.startsWith("Abstract ")) {
return StringUtils.abbreviate(text.substring("Abstract ".length()), "...", 255);
}
else if (text.equals("Abstract")) {
useNext = true;
}
else if (useNext) {
return StringUtils.abbreviate(text, "...", 255);
}

if (++cnt > 15) { // Don't scan the entire document
break;
}
}

// Fall back to the default specialization
return defaultSpecialization.getSummary(doc, Set.of());
}

/** Convert the provided PDF bytes into an HTML rendering that can be fed
* to the HTML processor.
*/
Document convertPdfToHtml(byte[] pdfBytes) throws IOException {
try (var doc = Loader.loadPDF(pdfBytes)) {
String docMetaTitle = Objects.requireNonNullElse(doc.getDocumentInformation().getTitle(), "");

var stripper = new HeadingAwarePDFTextStripper();
stripper.setStartPage(1);
stripper.setSortByPosition(true);
stripper.setWordSeparator(" ");

// Increase the tolerance for line spacing to deal better with paragraphs.
stripper.setDropThreshold(5f);

stripper.setPageStart("<div>");
stripper.setParagraphStart("<p>");
stripper.setParagraphEnd("</p>\n");
stripper.setPageEnd("</div>\n");
stripper.setHeadingStart("<h1>");
stripper.setHeadingEnd("</h1>\n");
stripper.setLineSeparator("\n");

String text = stripper.getText(doc);

StringBuilder htmlBuilder = new StringBuilder(text.length() + 1024);
htmlBuilder.append("<html><body>")
.append(text)
.append("</body></html>");

var parsed = Jsoup.parse(htmlBuilder.toString());

repairDOM(parsed);

for (var heading : parsed.getElementsByTag("h1")) {
String headingText = heading.text();
if (headingText.length() > 2) {
parsed.title(headingText);
break;
}
}

if (parsed.title().isEmpty()) {
// Prefer setting the title to the first paragraph in the
// document, as this is almost always correct. Otherwise,
// we fall back on the metadata title, which is almost always
// useless

var firstP = parsed.getElementsByTag("p").first();
if (firstP != null) parsed.title(firstP.text());
else parsed.title(docMetaTitle);
}
return parsed;
}

}

/** Repair the DOM to remove some common issues with PDF conversion,
* including empty paragraphs, and multiline headers that are split into multiple
* consecutive h1 tags.
*/
private void repairDOM(Document parsed) {

// <p><h1>...</h1></p> -> <h1>...</h1>
parsed.getElementsByTag("h1").forEach(h1 -> {
var parent = h1.parent();
if (parent == null || !"p".equals(parent.tagName())) {
return;
}

if (parent.childrenSize() == 1) {
parent.replaceWith(h1);
}
});

// Remove empty <p> tags
parsed.getElementsByTag("p").forEach(p -> {
if (p.childrenSize() == 0 && !p.hasText()) {
p.remove();
}
});

// <h1>...</h1><h1>...</h1> -> <h1>...</h1>
parsed.getElementsByTag("h1").forEach(h1 -> {
var nextSibling = h1.nextElementSibling();
if (nextSibling == null || !"h1".equals(nextSibling.tagName())) {
return; // Short-circuit to avoid unnecessary work
}

StringJoiner joiner = new StringJoiner(" ");
joiner.add(h1.text());

for (var sibling : h1.nextElementSiblings()) {
if (!"h1".equals(sibling.tagName()))
break;
joiner.add(sibling.text());
sibling.remove();
}

h1.text(joiner.toString());
});

}

}
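Editor's note: the heart of the new plugin is convertPdfToHtml(), which renders PDFBox's text output as pseudo-HTML so the ordinary HTML tooling (sentence extraction, summaries, title logic) can consume it. A minimal sketch of exercising that step in isolation, assuming an already-constructed plugin instance and a local file; the method name printPdfOutline and the path are hypothetical:

// Editor's sketch, not part of the diff: inspect the pseudo-HTML produced from a PDF.
static void printPdfOutline(PdfDocumentProcessorPlugin plugin, java.nio.file.Path pdf) throws java.io.IOException {
    org.jsoup.nodes.Document doc = plugin.convertPdfToHtml(java.nio.file.Files.readAllBytes(pdf));
    System.out.println("title: " + doc.title());                 // first long <h1>, else first <p>, else PDF metadata
    System.out.println("headings: " + doc.select("h1").size());  // consecutive h1 tags merged by repairDOM()
    System.out.println("paragraphs: " + doc.select("p").size()); // empty <p> tags removed by repairDOM()
}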
@@ -13,10 +13,10 @@ import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.filter.LanguageFilter;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import org.apache.commons.lang3.StringUtils;
@@ -91,7 +91,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP

ret.length = documentBody.length();

-ret.standard = HtmlStandard.PLAIN;
+ret.format = DocumentFormat.PLAIN;
ret.title = StringUtils.truncate(plainTextLogic.getTitle(url, firstFewLines), maxTitleLength);

ret.quality = -1;
@@ -113,7 +113,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
.addPubDate(pubDate)
.addUrl(url)
.addFeatures(ret.features)
-.addFormat(ret.standard)
+.addFormat(ret.format)
.build();

words.addAllSyntheticTerms(tagWords);
@@ -1,12 +1,13 @@
package nu.marginalia.converting.processor.pubdate;

-import nu.marginalia.model.html.HtmlStandard;
+import nu.marginalia.model.DocumentFormat;

public class PubDateFromHtmlStandard {
/** Used to bias pub date heuristics */
-public static int blindGuess(HtmlStandard standard) {
-return switch (standard) {
+public static int blindGuess(DocumentFormat format) {
+return switch (format) {
case PLAIN -> 1993;
+case PDF -> 2010;
case HTML123 -> 1997;
case HTML4, XHTML -> 2006;
case HTML5 -> 2018;
@@ -21,8 +22,8 @@ public class PubDateFromHtmlStandard {
* Discovering publication year involves a lot of guesswork, this helps
* keep the guesses relatively sane.
*/
-public static boolean isGuessPlausible(HtmlStandard standard, int year) {
-switch (standard) {
+public static boolean isGuessPlausible(DocumentFormat format, int year) {
+switch (format) {
case HTML123:
return year <= 2000;
case XHTML:
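Editor's note: read off the switch above, the blind guesses after this change line up as follows; the PDF case is the one added here.

// Editor's sketch: values implied directly by blindGuess() above.
assert PubDateFromHtmlStandard.blindGuess(DocumentFormat.PLAIN) == 1993;
assert PubDateFromHtmlStandard.blindGuess(DocumentFormat.PDF) == 2010;     // added in this change
assert PubDateFromHtmlStandard.blindGuess(DocumentFormat.HTML123) == 1997;
assert PubDateFromHtmlStandard.blindGuess(DocumentFormat.HTML4) == 2006;
assert PubDateFromHtmlStandard.blindGuess(DocumentFormat.HTML5) == 2018;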
@@ -1,14 +1,14 @@
package nu.marginalia.converting.processor.pubdate;

import nu.marginalia.converting.model.DocumentHeaders;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;

public interface PubDateHeuristic {

-Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard);
+Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard);
}
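Editor's note: this is the extension point all the heuristics further down implement; after the signature change a new heuristic looks roughly like the following. A sketch of a hypothetical implementation, not a class from the codebase; the meta tag it reads is illustrative only:

// Editor's sketch: a hypothetical heuristic against the updated interface.
public class PubDateHeuristicExample implements PubDateHeuristic {
    @Override
    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers,
                                   EdgeUrl url, Document document, DocumentFormat htmlStandard) {
        // Expensive heuristics bail out at low effort, as the DOM-parsing passes below do.
        if (effortLevel == PubDateEffortLevel.LOW)
            return Optional.empty();
        return PubDateParser.attemptParseDate(document.select("meta[name=date]").attr("content"));
    }
}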
@@ -1,7 +1,7 @@
package nu.marginalia.converting.processor.pubdate;

+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;

import java.time.DateTimeException;
import java.time.LocalDate;
@@ -26,7 +26,7 @@ public class PubDateParser {
.filter(PubDateParser::validateDate);
}

-public static Optional<PubDate> attemptParseDate(String date, HtmlStandard standard) {
+public static Optional<PubDate> attemptParseDate(String date, DocumentFormat standard) {
return Optional.ofNullable(date)
.filter(str -> str.length() >= 4 && str.length() < 32)
.flatMap(str ->
@@ -81,7 +81,7 @@ public class PubDateParser {
}

-public static Optional<PubDate> dateFromHighestYearLookingSubstringWithGuess(String maybe, HtmlStandard standard) {
+public static Optional<PubDate> dateFromHighestYearLookingSubstringWithGuess(String maybe, DocumentFormat standard) {
int guess = PubDateFromHtmlStandard.blindGuess(standard);

var matcher = yearPattern.matcher(maybe);
@@ -135,7 +135,7 @@ public class PubDateParser {
return (max + min) / 2;
}

-public static int guessYear(HtmlStandard standard) {
+public static int guessYear(DocumentFormat standard) {
// Create some jitter to avoid having documents piling up in the same four years
// as this would make searching in those years disproportionately useless
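Editor's note: callers pass the document format so the parser can bias ambiguous or missing dates toward a plausible era for that kind of document. For instance (the exact return values depend on parser internals not shown in this diff):

// Editor's sketch: the format parameter biases year guessing and disambiguation.
Optional<PubDate> date = PubDateParser.attemptParseDate("2022-08-24", DocumentFormat.HTML5);
int guessed = PubDateParser.guessYear(DocumentFormat.PDF); // jittered, presumably centered near blindGuess(PDF) = 2010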
@@ -2,9 +2,9 @@ package nu.marginalia.converting.processor.pubdate;

import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.heuristic.*;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.ArrayList;
@@ -38,7 +38,7 @@ public class PubDateSniffer {
heuristics.add(new PubDateHeuristicGuessFromHtmlStandard());
}

-public PubDate getPubDate(DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard, boolean runExpensive) {
+public PubDate getPubDate(DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard, boolean runExpensive) {
final PubDateEffortLevel effortLevel = runExpensive ? PubDateEffortLevel.HIGH : PubDateEffortLevel.LOW;

for (var heuristic : heuristics) {
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jetbrains.annotations.NotNull;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@@ -19,7 +19,7 @@ import java.util.Optional;
public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {

@Override
-public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
if (effortLevel == PubDateEffortLevel.LOW)
return Optional.empty();

@@ -33,9 +33,9 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {

private static class DateExtractingNodeVisitorPass implements NodeFilter {
public PubDate pubDate;
-private final HtmlStandard htmlStandard;
+private final DocumentFormat htmlStandard;

-private DateExtractingNodeVisitorPass(HtmlStandard htmlStandard) {
+private DateExtractingNodeVisitorPass(DocumentFormat htmlStandard) {
this.htmlStandard = htmlStandard;
}

@@ -135,7 +135,7 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
}

private void parse(String text) {
-if (htmlStandard == HtmlStandard.UNKNOWN) {
+if (htmlStandard == DocumentFormat.UNKNOWN) {
PubDateParser
.dateFromHighestYearLookingSubstring(text)
.ifPresent(this::setPubDate);
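Editor's note: when the format is UNKNOWN, parse() falls back to dateFromHighestYearLookingSubstring, i.e. scanning the text for the largest plausible year token rather than trusting format-specific markup. That method's body sits outside this diff; a rough sketch of the idea, with the regex and year bounds as illustrative assumptions only:

// Editor's sketch: approximate shape of a "highest year-looking substring" scan.
static java.util.OptionalInt highestYearLookingSubstring(String text) {
    var matcher = java.util.regex.Pattern.compile("\\b(19[5-9][0-9]|20[0-4][0-9])\\b").matcher(text);
    int best = -1;
    while (matcher.find())
        best = Math.max(best, Integer.parseInt(matcher.group(1)));
    return best >= 0 ? java.util.OptionalInt.of(best) : java.util.OptionalInt.empty();
}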
@@ -5,9 +5,9 @@ import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateFromHtmlStandard;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jetbrains.annotations.NotNull;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
@@ -19,7 +19,7 @@ import java.util.Optional;
public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {

@Override
-public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
if (effortLevel == PubDateEffortLevel.LOW)
return Optional.empty();

@@ -33,9 +33,9 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {

private static class DateExtractingNodeVisitor implements NodeFilter {
public PubDate pubDate;
-private final HtmlStandard htmlStandard;
+private final DocumentFormat htmlStandard;

-private DateExtractingNodeVisitor(HtmlStandard htmlStandard) {
+private DateExtractingNodeVisitor(DocumentFormat htmlStandard) {
this.htmlStandard = htmlStandard;
}

@@ -73,7 +73,7 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
}

private void parse(String text) {
-if (htmlStandard == HtmlStandard.UNKNOWN) {
+if (htmlStandard == DocumentFormat.UNKNOWN) {
PubDateParser
.dateFromHighestYearLookingSubstring(text)
.ifPresent(this::setPubDate);
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -14,8 +14,8 @@ import java.util.Optional;
public class PubDateHeuristicGuessFromHtmlStandard implements PubDateHeuristic {

@Override
-public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
-if (htmlStandard == HtmlStandard.UNKNOWN)
+public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
+if (htmlStandard == DocumentFormat.UNKNOWN)
return Optional.empty();

return Optional.of(new PubDate(null, PubDateParser.guessYear(htmlStandard)));
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
public class PubDateHeuristicHtml5AnyTimeTag implements PubDateHeuristic {

@Override
-public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
// HTML5, alternative approach
for (var tag : document.select("time")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
public class PubDateHeuristicHtml5ArticleDateTag implements PubDateHeuristic {

@Override
-public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
// HTML5
for (var tag : document.select("time[pubdate=\"pubdate\"]")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
public class PubDateHeuristicHtml5ItempropDateTag implements PubDateHeuristic {

@Override
-public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
for (var tag : document.select("time[itemprop=\"datePublished\"]")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
if (maybeDate.isPresent()) {
@@ -8,9 +8,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Collections;
@@ -21,7 +21,7 @@ import java.util.Optional;
public class PubDateHeuristicJSONLD implements PubDateHeuristic {

@Override
-public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
for (var tag : document.select("script[type=\"application/ld+json\"]")) {
var maybeDate = parseLdJson(tag.data())
.flatMap(PubDateParser::attemptParseDate);
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.List;
@@ -15,7 +15,7 @@ import java.util.Optional;
public class PubDateHeuristicLastModified implements PubDateHeuristic {

@Override
-public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
List<String> lastModified = headers.get("last-modified");
if (lastModified.isEmpty())
return Optional.empty();
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
public class PubDateHeuristicMicrodata implements PubDateHeuristic {

@Override
-public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {

for (var tag : document.select("meta[itemprop=\"datePublished\"]")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
public class PubDateHeuristicOpenGraph implements PubDateHeuristic {

@Override
-public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
// OG
for (var tag : document.select("meta[property=\"article:published_time\"]")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
public class PubDateHeuristicRDFaTag implements PubDateHeuristic {

@Override
-public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
for (var tag : document.select("meta[property=\"datePublished\"]")) {
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
if (maybeDate.isPresent()) {
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -21,7 +21,7 @@ public class PubDateHeuristicUrlPatternPass1 implements PubDateHeuristic {
private static final int MIN_URL_PATTERN_YEAR = 2000;

@Override
-public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
final String urlString = url.path;

var matcher = yearUrlPattern.matcher(urlString);
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.nodes.Document;

import java.util.Optional;
@@ -19,7 +19,7 @@ public class PubDateHeuristicUrlPatternPass2 implements PubDateHeuristic {

@Override
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url,
-Document document, HtmlStandard htmlStandard) {
+Document document, DocumentFormat htmlStandard) {
final String urlString = url.path;

var matcher = yearUrlPattern.matcher(urlString);
@@ -8,12 +8,12 @@ import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.processor.DocumentClass;
import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
import nu.marginalia.keyword.LinkTexts;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.crawl.UrlIndexingState;
import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags;
@@ -53,6 +53,7 @@ public class SideloaderProcessing {
"",
body.getBytes(StandardCharsets.UTF_8),
false,
+-1,
null,
null
);
@@ -83,7 +84,7 @@ public class SideloaderProcessing {
// that we can't get from the sideloaded data since it's
// so stripped down

-ret.details.standard = HtmlStandard.HTML5;
+ret.details.format = DocumentFormat.HTML5;
ret.details.pubYear = pubYear;
ret.details.features.add(HtmlFeature.JS);
ret.details.features.add(HtmlFeature.TRACKING);
@@ -9,13 +9,13 @@ import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.crawl.UrlIndexingState;
-import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags;
@@ -165,7 +165,7 @@ public class StackexchangeSideloader implements SideloadSource {
ret.details.description = StringUtils.truncate(doc.body().text(), 255);
ret.details.length = 128;

-ret.details.standard = HtmlStandard.HTML5;
+ret.details.format = DocumentFormat.HTML5;
ret.details.linksExternal = List.of();
ret.details.linksInternal = List.of();
ret.state = UrlIndexingState.OK;
@@ -124,7 +124,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
document.details.title,
document.details.description,
HtmlFeature.encode(document.details.features),
-document.details.standard.name(),
+document.details.format.name(),
document.details.length,
document.details.hashCode,
(float) document.details.quality,
File diff suppressed because it is too large
@@ -6,6 +6,7 @@ import com.google.inject.Injector;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.io.SerializableCrawlDataStream;
+import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.crawl.PubDate;
@@ -13,7 +14,6 @@ import nu.marginalia.model.crawl.UrlIndexingState;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.crawldata.CrawledDomain;
import nu.marginalia.model.crawldata.SerializableCrawlData;
-import nu.marginalia.model.html.HtmlStandard;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
@@ -91,7 +91,7 @@ public class ConvertingIntegrationTest {

assertTrue(details.title.length() > 4);
assertTrue(details.description.length() > 4);
-assertEquals(HtmlStandard.HTML5, details.standard);
+assertEquals(DocumentFormat.HTML5, details.format);

}
}
@@ -125,7 +125,7 @@ public class ConvertingIntegrationTest {
assertTrue(details.metadata.size() > 0);
assertTrue(details.title.length() > 4);
assertTrue(details.description.length() > 4);
-assertEquals(HtmlStandard.HTML5, details.standard);
+assertEquals(DocumentFormat.HTML5, details.format);
}
}

@@ -148,6 +148,7 @@ public class ConvertingIntegrationTest {
"",
readClassPathFile(p.toString()).getBytes(),
false,
+-1,
null,
null
);
@@ -0,0 +1,95 @@
package nu.marginalia.converting.processor.plugin;

import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.DocumentClass;
import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
import nu.marginalia.converting.processor.logic.TitleExtractor;
import nu.marginalia.converting.processor.plugin.specialization.DefaultSpecialization;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.converting.processor.summary.heuristic.*;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.language.filter.LanguageFilter;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Instant;

@Tag("flaky")
class PdfDocumentProcessorPluginTest {
static PdfDocumentProcessorPlugin plugin;

@BeforeAll
static void setUpBeforeClass() throws Exception {
var lm = WmsaHome.getLanguageModels();
plugin = new PdfDocumentProcessorPlugin(255,
new LanguageFilter(lm),
new ThreadLocalSentenceExtractorProvider(lm),
new DocumentKeywordExtractor(new TermFrequencyDict(lm)),
new DocumentLengthLogic(100),
new DefaultSpecialization(new SummaryExtractor(
255,
new DomFilterHeuristic(255),
new TagDensityHeuristic(255),
new OpenGraphDescriptionHeuristic(),
new MetaDescriptionHeuristic(),
new FallbackHeuristic()
),
new TitleExtractor(255)
));
}
public AbstractDocumentProcessorPlugin.DetailsWithWords testPdfFile(byte[] pdfBytes) throws Exception {
var doc = new CrawledDocument("test", "https://www.example.com/sample.pdf", "application/pdf", Instant.now().toString(), 200, "OK", "OK", "", pdfBytes, false, -1, null, null);
return plugin.createDetails(doc, new LinkTexts(), DocumentClass.NORMAL);
}

public AbstractDocumentProcessorPlugin.DetailsWithWords testPdfFile(Path file) throws Exception {
return testPdfFile(Files.readAllBytes(file));
}

private byte[] downloadPDF(String url) throws IOException, URISyntaxException {
HttpURLConnection conn = (HttpURLConnection) new URI(url).toURL().openConnection();
try {
return conn.getInputStream().readAllBytes();
} catch (IOException e) {
throw new RuntimeException(e);
} finally {
conn.disconnect();
}
}

@Disabled
@Test
void testingTool() throws Exception {
System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample.pdf")).details().title);
System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample2.pdf")).details().title);
System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample3.pdf")).details().title);
System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample4.pdf")).details().title);
System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample5.pdf")).details().title);
System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample6.pdf")).details().title);
}

@Disabled
@Test
void testingTool2() throws Exception {
System.out.println(plugin.convertPdfToHtml(Files.readAllBytes(Path.of("/home/st_work/Work/sample6.pdf"))));
}

@Test
void testMarginaliaSample() throws Exception {
var doc = plugin.convertPdfToHtml(downloadPDF("https://www.marginalia.nu/junk/test.pdf"));
System.out.println(doc.html());
}
}
@@ -3,8 +3,8 @@ package nu.marginalia.converting.processor.pubdate;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.heuristic.PubDateHeuristicDOMParsingPass2;
import nu.marginalia.model.DocumentFormat;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.html.HtmlStandard;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Test;

@@ -74,7 +74,7 @@ class PubDateSnifferTest {
<time pubdate="pubdate" datetime="2022-08-24">time</time>
Wow, sure lor 'em boss
</article>
"""), HtmlStandard.UNKNOWN, true);
"""), DocumentFormat.UNKNOWN, true);

assertFalse(ret.isEmpty());
assertEquals("2022-08-24", ret.dateIso8601());

@@ -90,7 +90,7 @@ class PubDateSnifferTest {
<time>2022-08-24</time>
Wow, sure lor 'em boss
</article>
"""), HtmlStandard.UNKNOWN, true);
"""), DocumentFormat.UNKNOWN, true);

assertFalse(ret.isEmpty());
assertEquals("2022-08-24", ret.dateIso8601());

@@ -106,7 +106,7 @@ class PubDateSnifferTest {
<time class="published" datetime="July 13, 2006">July 13, 2006</time>
Wow, sure lor 'em boss
</article>
"""), HtmlStandard.UNKNOWN, true);
"""), DocumentFormat.UNKNOWN, true);

assertFalse(ret.isEmpty());
assertEquals(2006, ret.year());

@@ -116,14 +116,14 @@ class PubDateSnifferTest {
public void testProblemCases() throws IOException, URISyntaxException {
var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
new EdgeUrl("https://www.example.com/"),
Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/The Switch to Linux Begins .html"))), HtmlStandard.HTML5, true);
Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/The Switch to Linux Begins .html"))), DocumentFormat.HTML5, true);

assertFalse(ret.isEmpty());
assertEquals(2006, ret.year());

ret = dateSniffer.getPubDate(new DocumentHeaders(""),
new EdgeUrl("https://www.example.com/"),
Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), HtmlStandard.XHTML, true);
Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), DocumentFormat.XHTML, true);

assertFalse(ret.isEmpty());
assertEquals(2010, ret.year());

@@ -146,7 +146,7 @@ class PubDateSnifferTest {
<!doctype html>
<html>
<meta itemprop="datePublished" content="2022-08-24" />
"""), HtmlStandard.UNKNOWN, true);
"""), DocumentFormat.UNKNOWN, true);

assertFalse(ret.isEmpty());
assertEquals("2022-08-24", ret.dateIso8601());

@@ -160,7 +160,7 @@ class PubDateSnifferTest {
<!doctype html>
<html>
<meta property="datePublished" content="2022-08-24" />
"""), HtmlStandard.UNKNOWN, true);
"""), DocumentFormat.UNKNOWN, true);

assertFalse(ret.isEmpty());
assertEquals("2022-08-24", ret.dateIso8601());

@@ -174,7 +174,7 @@ class PubDateSnifferTest {
<!doctype html>
<html>
<script type="application/ld+json">{"@context":"https:\\/\\/schema.org","@type":"Article","name":"In the Year 2525","url":"https:\\/\\/en.wikipedia.org\\/wiki\\/In_the_Year_2525","sameAs":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","mainEntity":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2004-08-24T14:39:14Z","dateModified":"2022-10-20T11:54:37Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/4\\/4a\\/In_the_Year_2525_by_Zager_and_Evans_US_vinyl_Side-A_RCA_release.png","headline":"song written and compsoed by Rick Evans, originally recorded by Zager and Evans and released in 1969"}</script><script type="application/ld+json">{"@context":"https:\\/\\/schema.org","@type":"Article","name":"In the Year 2525","url":"https:\\/\\/en.wikipedia.org\\/wiki\\/In_the_Year_2525","sameAs":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","mainEntity":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2004-08-24T14:39:14Z","dateModified":"2022-10-20T11:54:37Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/4\\/4a\\/In_the_Year_2525_by_Zager_and_Evans_US_vinyl_Side-A_RCA_release.png","headline":"song written and compsoed by Rick Evans, originally recorded by Zager and Evans and released in 1969"}</script>
"""), HtmlStandard.UNKNOWN, true);
"""), DocumentFormat.UNKNOWN, true);

assertFalse(ret.isEmpty());
assertEquals("2004-08-24", ret.dateIso8601());

@@ -188,7 +188,7 @@ class PubDateSnifferTest {
<!doctype html>
<html>
<script type="application/ld+json" class="aioseop-schema">{"@context":"https://schema.org","@graph":[{"@type":"Organization","@id":"https://socialnomics.net/#organization","url":"https://socialnomics.net/","name":"Socialnomics","sameAs":[]},{"@type":"WebSite","@id":"https://socialnomics.net/#website","url":"https://socialnomics.net/","name":"Socialnomics","publisher":{"@id":"https://socialnomics.net/#organization"}},{"@type":"WebPage","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage","url":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","inLanguage":"en-US","name":"3 Reasons Why You Should Adopt Java-based Technology For Your Business","isPartOf":{"@id":"https://socialnomics.net/#website"},"breadcrumb":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#breadcrumblist"},"datePublished":"2016-12-27T21:01:36-06:00","dateModified":"2016-12-22T21:02:32-06:00"},{"@type":"Article","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#article","isPartOf":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage"},"author":{"@id":"https://socialnomics.net/author/rahis-saifi/#author"},"headline":"3 Reasons Why You Should Adopt Java-based Technology For Your Business","datePublished":"2016-12-27T21:01:36-06:00","dateModified":"2016-12-22T21:02:32-06:00","commentCount":0,"mainEntityOfPage":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage"},"publisher":{"@id":"https://socialnomics.net/#organization"},"articleSection":"Business, business, java, Java Developers, programming languages"},{"@type":"Person","@id":"https://socialnomics.net/author/rahis-saifi/#author","name":"Rahis Saifi","sameAs":["https://www.facebook.com/RahisSaifiOfficial","https://www.twitter.com/57rahis"],"image":{"@type":"ImageObject","@id":"https://socialnomics.net/#personlogo","url":"https://secure.gravatar.com/avatar/e67f630f0b8bc87e59e111d5e955961d?s=96&d=mm&r=g","width":96,"height":96,"caption":"Rahis Saifi"}},{"@type":"BreadcrumbList","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#breadcrumblist","itemListElement":[{"@type":"ListItem","position":1,"item":{"@type":"WebPage","@id":"https://socialnomics.net/","url":"https://socialnomics.net/","name":"Socialnomics Blog"}},{"@type":"ListItem","position":2,"item":{"@type":"WebPage","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","url":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","name":"3 Reasons Why You Should Adopt Java-based Technology For Your Business"}}]}]}</script>
"""), HtmlStandard.UNKNOWN, true);
"""), DocumentFormat.UNKNOWN, true);

assertFalse(ret.isEmpty());
assertEquals("2016-12-27", ret.dateIso8601());

@@ -202,7 +202,7 @@ class PubDateSnifferTest {
<!doctype html>
<html>
<title>No date in the HTML</title>
"""), HtmlStandard.UNKNOWN, true);
"""), DocumentFormat.UNKNOWN, true);

assertFalse(ret.isEmpty());
assertNull(ret.dateIso8601());

@@ -217,7 +217,7 @@ class PubDateSnifferTest {
<!doctype html>
<html>
<title>No date in the HTML</title>
"""), HtmlStandard.UNKNOWN, true);
"""), DocumentFormat.UNKNOWN, true);

assertFalse(ret.isEmpty());
assertEquals("2022-02-03", ret.dateIso8601());

@@ -232,7 +232,7 @@ class PubDateSnifferTest {
<!doctype html>
<html>
<p>Published 2003, updated 2022</p>
"""), HtmlStandard.HTML5, true);
"""), DocumentFormat.HTML5, true);

assertFalse(ret.isEmpty());
assertNull(ret.dateIso8601());

@@ -258,7 +258,7 @@ class PubDateSnifferTest {
<!doctype html>
<html>
<div style="float: left;"> <b>Post subject:</b> Keyboards.</div><div style="float: right;"><span class="postdetails"><b><img src="./styles/subsilver2/imageset/icon_post_target.gif" width="12" height="9" alt="Post" title="Post" /> <a href="./viewtopic.php?p=34580&sid=cf0c13dedebb4fea1f03fa73e510cd9f#p34580">#1</a></b></span> <b>Posted:</b> Sun Oct 03, 2010 5:37 pm </div>
"""), HtmlStandard.UNKNOWN, true);
"""), DocumentFormat.UNKNOWN, true);

assertFalse(ret.isEmpty());
assertNull(ret.dateIso8601());
@@ -10,6 +10,7 @@ import java.net.http.HttpClient;
import java.net.http.HttpHeaders;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.*;
import java.util.stream.Collectors;

@@ -90,8 +91,8 @@ public class WarcProtocolReconstructor {
return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
}

static String getResponseHeader(ClassicHttpResponse response, long size) {
static String getResponseHeader(ClassicHttpResponse response, Duration responseDuration, long size) {
String headerString = getHeadersAsString(response.getHeaders(), size);
String headerString = getHeadersAsString(response.getHeaders(), responseDuration, size);

return response.getVersion().format() + " " + response.getCode() + " " + response.getReasonPhrase() + "\r\n" + headerString + "\r\n\r\n";
}

@@ -160,7 +161,7 @@ public class WarcProtocolReconstructor {
static private String getHeadersAsString(Header[] headers, long responseSize) {
static private String getHeadersAsString(Header[] headers, Duration responseDuration, long responseSize) {
StringJoiner joiner = new StringJoiner("\r\n");

for (var header : headers) {

@@ -176,6 +177,7 @@ public class WarcProtocolReconstructor {
if (headerCapitalized.equals("Content-Encoding"))
continue;

// Since we're transparently decoding gzip, we need to update the Content-Length header
// to reflect the actual size of the response body. We'll do this at the end.
if (headerCapitalized.equals("Content-Length"))

@@ -184,6 +186,7 @@ public class WarcProtocolReconstructor {
joiner.add(headerCapitalized + ": " + header.getValue());
}

joiner.add("X-Marginalia-Response-Time: " + responseDuration.toMillis());
joiner.add("Content-Length: " + responseSize);

return joiner.toString();
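Downstream, WarcRecorder (next hunks) threads the measured fetch duration into this call. A minimal sketch of the call, mirroring that call site; the header block in the comment is illustrative, not captured output:

// Sketch only: requestDate and responseDate are the Instants WarcRecorder captures
// around the fetch (see the hunks below); inputBuffer.size() is the decoded body size.
String headerBlock = WarcProtocolReconstructor.getResponseHeader(
        response,
        Duration.between(requestDate, responseDate),
        inputBuffer.size());
// Yields a header block along the lines of:
//   HTTP/1.1 200 OK
//   Content-Type: text/html
//   X-Marginalia-Response-Time: 347
//   Content-Length: 51234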
@@ -93,7 +93,7 @@ public class WarcRecorder implements AutoCloseable {
WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();

Instant date = Instant.now();
Instant requestDate = Instant.now();

// Not entirely sure why we need to do this, but keeping it due to Chesterton's Fence
Map<String, List<String>> extraHeaders = new HashMap<>(request.getHeaders().length);

@@ -108,6 +108,8 @@ public class WarcRecorder implements AutoCloseable {
try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, request, timeout);
InputStream inputStream = inputBuffer.read()) {

Instant responseDate = Instant.now();

cookies.updateCookieStore(response);

// Build and write the request

@@ -126,7 +128,7 @@ public class WarcRecorder implements AutoCloseable {
WarcRequest warcRequest = new WarcRequest.Builder(requestUri)
.blockDigest(requestDigestBuilder.build())
.date(date)
.date(requestDate)
.body(MediaType.HTTP_REQUEST, httpRequestString)
.build();

@@ -138,7 +140,9 @@ public class WarcRecorder implements AutoCloseable {
response.addHeader("X-Has-Cookies", 1);
}

byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response,
Duration.between(requestDate, responseDate),
inputBuffer.size()).getBytes(StandardCharsets.UTF_8);

ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);

@@ -169,7 +173,7 @@ public class WarcRecorder implements AutoCloseable {
WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
.blockDigest(responseDigestBuilder.build())
.date(date)
.date(responseDate)
.concurrentTo(warcRequest.id())
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());

@@ -184,7 +188,7 @@ public class WarcRecorder implements AutoCloseable {
warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
writer.write(warcResponse);

if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
if (Duration.between(requestDate, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
&& inputBuffer.size() < 2048
&& !requestUri.getPath().endsWith("robots.txt")) // don't bail on robots.txt
{

@@ -196,7 +200,7 @@ public class WarcRecorder implements AutoCloseable {
logger.warn("URL {} took too long to fetch ({}s) and was too small for the effort ({}b)",
requestUri,
Duration.between(date, Instant.now()).getSeconds(),
Duration.between(requestDate, Instant.now()).getSeconds(),
inputBuffer.size()
);
@@ -148,6 +148,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
nextRecord.body,
// this field isn't actually used, maybe we can skip calculating it?
nextRecord.cookies,
-1,
lastModified,
etag));
}
@@ -166,6 +166,7 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
nextRecord.body(),
// this field isn't actually used, maybe we can skip calculating it?
nextRecord.cookies(),
nextRecord.requestTimeMs(),
null,
null));
}
@@ -23,6 +23,7 @@ public final class CrawledDocument implements SerializableCrawlData {
public String crawlerStatus;
public String crawlerStatusDesc;
public int requestTimeMs;

@Nullable
public String headers;

@@ -82,7 +83,7 @@ public final class CrawledDocument implements SerializableCrawlData {
public String lastModifiedMaybe;
public String etagMaybe;

public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, byte[] documentBodyBytes, Boolean hasCookies, String lastModifiedMaybe, String etagMaybe) {
public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, byte[] documentBodyBytes, Boolean hasCookies, int requestTimeMs, String lastModifiedMaybe, String etagMaybe) {
this.crawlId = crawlId;
this.url = url;
this.contentType = contentType;

@@ -94,6 +95,7 @@ public final class CrawledDocument implements SerializableCrawlData {
this.documentBodyBytes = Objects.requireNonNullElse(documentBodyBytes, new byte[] {});
this.hasCookies = hasCookies;
this.lastModifiedMaybe = lastModifiedMaybe;
this.requestTimeMs = requestTimeMs;
this.etagMaybe = etagMaybe;
}

@@ -173,6 +175,7 @@ public final class CrawledDocument implements SerializableCrawlData {
private byte[] documentBodyBytes = new byte[0];
private String recrawlState;
private Boolean hasCookies;
private int requestTimeMs;
private String lastModifiedMaybe;
private String etagMaybe;

@@ -248,8 +251,13 @@ public final class CrawledDocument implements SerializableCrawlData {
return this;
}

public CrawledDocumentBuilder requestTimeMs(int requestTimeMs) {
this.requestTimeMs = requestTimeMs;
return this;
}

public CrawledDocument build() {
return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBodyBytes, this.hasCookies, this.lastModifiedMaybe, this.etagMaybe);
return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBodyBytes, this.hasCookies, this.requestTimeMs, this.lastModifiedMaybe, this.etagMaybe);
}

public String toString() {
@@ -9,6 +9,7 @@ import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecord;
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
import nu.marginalia.slop.column.array.ByteArrayColumn;
import nu.marginalia.slop.column.primitive.ByteColumn;
import nu.marginalia.slop.column.primitive.IntColumn;
import nu.marginalia.slop.column.primitive.LongColumn;
import nu.marginalia.slop.column.primitive.ShortColumn;
import nu.marginalia.slop.column.string.EnumColumn;

@@ -39,6 +40,7 @@ public record SlopCrawlDataRecord(String domain,
long timestamp,
String contentType,
byte[] body,
int requestTimeMs,
String headers)
{
private static final EnumColumn domainColumn = new EnumColumn("domain", StandardCharsets.UTF_8, StorageType.ZSTD);

@@ -49,6 +51,7 @@ public record SlopCrawlDataRecord(String domain,
private static final LongColumn timestampColumn = new LongColumn("timestamp");
private static final EnumColumn contentTypeColumn = new EnumColumn("contentType", StandardCharsets.UTF_8);
private static final ByteArrayColumn bodyColumn = new ByteArrayColumn("body", StorageType.ZSTD);
private static final ShortColumn requestTimeColumn = new ShortColumn("requestTimeMs");
private static final StringColumn headerColumn = new StringColumn("header", StandardCharsets.UTF_8, StorageType.ZSTD);

public SlopCrawlDataRecord(CrawledDocumentParquetRecord parquetRecord) {

@@ -60,6 +63,7 @@ public record SlopCrawlDataRecord(String domain,
parquetRecord.timestamp.toEpochMilli(),
parquetRecord.contentType,
parquetRecord.body,
-1,
parquetRecord.headers
);
}

@@ -74,6 +78,7 @@ public record SlopCrawlDataRecord(String domain,
date.toEpochMilli(),
"x-marginalia/advisory;state=redirect",
new byte[0],
-1,
""
);
}

@@ -87,6 +92,7 @@ public record SlopCrawlDataRecord(String domain,
date.toEpochMilli(),
"x-marginalia/advisory;state=error",
errorStatus.getBytes(),
-1,
""
);
}

@@ -100,6 +106,7 @@ public record SlopCrawlDataRecord(String domain,
date.toEpochMilli(),
errorStatus,
new byte[0],
-1,
""
);
}

@@ -321,6 +328,7 @@ public record SlopCrawlDataRecord(String domain,
private final LongColumn.Writer timestampColumnWriter;
private final EnumColumn.Writer contentTypeColumnWriter;
private final ByteArrayColumn.Writer bodyColumnWriter;
private final ShortColumn.Writer requestTimeColumnWriter;
private final StringColumn.Writer headerColumnWriter;

public Writer(Path path) throws IOException {

@@ -334,6 +342,7 @@ public record SlopCrawlDataRecord(String domain,
timestampColumnWriter = timestampColumn.create(this);
contentTypeColumnWriter = contentTypeColumn.create(this);
bodyColumnWriter = bodyColumn.create(this);
requestTimeColumnWriter = requestTimeColumn.create(this);
headerColumnWriter = headerColumn.create(this);
}

@@ -346,6 +355,7 @@ public record SlopCrawlDataRecord(String domain,
timestampColumnWriter.put(record.timestamp);
contentTypeColumnWriter.put(record.contentType);
bodyColumnWriter.put(record.body);
requestTimeColumnWriter.put((short) record.requestTimeMs);
headerColumnWriter.put(record.headers);
}

@@ -391,10 +401,20 @@ public record SlopCrawlDataRecord(String domain,

String headersStr;
StringJoiner headersStrBuilder = new StringJoiner("\n");
int requestTimeMs = -1;
for (var header : headers) {
if (header.getName().equalsIgnoreCase("X-Cookies") && "1".equals(header.getValue())) {
hasCookies = true;
}
if (header.getName().equals("X-Marginalia-Response-Time")) {
try {
requestTimeMs = Integer.parseInt(header.getValue());
}
catch (NumberFormatException ex) {
logger.warn("Failed to parse X-Marginalia-Response-Time header: {}", header.getValue());
}
continue;
}
headersStrBuilder.add(header.getName() + ": " + header.getValue());
}
headersStr = headersStrBuilder.toString();

@@ -409,6 +429,7 @@ public record SlopCrawlDataRecord(String domain,
response.date().toEpochMilli(),
contentType,
bodyBytes,
requestTimeMs,
headersStr
)
);

@@ -461,6 +482,7 @@ public record SlopCrawlDataRecord(String domain,
private final LongColumn.Reader timestampColumnReader;
private final EnumColumn.Reader contentTypeColumnReader;
private final ByteArrayColumn.Reader bodyColumnReader;
private final ShortColumn.Reader requestTimeColumnReader;
private final StringColumn.Reader headerColumnReader;

public Reader(Path path) throws IOException {

@@ -475,6 +497,17 @@ public record SlopCrawlDataRecord(String domain,
contentTypeColumnReader = contentTypeColumn.open(this);
bodyColumnReader = bodyColumn.open(this);
headerColumnReader = headerColumn.open(this);

// FIXME: After 2025-06-XX, we can remove this migration workaround
ShortColumn.Reader timeColumnReader;
try {
timeColumnReader = requestTimeColumn.open(this);
}
catch (Exception ex) {
// Migration workaround
timeColumnReader = null;
}
requestTimeColumnReader = timeColumnReader;
}

public SlopCrawlDataRecord get() throws IOException {

@@ -487,6 +520,7 @@ public record SlopCrawlDataRecord(String domain,
timestampColumnReader.get(),
contentTypeColumnReader.get(),
bodyColumnReader.get(),
requestTimeColumnReader != null ? requestTimeColumnReader.get() : -1,
headerColumnReader.get()
);
}

@@ -506,6 +540,7 @@ public record SlopCrawlDataRecord(String domain,
private final LongColumn.Reader timestampColumnReader;
private final EnumColumn.Reader contentTypeColumnReader;
private final ByteArrayColumn.Reader bodyColumnReader;
private final ShortColumn.Reader requestTimeColumnReader;
private final StringColumn.Reader headerColumnReader;

private SlopCrawlDataRecord next = null;

@@ -522,6 +557,17 @@ public record SlopCrawlDataRecord(String domain,
contentTypeColumnReader = contentTypeColumn.open(this);
bodyColumnReader = bodyColumn.open(this);
headerColumnReader = headerColumn.open(this);

// FIXME: After 2025-06-XX, we can remove this migration workaround
ShortColumn.Reader timeColumnReader;
try {
timeColumnReader = requestTimeColumn.open(this);
}
catch (Exception ex) {
// Migration workaround
timeColumnReader = null;
}
requestTimeColumnReader = timeColumnReader;
}

public abstract boolean filter(String url, int status, String contentType);

@@ -548,6 +594,7 @@ public record SlopCrawlDataRecord(String domain,
boolean cookies = cookiesColumnReader.get() == 1;
int status = statusColumnReader.get();
long timestamp = timestampColumnReader.get();
int requestTimeMs = requestTimeColumnReader != null ? requestTimeColumnReader.get() : -1;
String contentType = contentTypeColumnReader.get();

LargeItem<byte[]> body = bodyColumnReader.getLarge();

@@ -555,7 +602,7 @@ public record SlopCrawlDataRecord(String domain,

if (filter(url, status, contentType)) {
next = new SlopCrawlDataRecord(
domain, url, ip, cookies, status, timestamp, contentType, body.get(), headers.get()
domain, url, ip, cookies, status, timestamp, contentType, body.get(), requestTimeMs, headers.get()
);
return true;
}
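Readers opened against data written before this change get a null requestTimeColumnReader and report -1, so consumers have to treat negative values as unknown; note also that the value round-trips through a ShortColumn via a plain (short) cast, so durations above Short.MAX_VALUE milliseconds (about 33 seconds) would not survive intact. A hypothetical consumer-side helper illustrating the sentinel convention (not part of the source):

// Hypothetical helper, not from the repository: wraps the -1 "unknown" sentinel.
static java.util.OptionalInt knownRequestTime(SlopCrawlDataRecord record) {
    int ms = record.requestTimeMs();   // -1 when the record predates the migration
    return ms >= 0 ? java.util.OptionalInt.of(ms) : java.util.OptionalInt.empty();
}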
@@ -195,6 +195,7 @@ public class LiveCrawlDataSet implements AutoCloseable {
headers,
body,
false,
-1,
"",
""
));
@@ -90,6 +90,7 @@ public class ApiSearchOperator {
url.getTitle(),
url.getDescription(),
sanitizeNaN(url.rankingScore, -100),
url.getShortFormat(),
details
);
}
@@ -8,14 +8,16 @@ public class ApiSearchResult {
public String title;
public String description;
public double quality;
public String format; // "pdf", "html", "text", etc.

public List<List<ApiSearchResultQueryDetails>> details = new ArrayList<>();

public ApiSearchResult(String url, String title, String description, double quality, List<List<ApiSearchResultQueryDetails>> details) {
public ApiSearchResult(String url, String title, String description, double quality, String format, List<List<ApiSearchResultQueryDetails>> details) {
this.url = url;
this.title = title;
this.description = description;
this.quality = quality;
this.format = format;
this.details = details;
}
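The new field carries short format strings such as "pdf", "html" and "text" (per the field comment above), so API consumers can branch on document format without guessing from the URL. A hypothetical consumer-side sketch; the results variable and grouping are illustrative, not part of the API:

// Hypothetical: group deserialized API results by the new format string.
java.util.Map<String, java.util.List<ApiSearchResult>> byFormat = results.stream()
        .collect(java.util.stream.Collectors.groupingBy(r -> r.format));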
@@ -73,6 +73,8 @@ public class UrlDetails implements Comparable<UrlDetails> {
return "HTML 5";
case "PLAIN":
return "Plain Text";
case "PDF":
return "PDF";
default:
return "?";
}
@@ -61,7 +61,7 @@ public class UrlDeduplicator {

private boolean limitResultsPerDomain(DecoratedSearchResultItem details) {
final var domain = details.getUrl().getDomain();
final String key = domain.getDomainKey();
final String key = domain.toString();

return keyCount.adjustOrPutValue(key, 1, 1) <= resultsPerKey;
}
@@ -17,7 +17,7 @@ public class SearchQueryParamFactory {
static final RpcQueryLimits defaultLimits = RpcQueryLimits.newBuilder()
.setResultsTotal(100)
.setResultsByDomain(5)
.setTimeoutMs(200)
.setTimeoutMs(250)
.setFetchSize(8192)
.build();
@@ -23,7 +23,7 @@ public class SearchResultClusterer {
}

/** No clustering, just return the results as is */
private static List<ClusteredUrlDetails> noOp(List<UrlDetails> results, int total) {
public static List<ClusteredUrlDetails> noOp(List<UrlDetails> results, int total) {
if (results.isEmpty())
return List.of();
@@ -85,7 +85,6 @@ public class SearchService extends JoobyService {
String emptySvg = "<svg xmlns=\"http://www.w3.org/2000/svg\"></svg>";
jooby.get("/site/{domain}/favicon", ctx -> {
String domain = ctx.path("domain").value();
logger.info("Finding icon for domain {}", domain);
try {
DbDomainQueries.DomainIdWithNode domainIdWithNode = domainQueries.getDomainIdWithNode(new EdgeDomain(domain));
var faviconMaybe = faviconClient.getFavicon(domain, domainIdWithNode.nodeAffinity());
@@ -78,6 +78,8 @@ public class UrlDetails implements Comparable<UrlDetails> {
return "HTML 5";
case "PLAIN":
return "Plain Text";
case "PDF":
return "PDF";
default:
return "?";
}

@@ -92,13 +94,24 @@ public class UrlDetails implements Comparable<UrlDetails> {
public String displayTitle() {
StringBuilder sb = new StringBuilder();

buildDisplayTitle(sb, title);

if (sb.isEmpty()) {
buildDisplayTitle(sb, url.toDisplayString());
}

return sb.toString();
}

private void buildDisplayTitle(StringBuilder sb, String str) {

int distSinceBreak = 0;

char c = ' ';
int prevC = ' ';
for (int i = 0; i < title.length(); i++) {
for (int i = 0; i < str.length(); i++) {
prevC = c;
c = title.charAt(i);
c = str.charAt(i);

if (Character.isSpaceChar(c)) {
distSinceBreak = 0;

@@ -135,8 +148,6 @@ public class UrlDetails implements Comparable<UrlDetails> {
sb.append(c);
}
}

return sb.toString();
}

/** Helper that inserts hyphenation hints and escapes
@@ -25,6 +25,7 @@ public class UrlDeduplicator {
}

public boolean shouldRemove(DecoratedSearchResultItem details) {

if (!deduplicateOnSuperficialHash(details))
return true;
if (!deduplicateOnLSH(details))

@@ -61,7 +62,7 @@ public class UrlDeduplicator {

private boolean limitResultsPerDomain(DecoratedSearchResultItem details) {
final var domain = details.getUrl().getDomain();
final String key = domain.getDomainKey();
final String key = domain.toString();

return keyCount.adjustOrPutValue(key, 1, 1) <= resultsPerKey;
}
@@ -21,6 +21,9 @@
</h2>

<div class="text-sm mt-1">
@if ("PDF".equals(result.first.format))
<i title="PDF" class="fas fa-file-pdf text-red-500"></i>
@endif
<a class="text-liteblue dark:text-blue-200 underline break-all" href="${result.first.url.toString()}"
rel="noopener noreferrer" tabindex="-1">$unsafe{result.first.displayUrl()}</a>
</div>

@@ -53,10 +56,13 @@
<div class="flex mt-2 text-sm flex flex-col space-y-2">
<p class="text-black dark:text-white ${result.colorScheme.backgroundColor} p-1 rounded break-words hyphens-auto">Also from ${result.getDomain().toString()}:</p>

<ul class="pl-2 mt-2 underline text-liteblue dark:text-blue-200">
<ul class="pl-2 mt-2 text-liteblue dark:text-blue-200">
@for(UrlDetails item : result.rest)
<li class="-indent-4 pl-4 mb-1 break-words hyphens-auto">
<a href="${item.url.toString()}" rel="noopener noreferrer">$unsafe{item.displayTitle()}</a>
@if ("PDF".equals(item.format))
<i title="PDF" class="fas fa-file-pdf text-red-500"></i>
@endif
<a href="${item.url.toString()}" class="underline" rel="noopener noreferrer">$unsafe{item.displayTitle()}</a>
</li>
@endfor
</ul>

@@ -74,6 +80,9 @@
@if (DocumentFlags.PlainText.isPresent(result.getFirst().resultItem.encodedDocMetadata))
<span class="px-1 bg-blue-100 text-blue-700 dark:border dark:border-blue-600 dark:text-blue-400 dark:bg-black rounded">Plain text</span>
@endif
@if (DocumentFlags.PdfFile.isPresent(result.getFirst().resultItem.encodedDocMetadata))
<span class="px-1 bg-blue-100 text-blue-700 dark:border dark:border-blue-600 dark:text-blue-400 dark:bg-black rounded">PDF File</span>
@endif
@if (DocumentFlags.GeneratorForum.isPresent(result.getFirst().resultItem.encodedDocMetadata))
<span class="px-1 bg-blue-100 text-blue-700 dark:border dark:border-blue-600 dark:text-blue-400 dark:bg-black rounded">Forum</span>
@endif
@@ -5,6 +5,7 @@ import com.google.inject.Inject;
import io.jooby.Context;
import io.jooby.Jooby;
import nu.marginalia.assistant.suggest.Suggestions;
import nu.marginalia.domsample.DomSampleService;
import nu.marginalia.functions.domains.DomainInfoGrpcService;
import nu.marginalia.functions.math.MathGrpcService;
import nu.marginalia.livecapture.LiveCaptureGrpcService;

@@ -30,6 +31,7 @@ public class AssistantService extends JoobyService {
ScreenshotService screenshotService,
DomainInfoGrpcService domainInfoGrpcService,
LiveCaptureGrpcService liveCaptureGrpcService,
DomSampleService domSampleService,
FeedsGrpcService feedsGrpcService,
MathGrpcService mathGrpcService,
Suggestions suggestions)

@@ -41,10 +43,11 @@ public class AssistantService extends JoobyService {
liveCaptureGrpcService,
feedsGrpcService),
List.of());
this.screenshotService = screenshotService;

this.screenshotService = screenshotService;
this.suggestions = suggestions;

domSampleService.start();
}

public void startJooby(Jooby jooby) {
3 code/tools/browserless/Dockerfile Normal file
@@ -0,0 +1,3 @@
FROM ghcr.io/browserless/chromium:latest

COPY extension/ /dom-export
code/tools/browserless/build.gradle (new file, 45 lines)

plugins {
    id 'base'
}

def imageName = 'marginalia-browserless'
def imageTag = project.hasProperty('imageTag') ? project.getProperty('imageTag') : 'latest'

tasks.register('docker', Exec) {
    group = 'Docker'
    description = 'Builds a Docker image using the Dockerfile in project root'

    workingDir = projectDir

    // Build the Docker command
    commandLine 'docker', 'build',
            '-t', "${imageName}:${imageTag}",
            '-f', 'Dockerfile',
            '--pull',
            '--build-arg', "BASE_DIR=.",
            '.'

    // Add optional parameters if specified
    if (project.hasProperty('noCache') && project.getProperty('noCache').toBoolean()) {
        commandLine += '--no-cache'
    }

    doFirst {
        println "Building Docker image '${imageName}:${imageTag}'..."
    }

    doLast {
        println "Docker image '${imageName}:${imageTag}' has been built successfully."
    }
}

// Add task to ensure the extension folder is included in the Docker context
tasks.register('prepareExtension', Copy) {
    from 'extension'
    into "${buildDir}/docker/extension"
}

// Make the docker task depend on prepareExtension
tasks.named('docker').configure {
    dependsOn 'prepareExtension'
}
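With the task registered, ./gradlew :code:tools:browserless:docker builds and tags the image (marginalia-browserless:latest by default); -PimageTag=<tag> overrides the tag, and -PnoCache=true appends --no-cache so every layer is rebuilt.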
code/tools/browserless/extension/background.js (new file, 32 lines)

// Listen to web requests and buffer them until the content script is ready

chrome.webRequest.onBeforeRequest.addListener(
    (details) => {
        const requestData = {
            url: details.url,
            method: details.method,
            timestamp: Date.now()
        };
        console.log(requestData);

        chrome.tabs.sendMessage(details.tabId, {
            type: 'URL_INTERCEPTED',
            ...requestData
        });
    },
    { urls: ["<all_urls>"] }
);

// Listen to web navigation events and re-register content scripts when a page is reloaded or navigated to a new subframe

chrome.webNavigation.onCommitted.addListener(function(details) {
    if (details.transitionType === 'reload' || details.transitionType === 'auto_subframe') {
        chrome.scripting.registerContentScripts([{
            id: "content-script",
            matches : [ "<all_urls>" ],
            js : [ "content.js" ]
        }]);
    }
});
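Worth noting about the split between the two scripts: a Manifest V3 service worker has no access to page content, so the background script can only observe request metadata and forward it via chrome.tabs.sendMessage. The content script, which does run inside the page, is what turns those messages into markup that survives a DOM export.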
code/tools/browserless/extension/content.js (new file, 646 lines)

// This script runs in the context of web pages loaded by the browser extension

// Listen to messages from the background script
var networkRequests = document.createElement('div');
networkRequests.setAttribute('id', 'marginalia-network-requests');

chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
    if (message.type === 'URL_INTERCEPTED') {
        var request = document.createElement('div');
        request.setAttribute('class', 'network-request');
        request.setAttribute('data-url', message.url);
        request.setAttribute('data-method', message.method);
        request.setAttribute('data-timestamp', message.timestamp);
        networkRequests.appendChild(request);
    }
});

// Function to add styles as data attributes based on specified properties

function addStylesAsDataAttributes(propertyToAttrMap = {
    'display': 'data-display',
    'position': 'data-position',
    'visibility': 'data-visibility',
}) {
    const targetedProperties = new Set(Object.keys(propertyToAttrMap).map(prop => prop.toLowerCase()));
    const styleSheets = Array.from(document.styleSheets);

    try {
        styleSheets.forEach(styleSheet => {
            try {
                if (styleSheet.href && new URL(styleSheet.href).origin !== window.location.origin) {
                    console.warn(`Skipping cross-origin stylesheet: ${styleSheet.href}`);
                    return;
                }
                const cssRules = styleSheet.cssRules || styleSheet.rules;
                if (!cssRules) return;
                for (let i = 0; i < cssRules.length; i++) {
                    const rule = cssRules[i];
                    if (rule.type === 1) {
                        try {
                            let containsTargetedProperty = false;
                            for (let j = 0; j < rule.style.length; j++) {
                                const property = rule.style[j].toLowerCase();
                                if (targetedProperties.has(property)) {
                                    containsTargetedProperty = true;
                                    break;
                                }
                            }
                            if (!containsTargetedProperty) continue;
                            const elements = document.querySelectorAll(rule.selectorText);
                            elements.forEach(element => {
                                for (let j = 0; j < rule.style.length; j++) {
                                    const property = rule.style[j].toLowerCase();
                                    if (targetedProperties.has(property)) {
                                        const value = rule.style.getPropertyValue(property);
                                        const dataAttrName = propertyToAttrMap[property];
                                        element.setAttribute(dataAttrName, value);
                                    }
                                }
                            });
                        } catch (selectorError) {
                            console.error(`Error processing selector "${rule.selectorText}": ${selectorError.message}`);
                        }
                    }
                }
            } catch (sheetError) {
                console.error(`Error processing stylesheet: ${sheetError.message}`);
            }
        });
    } catch (error) {
        console.error(`Error adding data attributes: ${error.message}`);
    }
}


class CookieConsentHandler {
    constructor() {
        // Keywords that strongly indicate cookie consent
        this.cookieKeywords = [
            'cookie', 'cookies', 'consent', 'gdpr', 'privacy policy', 'privacy notice',
            'data protection', 'tracking', 'analytics', 'personalization', 'advertising',
            'essential cookies', 'functional cookies', 'performance cookies'
        ];

        // Keywords that indicate newsletter/subscription popups
        this.newsletterKeywords = [
            'newsletter', 'subscribe', 'email', 'signup', 'sign up', 'updates',
            'notifications', 'discount', 'offer', 'deal', 'promo', 'exclusive'
        ];

        // Common button text for accepting cookies
        this.acceptButtonTexts = [
            'accept', 'accept all', 'allow all', 'agree', 'ok', 'got it',
            'i agree', 'continue', 'yes', 'enable', 'allow cookies',
            'accept cookies', 'accept all cookies', 'i understand'
        ];

        // Common button text for rejecting (to avoid clicking these)
        this.rejectButtonTexts = [
            'reject', 'decline', 'deny', 'refuse', 'no thanks', 'no',
            'reject all', 'decline all', 'manage preferences', 'customize',
            'settings', 'options', 'learn more'
        ];

        // Class name patterns that typically mark the positive/accept button
        this.acceptButtonStyles = [
            /primary/,
        ];
    }

    analyzePopover(element) {
        if (!element || !element.textContent) {
            return { category: 'unknown', action: 'none', reason: 'Invalid element' };
        }

        const textContent = element.textContent.toLowerCase();
        const category = this.categorizePopover(textContent, element);

        let result = {
            category: category,
            action: 'none',
            reason: '',
            element: element
        };

        if (category === 'cookie_consent') {
            const acceptResult = this.tryAcceptCookies(element);
            result.action = acceptResult.action;
            result.reason = acceptResult.reason;
            result.buttonClicked = acceptResult.buttonClicked;
        }

        return result;
    }

    categorizePopover(textContent, element) {
        let cookieScore = 0;
        let newsletterScore = 0;

        // Score based on keyword presence
        this.cookieKeywords.forEach(keyword => {
            if (textContent.includes(keyword)) {
                cookieScore += keyword === 'cookie' || keyword === 'cookies' ? 3 : 1;
            }
        });

        this.newsletterKeywords.forEach(keyword => {
            if (textContent.includes(keyword)) {
                newsletterScore += keyword === 'newsletter' || keyword === 'subscribe' ? 3 : 1;
            }
        });

        // Additional heuristics
        if (this.hasPrivacyPolicyLink(element)) cookieScore += 2;
        if (this.hasManagePreferencesButton(element)) cookieScore += 2;
        if (this.hasEmailInput(element)) newsletterScore += 3;
        if (this.hasDiscountMention(textContent)) newsletterScore += 2;

        // Special patterns that strongly indicate cookie consent
        const strongCookiePatterns = [
            /we use cookies/,
            /this website uses cookies/,
            /by continuing to use/,
            /essential.*cookies/,
            /improve.*experience/,
            /gdpr/,
            /data protection/
        ];

        if (strongCookiePatterns.some(pattern => pattern.test(textContent))) {
            cookieScore += 5;
        }

        // Determine category
        if (cookieScore > newsletterScore && cookieScore >= 2) {
            return 'cookie_consent';
        } else if (newsletterScore > cookieScore && newsletterScore >= 2) {
            return 'newsletter';
        } else {
            return 'other';
        }
    }

    tryAcceptCookies(element) {
        const buttons = this.findButtons(element);

        if (buttons.length === 0) {
            return { action: 'no_buttons_found', reason: 'No clickable buttons found' };
        }

        // First, try to find explicit accept buttons
        const acceptButton = this.findAcceptButton(buttons);
        if (acceptButton) {
            try {
                acceptButton.click();
                return {
                    action: 'clicked_accept',
                    reason: 'Found and clicked accept button',
                    buttonClicked: acceptButton.textContent.trim()
                };
            } catch (error) {
                return {
                    action: 'click_failed',
                    reason: `Failed to click button: ${error.message}`,
                    buttonClicked: acceptButton.textContent.trim()
                };
            }
        }

        // If no explicit accept button, try to find the most likely candidate
        const likelyButton = this.findMostLikelyAcceptButton(buttons);
        if (likelyButton) {
            try {
                likelyButton.click();
                return {
                    action: 'clicked_likely',
                    reason: 'Clicked most likely accept button',
                    buttonClicked: likelyButton.textContent.trim()
                };
            } catch (error) {
                return {
                    action: 'click_failed',
                    reason: `Failed to click button: ${error.message}`,
                    buttonClicked: likelyButton.textContent.trim()
                };
            }
        }

        return {
            action: 'no_accept_button',
            reason: 'Could not identify accept button',
            availableButtons: buttons.map(btn => btn.textContent.trim())
        };
    }

    findButtons(element) {
        const selectors = [
            'button',
            'input[type="button"]',
            'input[type="submit"]',
            '[role="button"]',
            'a[href="#"]',
            '.button',
            '.btn',
            '.btn-primary'
        ];

        const buttons = [];
        selectors.forEach(selector => {
            const found = element.querySelectorAll(selector);
            buttons.push(...Array.from(found));
        });

        // Remove duplicates and filter visible buttons
        return [...new Set(buttons)].filter(btn =>
            btn.offsetWidth > 0 && btn.offsetHeight > 0
        );
    }

    findAcceptButton(buttons) {
        var byClass = buttons.find(button => {
            var classes = button.className.toLowerCase();

            if (this.acceptButtonStyles.some(pattern => pattern.test(classes))) {
                return true;
            }
        });

        if (byClass != null) {
            return byClass;
        }

        return buttons.find(button => {
            const text = button.textContent.toLowerCase().trim();

            return this.acceptButtonTexts.some(acceptText =>
                text === acceptText || text.includes(acceptText)
            ) && !this.rejectButtonTexts.some(rejectText =>
                text.includes(rejectText)
            );
        });
    }

    findMostLikelyAcceptButton(buttons) {
        if (buttons.length === 1) {
            const text = buttons[0].textContent.toLowerCase();
            // If there's only one button and it's not explicitly a reject button, assume it's accept
            if (!this.rejectButtonTexts.some(rejectText => text.includes(rejectText))) {
                return buttons[0];
            }
        }

        // Look for buttons with positive styling (often green, primary, etc.)
        const positiveButton = buttons.find(button => {
            const classes = button.className.toLowerCase();
            const styles = window.getComputedStyle(button);
            const bgColor = styles.backgroundColor;

            return classes.includes('primary') ||
                   classes.includes('accept') ||
                   classes.includes('green') ||
                   bgColor.includes('rgb(0, 128, 0)') || // green variations
                   bgColor.includes('rgb(40, 167, 69)'); // bootstrap success
        });

        return positiveButton || null;
    }

    hasPrivacyPolicyLink(element) {
        const links = element.querySelectorAll('a');
        return Array.from(links).some(link =>
            link.textContent.toLowerCase().includes('privacy') ||
            link.href.toLowerCase().includes('privacy')
        );
    }

    hasManagePreferencesButton(element) {
        const buttons = this.findButtons(element);
        return buttons.some(button => {
            const text = button.textContent.toLowerCase();
            return text.includes('manage') || text.includes('preferences') ||
                   text.includes('settings') || text.includes('customize');
        });
    }

    hasEmailInput(element) {
        const inputs = element.querySelectorAll('input[type="email"], input[placeholder*="email" i]');
        return inputs.length > 0;
    }

    hasDiscountMention(textContent) {
        const discountTerms = ['discount', 'off', '%', 'save', 'deal', 'offer'];
        return discountTerms.some(term => textContent.includes(term));
    }
}


var agreedToPopover = false;

// Analyze a popover element and, if it looks like a cookie consent dialog, try to accept it
function handlePopover(popoverElement) {
    const handler = new CookieConsentHandler();
    const result = handler.analyzePopover(popoverElement);

    console.log('Popover analysis result:', result);

    switch (result.category) {
        case 'cookie_consent':
            console.log('Detected cookie consent popover');
            if (result.action === 'clicked_accept') {
                console.log('Successfully accepted cookies');
                agreedToPopover = true;
            } else {
                console.log('Could not accept cookies:', result.reason);
            }
            break;
        case 'newsletter':
            console.log('Detected newsletter popover - no action taken');
            break;
        default:
            console.log('Unknown popover type - no action taken');
    }

    return result;
}


function finalizeMarginaliaHack() {
    addStylesAsDataAttributes();

    // Find all likely popover elements
    const fixedElements = document.querySelectorAll('[data-position="fixed"]');

    // Attempt to agree to cookie consent popups
    fixedElements.forEach(element => {
        handlePopover(element);
    });

    // If we found a popover and agreed to it, add a notice
    if (agreedToPopover) {
        var notice = document.createElement('div');
        notice.setAttribute('class', 'marginalia-agreed-cookies');
        networkRequests.appendChild(notice);
    }

    var finalize = () => {
        // Add a container for network requests
        document.body.appendChild(networkRequests);
        document.body.setAttribute('id', 'marginaliahack');
    };

    // If we have a popover and agreed to it, wait a bit before finalizing
    // to let the ad networks load so we can capture their requests
    if (agreedToPopover) {
        setTimeout(finalize, 2500);
    }
    else {
        finalize();
    }
}


class EventSimulator {
    constructor() {}

    // Simulate smooth scrolling down the page
    simulateScrollDown(duration = 2000, distance = null) {
        return new Promise((resolve) => {
            const startTime = Date.now();
            const startScrollY = window.scrollY;
            const maxScroll = document.documentElement.scrollHeight - window.innerHeight;
            const targetDistance = distance || Math.min(window.innerHeight * 3, maxScroll - startScrollY);

            if (targetDistance <= 0) {
                resolve();
                return;
            }

            const animate = () => {
                const elapsed = Date.now() - startTime;
                const progress = Math.min(elapsed / duration, 1);

                // Ease-out function for smooth scrolling
                const easeOut = 1 - Math.pow(1 - progress, 3);
                const currentDistance = targetDistance * easeOut;
                const newScrollY = startScrollY + currentDistance;

                // Dispatch scroll events as we go
                window.scrollTo(0, newScrollY);

                // Fire custom scroll event
                const scrollEvent = new Event('scroll', {
                    bubbles: true,
                    cancelable: true
                });

                // Add custom properties to track simulation
                scrollEvent.simulated = true;
                scrollEvent.scrollY = newScrollY;
                scrollEvent.progress = progress;

                window.dispatchEvent(scrollEvent);
                document.dispatchEvent(scrollEvent);

                if (progress < 1) {
                    requestAnimationFrame(animate);
                } else {
                    resolve();
                }
            };

            requestAnimationFrame(animate);
        });
    }

    // Simulate mouse movement toward URL bar
    simulateMouseToURLBar(duration = 1500) {
        return new Promise((resolve) => {
            const startTime = Date.now();

            // Get current mouse position (or start from center of viewport)
            const startX = window.innerWidth / 2;
            const startY = window.innerHeight / 2;

            // URL bar is typically at the top center of the browser
            // Since we can't access actual browser chrome, we'll simulate movement
            // toward the top of the viewport where the URL bar would be
            const targetX = window.innerWidth / 2; // Center horizontally
            const targetY = -50; // Above the viewport (simulating URL bar position)

            const deltaX = targetX - startX;
            const deltaY = targetY - startY;

            let lastMouseEvent = null;

            const animate = () => {
                const elapsed = Date.now() - startTime;
                const progress = Math.min(elapsed / duration, 1);

                // Ease-in-out function for natural mouse movement
                const easeInOut = progress < 0.5
                    ? 2 * progress * progress
                    : 1 - Math.pow(-2 * progress + 2, 3) / 2;

                const currentX = startX + (deltaX * easeInOut);
                const currentY = startY + (deltaY * easeInOut);

                // Create mouse move event
                const mouseMoveEvent = new MouseEvent('mousemove', {
                    bubbles: true,
                    cancelable: true,
                    clientX: currentX,
                    clientY: currentY,
                    screenX: currentX,
                    screenY: currentY,
                    movementX: lastMouseEvent ? currentX - lastMouseEvent.clientX : 0,
                    movementY: lastMouseEvent ? currentY - lastMouseEvent.clientY : 0,
                    buttons: 0,
                    button: -1
                });

                // Add custom properties to track simulation
                mouseMoveEvent.simulated = true;
                mouseMoveEvent.progress = progress;
                mouseMoveEvent.targetType = 'urlbar';

                // Find element under mouse and dispatch event
                const elementUnderMouse = document.elementFromPoint(currentX, currentY);
                if (elementUnderMouse) {
                    elementUnderMouse.dispatchEvent(mouseMoveEvent);

                    // Also fire mouseenter/mouseleave events if element changed
                    if (lastMouseEvent) {
                        const lastElement = document.elementFromPoint(
                            lastMouseEvent.clientX,
                            lastMouseEvent.clientY
                        );

                        if (lastElement && lastElement !== elementUnderMouse) {
                            // Mouse left previous element
                            const mouseLeaveEvent = new MouseEvent('mouseleave', {
                                bubbles: false, // mouseleave doesn't bubble
                                cancelable: true,
                                clientX: currentX,
                                clientY: currentY,
                                relatedTarget: elementUnderMouse
                            });
                            mouseLeaveEvent.simulated = true;
                            lastElement.dispatchEvent(mouseLeaveEvent);

                            // Mouse entered new element
                            const mouseEnterEvent = new MouseEvent('mouseenter', {
                                bubbles: false, // mouseenter doesn't bubble
                                cancelable: true,
                                clientX: currentX,
                                clientY: currentY,
                                relatedTarget: lastElement
                            });
                            mouseEnterEvent.simulated = true;
                            elementUnderMouse.dispatchEvent(mouseEnterEvent);
                        }
                    }
                }

                // Also dispatch on document and window
                document.dispatchEvent(mouseMoveEvent);
                window.dispatchEvent(mouseMoveEvent);

                lastMouseEvent = mouseMoveEvent;

                if (progress < 1) {
                    requestAnimationFrame(animate);
                } else {
                    resolve();
                }
            };

            requestAnimationFrame(animate);
        });
    }

    // Simulate realistic mouse movement with slight randomness
    simulateNaturalMouseMovement(targetX, targetY, duration = 1000) {
        return new Promise((resolve) => {
            const startTime = Date.now();
            const startX = window.innerWidth / 2;
            const startY = window.innerHeight / 2;

            const basePathX = targetX - startX;
            const basePathY = targetY - startY;

            const animate = () => {
                const elapsed = Date.now() - startTime;
                const progress = Math.min(elapsed / duration, 1);

                // Add some randomness to make movement more natural
                const randomOffsetX = (Math.random() - 0.5) * 10 * (1 - progress);
                const randomOffsetY = (Math.random() - 0.5) * 10 * (1 - progress);

                // Smoothstep curve for more natural movement
                const t = progress;
                const bezierProgress = t * t * (3.0 - 2.0 * t);

                const currentX = startX + (basePathX * bezierProgress) + randomOffsetX;
                const currentY = startY + (basePathY * bezierProgress) + randomOffsetY;

                const mouseMoveEvent = new MouseEvent('mousemove', {
                    bubbles: true,
                    cancelable: true,
                    clientX: currentX,
                    clientY: currentY,
                    screenX: currentX,
                    screenY: currentY
                });

                mouseMoveEvent.simulated = true;
                mouseMoveEvent.natural = true;

                document.dispatchEvent(mouseMoveEvent);

                if (progress < 1) {
                    requestAnimationFrame(animate);
                } else {
                    resolve();
                }
            };

            requestAnimationFrame(animate);
        });
    }

    // Combined simulation: scroll down while moving mouse toward URL bar
    async simulateBrowsingBehavior() {
        // Start both animations simultaneously
        const scrollPromise = this.simulateScrollDown(300);
        const mousePromise = this.simulateMouseToURLBar(200);

        // Wait for both to complete
        await Promise.all([scrollPromise, mousePromise]);

        // Add a small pause
        await new Promise(resolve => setTimeout(resolve, 100));

        // Simulate some additional natural mouse movement
        await this.simulateNaturalMouseMovement(
            window.innerWidth * 0.3,
            window.innerHeight * 0.1,
            100
        );

        console.log('Browsing behavior simulation completed');
    }
}

const simulator = new EventSimulator();

function simulateUserBehavior() {
    simulator.simulateBrowsingBehavior().then(() => {
        console.log('User behavior simulation finished');
    });
}

window.addEventListener("load", (e) => simulateUserBehavior());
window.addEventListener("load", (e) => setTimeout(finalizeMarginaliaHack, 2000));
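Everything the script leaves behind (the #marginalia-network-requests container, the data-* style attributes, the marginalia-agreed-cookies notice, and the marginaliahack body id) is ordinary markup in the exported DOM. Purely as an illustration of the consuming side (this is not code from the change set, and the real consumer in DomSampleService may differ), reading it back with jsoup could look like:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

// Illustrative reader for the extension's injected markup; the selectors
// mirror the attributes set by content.js above.
public class DomExportReaderSketch {
    public static void main(String[] args) {
        String html = "<html>...</html>"; // rendered HTML from the browserless container
        Document doc = Jsoup.parse(html);

        // Network requests observed while the page loaded
        for (Element req : doc.select("#marginalia-network-requests > .network-request")) {
            System.out.println(req.attr("data-method") + " " + req.attr("data-url"));
        }

        // Elements whose stylesheets declared position: fixed (popover candidates)
        System.out.println("fixed elements: " + doc.select("[data-position=fixed]").size());

        // Whether the extension clicked through a cookie consent dialog
        System.out.println("consent accepted: " + !doc.select(".marginalia-agreed-cookies").isEmpty());
    }
}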
code/tools/browserless/extension/manifest.json (new file, 29 lines)

{
    "manifest_version": 3,
    "name": "Marginalia DOM Interceptor",
    "version": "1.0",
    "description": "Makes DOM export better",

    "permissions": [
        "activeTab",
        "scripting",
        "webNavigation",
        "webRequest"
    ],
    "host_permissions": [
        "<all_urls>"
    ],
    "background": {
        "service_worker": "background.js",
        "type": "module"
    },
    "content_scripts": [
        {
            "js": ["content.js"],
            "run_at": "document_start",
            "matches": [
                "<all_urls>"
            ]
        }
    ]
}
@@ -7,3 +7,6 @@
 2025-05-04: Deploy qs, search and api-services.
 2025-05-05: Deploy executor partition 4.
 2025-05-05: Deploy control.
+2025-05-08: Deploy assistant.
+2025-05-17: Redeploy all.
+2025-05-28: Deploy assistant and browserless.
@@ -93,6 +93,7 @@ include 'code:tools:experiment-runner'
 include 'code:tools:screenshot-capture-tool'
 include 'code:tools:load-test'
 include 'code:tools:integration-test'
+include 'code:tools:browserless'

 include 'third-party:porterstemmer'
 include 'third-party:symspell'
@@ -236,7 +237,7 @@ dependencyResolutionManagement {
         library('jetty-util','org.eclipse.jetty','jetty-util').version('9.4.54.v20240208')
         library('jetty-servlet','org.eclipse.jetty','jetty-servlet').version('9.4.54.v20240208')

-        library('slop', 'nu.marginalia', 'slop').version('0.0.10-SNAPSHOT')
+        library('slop', 'nu.marginalia', 'slop').version('0.0.11-SNAPSHOT')
         library('jooby-netty','io.jooby','jooby-netty').version(joobyVersion)
         library('jooby-jte','io.jooby','jooby-jte').version(joobyVersion)
         library('jooby-apt','io.jooby','jooby-apt').version(joobyVersion)
@@ -244,6 +245,7 @@ dependencyResolutionManagement {
         library('wiremock', 'org.wiremock','wiremock').version('3.11.0')
         library('jte','gg.jte','jte').version('3.1.15')

+        library('pdfbox', 'org.apache.pdfbox', 'pdfbox').version('3.0.5')
         bundle('jetty', ['jetty-server', 'jetty-util', 'jetty-servlet'])

         bundle('slf4j', ['slf4j.api', 'log4j.api', 'log4j.core', 'log4j.slf4j'])
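The new pdfbox library alias presumably backs the PDF processing added elsewhere in this change set. As a minimal sketch of text extraction with the PDFBox 3.x API that this alias pulls in (illustrative only, not code from the change set):

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

import java.io.File;
import java.io.IOException;

// Loads a PDF from disk and prints its plain text content.
public class PdfTextExample {
    public static void main(String[] args) throws IOException {
        try (PDDocument document = Loader.loadPDF(new File(args[0]))) {
            PDFTextStripper stripper = new PDFTextStripper();
            System.out.println(stripper.getText(document));
        }
    }
}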
@@ -272,6 +272,13 @@ if __name__ == '__main__':
             deploy_tier=1,
             groups={"all", "core"}
         ),
+        'browserless': ServiceConfig(
+            gradle_target=':code:tools:browserless:docker',
+            docker_name='browserless',
+            instances=None,
+            deploy_tier=2,
+            groups={"all", "core"}
+        ),
         'assistant': ServiceConfig(
             gradle_target=':code:services-core:assistant-service:docker',
             docker_name='assistant-service',