Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-10-06 07:32:38 +02:00

Compare commits: deploy-015...deploy-023 (153 commits)
Commits:

13fb1efce4, c1225165b7, 67ad7a3bbc, ed62ec8a35, 42b24cfa34, 1ffaab2da6, 5f93c7f767, 4001c68c82, 6b811489c5,
e9d317c65d, 16b05a4737, 021cd73cbb, 4253bd53b5, 14c87461a5, 9afed0a18e, afad4deb94, f071c947e4, 79996c9348,
db907ab06a, c49cd9dd95, eec9df3b0a, e5f3288de6, d587544d3a, 1a9ae1bc40, e0c81e956a, 542fb12b38, 65ec734566,
10b6a25c63, 6260f6bec7, d6d5467696, 034560ca75, e994fddae4, 345f01f306, 5a8e286689, 39a055aa94, 37aaa90dc9,
24022c5adc, 1de9ecc0b6, 9b80245ea0, 4e1595c1a6, 0be8585fa5, a0fe070fe7, abe9da0fc6, 56d0128b0a, 840b68ac55,
c34ff6d6c3, 32780967d8, 7330bc489d, ea23f33738, 4a8a028118, a25bc647be, a720dba3a2, 284f382867, a80717f138,
d6da715fa4, c1ec7aa491, 3daf37e283, 44a774d3a8, 597aeaf496, 06df7892c2, dc26854268, 9f16326cba, ed66d0b3a7,
c3afc82dad, 08e25e539e, 4946044dd0, edf382e1c5, 644cba32e4, 34b76390b2, 43cd507971, cc40e99fdc, 8a944cf4c6,
1c128e6d82, be039d1a8c, 4edc0d3267, 890f521d0d, b1814a30f7, f59a9eb025, 599534806b, 7e8253dac7, 97a6780ea3,
eb634beec8, 269ebd1654, 39ce40bfeb, c187b2e1c1, 42eaa4588b, 4f40a5fbeb, 3f3d42bc01, 61c8d53e1b, a7a3d85be9,
306232fb54, 5aef844f0d, d56b5c828a, ab58a4636f, 00be269238, 879e6a9424, fba3455732, 14283da7f5, 93df4d1fc0,
b12a0b998c, 3b6f4e321b, 8428111771, e9fd4415ef, 4c95c3dcad, c5281536fb, 4431dae7ac, 4df4d0a7a8, 9f05083b94,
fc92e9b9c0, 328fb5d927, 36889950e8, c96a94878b, 1c57d7d73a, a443d22356, aa59d4afa4, df0f18d0e7, 0819d46f97,
5e2b63473e, f9590703f1, f12fc11337, c309030184, fd5af01629, d4c43c7a79, 18700e1919, 120b431998, 71dad99326,
c1e8afdf86, fa32dddc24, a266fcbf30, 6e47e58e0e, 9dc43d8b4a, 83967e3305, 4db980a291, 089b177868, 9c8e9a68d5,
413d5cc788, 58539b92ac, fe72f16df1, b49a244a2e, 3f0b4c010f, c6e0cd93f7, 80a7ccb080, 54dec347c4, d6ee3f0785,
8be88afcf3, 0e3c00d3e1, 4279a7f1aa, 251006d4f9, c3e99dc12a, aaaa2de022, fc1388422a, b07080db16, e9d86dca4a
ROADMAP.md

@@ -38,14 +38,6 @@ associated with each language added, at least a models file or two, as well as s
 
 It would be very helpful to find a speaker of a large language other than English to help in the fine tuning.
 
-## Support for binary formats like PDF
-
-The crawler needs to be modified to retain them, and the conversion logic needs to parse them.
-The documents database probably should have some sort of flag indicating it's a PDF as well.
-
-PDF parsing is known to be a bit of a security liability so some thought needs to be put in
-that direction as well.
-
 ## Custom ranking logic
 
 Stract does an interesting thing where they have configurable search filters.
@@ -66,6 +58,14 @@ One of the search engine's biggest limitations right now is that it does not ind
 
 # Completed
 
+## Support for binary formats like PDF (COMPLETED 2025-05)
+
+The crawler needs to be modified to retain them, and the conversion logic needs to parse them.
+The documents database probably should have some sort of flag indicating it's a PDF as well.
+
+PDF parsing is known to be a bit of a security liability so some thought needs to be put in
+that direction as well.
+
 ## Web Design Overhaul (COMPLETED 2025-01)
 
 The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
@@ -1,3 +1,8 @@
 package nu.marginalia;
 
+/**
+ * A record representing a User Agent.
+ * @param uaString - the header value of the User Agent
+ * @param uaIdentifier - what we look for in robots.txt
+ */
 public record UserAgent(String uaString, String uaIdentifier) {}
@@ -0,0 +1,5 @@
CREATE TABLE IF NOT EXISTS WMSA_prod.NSFW_DOMAINS (
    ID INT NOT NULL AUTO_INCREMENT,
    TIER INT NOT NULL,
    PRIMARY KEY (ID)
);
@@ -0,0 +1,213 @@

-- Create metadata tables for domain ping status and security information

-- These are not ICMP pings, but rather HTTP(S) pings to check the availability and security
-- of web servers associated with domains, to assess uptime and changes in security configurations
-- indicating ownership changes or security issues.

-- Note: DOMAIN_ID and NODE_ID are used to identify the domain and the node that performed the ping.
-- These are strictly speaking foreign keys to the EC_DOMAIN table, but as it
-- is strictly append-only, we do not need to enforce foreign key constraints.

CREATE TABLE IF NOT EXISTS DOMAIN_AVAILABILITY_INFORMATION (
    DOMAIN_ID INT NOT NULL PRIMARY KEY,
    NODE_ID INT NOT NULL,

    SERVER_AVAILABLE BOOLEAN NOT NULL,       -- Indicates if the server is available (true) or not (false)
    SERVER_IP VARBINARY(16),                 -- IP address of the server (IPv4 or IPv6)
    SERVER_IP_ASN INTEGER,                   -- Autonomous System number

    DATA_HASH BIGINT,                        -- Hash of the data for integrity checks
    SECURITY_CONFIG_HASH BIGINT,             -- Hash of the security configuration for integrity checks

    HTTP_SCHEMA ENUM('HTTP', 'HTTPS'),       -- HTTP or HTTPS protocol used
    HTTP_ETAG VARCHAR(255),                  -- ETag of the resource as per HTTP headers
    HTTP_LAST_MODIFIED VARCHAR(255),         -- Last modified date of the resource as per HTTP headers
    HTTP_STATUS INT,                         -- HTTP status code (e.g., 200, 404, etc.)
    HTTP_LOCATION VARCHAR(255),              -- If the server redirects, this is the location of the redirect
    HTTP_RESPONSE_TIME_MS SMALLINT UNSIGNED, -- Response time in milliseconds

    ERROR_CLASSIFICATION ENUM('NONE', 'TIMEOUT', 'SSL_ERROR', 'DNS_ERROR', 'CONNECTION_ERROR', 'HTTP_CLIENT_ERROR', 'HTTP_SERVER_ERROR', 'UNKNOWN'), -- Classification of the error if the server is not available
    ERROR_MESSAGE VARCHAR(255),              -- Error message if the server is not available

    TS_LAST_PING TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, -- Timestamp of the last ping
    TS_LAST_AVAILABLE TIMESTAMP,             -- Timestamp of the last time the server was available
    TS_LAST_ERROR TIMESTAMP,                 -- Timestamp of the last error encountered

    NEXT_SCHEDULED_UPDATE TIMESTAMP NOT NULL,
    BACKOFF_CONSECUTIVE_FAILURES INT NOT NULL DEFAULT 0, -- Number of consecutive failures to ping the server
    BACKOFF_FETCH_INTERVAL INT NOT NULL DEFAULT 60       -- Interval in seconds for the next scheduled ping
) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;

CREATE INDEX IF NOT EXISTS DOMAIN_AVAILABILITY_INFORMATION__NODE_ID__DOMAIN_ID_IDX ON DOMAIN_AVAILABILITY_INFORMATION (NODE_ID, DOMAIN_ID);
CREATE INDEX IF NOT EXISTS DOMAIN_AVAILABILITY_INFORMATION__NEXT_SCHEDULED_UPDATE_IDX ON DOMAIN_AVAILABILITY_INFORMATION (NODE_ID, NEXT_SCHEDULED_UPDATE);
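The second index suggests availability pings are scheduled per node by due time. A minimal sketch of how a ping worker might select its next batch of work, assuming a HikariDataSource as used elsewhere in this codebase; the surrounding class, method name and batch size are hypothetical:

import com.zaxxer.hikari.HikariDataSource;

import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

class PingScheduler {
    // Hypothetical work-selection sketch; the table and index come from the migration
    // above, but the connection handling and LIMIT are assumptions.
    List<Integer> selectDomainsDueForPing(HikariDataSource dataSource, int nodeId) throws SQLException {
        List<Integer> due = new ArrayList<>();
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("""
                     SELECT DOMAIN_ID FROM DOMAIN_AVAILABILITY_INFORMATION
                     WHERE NODE_ID = ? AND NEXT_SCHEDULED_UPDATE <= NOW()
                     ORDER BY NEXT_SCHEDULED_UPDATE
                     LIMIT 100
                     """)) {
            stmt.setInt(1, nodeId);
            var rs = stmt.executeQuery();
            while (rs.next()) {
                due.add(rs.getInt(1));
            }
        }
        return due;
    }
}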
CREATE TABLE IF NOT EXISTS DOMAIN_SECURITY_INFORMATION (
    DOMAIN_ID INT NOT NULL PRIMARY KEY,
    NODE_ID INT NOT NULL,

    ASN INTEGER,                             -- Autonomous System Number (ASN) of the server
    HTTP_SCHEMA ENUM('HTTP', 'HTTPS'),       -- HTTP or HTTPS protocol used
    HTTP_VERSION VARCHAR(10),                -- HTTP version used (e.g., HTTP/1.1, HTTP/2)
    HTTP_COMPRESSION VARCHAR(50),            -- Compression method used (e.g., gzip, deflate, br)
    HTTP_CACHE_CONTROL TEXT,                 -- Cache control directives from HTTP headers

    SSL_CERT_NOT_BEFORE TIMESTAMP,           -- Valid from date (usually same as issued)
    SSL_CERT_NOT_AFTER TIMESTAMP,            -- Valid until date (usually same as expires)

    SSL_CERT_ISSUER VARCHAR(255),            -- CA that issued the cert
    SSL_CERT_SUBJECT VARCHAR(255),           -- Certificate subject/CN

    SSL_CERT_PUBLIC_KEY_HASH BINARY(32),     -- SHA-256 hash of the public key
    SSL_CERT_SERIAL_NUMBER VARCHAR(100),     -- Unique cert serial number
    SSL_CERT_FINGERPRINT_SHA256 BINARY(32),  -- SHA-256 fingerprint for exact identification
    SSL_CERT_SAN TEXT,                       -- Subject Alternative Names (JSON array)
    SSL_CERT_WILDCARD BOOLEAN,               -- Wildcard certificate (*.example.com)

    SSL_PROTOCOL VARCHAR(20),                -- TLS 1.2, TLS 1.3, etc.
    SSL_CIPHER_SUITE VARCHAR(100),           -- e.g., TLS_AES_256_GCM_SHA384
    SSL_KEY_EXCHANGE VARCHAR(50),            -- ECDHE, RSA, etc.
    SSL_CERTIFICATE_CHAIN_LENGTH TINYINT,    -- Number of certs in chain

    SSL_CERTIFICATE_VALID BOOLEAN,           -- Valid cert chain

    HEADER_CORS_ALLOW_ORIGIN TEXT,           -- Could be *, specific domains, or null
    HEADER_CORS_ALLOW_CREDENTIALS BOOLEAN,   -- Credential handling
    HEADER_CONTENT_SECURITY_POLICY_HASH INT, -- CSP header, hash of the policy
    HEADER_STRICT_TRANSPORT_SECURITY VARCHAR(255), -- HSTS header
    HEADER_REFERRER_POLICY VARCHAR(50),      -- Referrer handling
    HEADER_X_FRAME_OPTIONS VARCHAR(50),      -- Clickjacking protection
    HEADER_X_CONTENT_TYPE_OPTIONS VARCHAR(50), -- MIME sniffing protection
    HEADER_X_XSS_PROTECTION VARCHAR(50),     -- XSS protection header

    HEADER_SERVER VARCHAR(255),              -- Server header (e.g., Apache, Nginx, etc.)
    HEADER_X_POWERED_BY VARCHAR(255),        -- X-Powered-By header (if present)

    TS_LAST_UPDATE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP -- Timestamp of the last SSL check
) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;

CREATE INDEX IF NOT EXISTS DOMAIN_SECURITY_INFORMATION__NODE_ID__DOMAIN_ID_IDX ON DOMAIN_SECURITY_INFORMATION (NODE_ID, DOMAIN_ID);

CREATE TABLE IF NOT EXISTS DOMAIN_SECURITY_EVENTS (
    CHANGE_ID BIGINT AUTO_INCREMENT PRIMARY KEY, -- Unique identifier for the change
    DOMAIN_ID INT NOT NULL,                  -- Domain ID, used as a foreign key to EC_DOMAIN
    NODE_ID INT NOT NULL,

    TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, -- Timestamp of the change

    CHANGE_ASN BOOLEAN NOT NULL DEFAULT FALSE,                     -- Indicates if the change is related to ASN (Autonomous System Number)
    CHANGE_CERTIFICATE_FINGERPRINT BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to SSL certificate fingerprint
    CHANGE_CERTIFICATE_PROFILE BOOLEAN NOT NULL DEFAULT FALSE,     -- Indicates if the change is related to SSL certificate profile (e.g., algorithm, exchange)
    CHANGE_CERTIFICATE_SAN BOOLEAN NOT NULL DEFAULT FALSE,         -- Indicates if the change is related to SSL certificate SAN (Subject Alternative Name)
    CHANGE_CERTIFICATE_PUBLIC_KEY BOOLEAN NOT NULL DEFAULT FALSE,  -- Indicates if the change is related to SSL certificate public key
    CHANGE_SECURITY_HEADERS BOOLEAN NOT NULL DEFAULT FALSE,        -- Indicates if the change is related to security headers
    CHANGE_IP_ADDRESS BOOLEAN NOT NULL DEFAULT FALSE,              -- Indicates if the change is related to IP address
    CHANGE_SOFTWARE BOOLEAN NOT NULL DEFAULT FALSE,                -- Indicates if the change is related to the generator (e.g., web server software)
    OLD_CERT_TIME_TO_EXPIRY INT,                                   -- Time to expiry of the old certificate in hours, if applicable

    SECURITY_SIGNATURE_BEFORE BLOB NOT NULL, -- Security signature before the change, gzipped json record
    SECURITY_SIGNATURE_AFTER BLOB NOT NULL   -- Security signature after the change, gzipped json record
) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;

CREATE INDEX IF NOT EXISTS DOMAIN_SECURITY_EVENTS__NODE_ID__DOMAIN_ID_IDX ON DOMAIN_SECURITY_EVENTS (NODE_ID, DOMAIN_ID);
CREATE INDEX IF NOT EXISTS DOMAIN_SECURITY_EVENTS__TS_CHANGE_IDX ON DOMAIN_SECURITY_EVENTS (TS_CHANGE);
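Because the events table carries one flag per change type, a consumer can cheaply ask questions like "which domains changed certificates recently". A hedged sketch using the TS_CHANGE index above; the query shape and reporting method are assumptions, not code from the repo:

import com.zaxxer.hikari.HikariDataSource;

import java.sql.SQLException;

class SecurityEventReport {
    // Hypothetical query: domains whose certificate fingerprint changed in the
    // last 24 hours, newest first.
    void printRecentCertChanges(HikariDataSource dataSource) throws SQLException {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("""
                     SELECT DOMAIN_ID, TS_CHANGE, OLD_CERT_TIME_TO_EXPIRY
                     FROM DOMAIN_SECURITY_EVENTS
                     WHERE CHANGE_CERTIFICATE_FINGERPRINT
                       AND TS_CHANGE > NOW() - INTERVAL 24 HOUR
                     ORDER BY TS_CHANGE DESC
                     """)) {
            var rs = stmt.executeQuery();
            while (rs.next()) {
                System.out.printf("domain %d changed cert at %s%n", rs.getInt(1), rs.getTimestamp(2));
            }
        }
    }
}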
CREATE TABLE IF NOT EXISTS DOMAIN_AVAILABILITY_EVENTS (
    DOMAIN_ID INT NOT NULL,
    NODE_ID INT NOT NULL,

    AVAILABLE BOOLEAN NOT NULL,              -- True if the service is available, false if it is not
    OUTAGE_TYPE ENUM('NONE', 'TIMEOUT', 'SSL_ERROR', 'DNS_ERROR', 'CONNECTION_ERROR', 'HTTP_CLIENT_ERROR', 'HTTP_SERVER_ERROR', 'UNKNOWN') NOT NULL,
    HTTP_STATUS_CODE INT,                    -- HTTP status code if available (e.g., 200, 404, etc.)
    ERROR_MESSAGE VARCHAR(255),              -- Specific error details

    TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, -- Timestamp of the last update

    AVAILABILITY_RECORD_ID BIGINT AUTO_INCREMENT,
    P_KEY_MONTH TINYINT NOT NULL DEFAULT MONTH(TS_CHANGE),  -- Month of the change for partitioning
    PRIMARY KEY (AVAILABILITY_RECORD_ID, P_KEY_MONTH)
)
CHARACTER SET utf8mb4 COLLATE utf8mb4_bin
PARTITION BY RANGE (P_KEY_MONTH) (
    PARTITION p0 VALUES LESS THAN (1),   -- January
    PARTITION p1 VALUES LESS THAN (2),   -- February
    PARTITION p2 VALUES LESS THAN (3),   -- March
    PARTITION p3 VALUES LESS THAN (4),   -- April
    PARTITION p4 VALUES LESS THAN (5),   -- May
    PARTITION p5 VALUES LESS THAN (6),   -- June
    PARTITION p6 VALUES LESS THAN (7),   -- July
    PARTITION p7 VALUES LESS THAN (8),   -- August
    PARTITION p8 VALUES LESS THAN (9),   -- September
    PARTITION p9 VALUES LESS THAN (10),  -- October
    PARTITION p10 VALUES LESS THAN (11), -- November
    PARTITION p11 VALUES LESS THAN (12)  -- December
);

CREATE INDEX DOMAIN_AVAILABILITY_EVENTS__DOMAIN_ID_TS_IDX ON DOMAIN_AVAILABILITY_EVENTS (DOMAIN_ID, TS_CHANGE);
CREATE INDEX DOMAIN_AVAILABILITY_EVENTS__TS_CHANGE_IDX ON DOMAIN_AVAILABILITY_EVENTS (TS_CHANGE);
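One plausible payoff of the month-based partitioning is retention management: a whole month of old events can be discarded as a cheap metadata operation rather than a slow DELETE. A hypothetical maintenance sketch; the retention policy itself is an assumption, only the partition names come from the migration above:

import java.sql.Connection;
import java.sql.SQLException;

class AvailabilityEventJanitor {
    // Drops all rows in one month partition (p0..p11, as declared above).
    // ALTER TABLE ... TRUNCATE PARTITION is standard MySQL/MariaDB partition maintenance.
    void truncateMonthPartition(Connection conn, int partitionIndex) throws SQLException {
        if (partitionIndex < 0 || partitionIndex > 11)
            throw new IllegalArgumentException("partition index must be 0..11");
        try (var stmt = conn.createStatement()) {
            stmt.executeUpdate("ALTER TABLE DOMAIN_AVAILABILITY_EVENTS TRUNCATE PARTITION p" + partitionIndex);
        }
    }
}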
CREATE TABLE IF NOT EXISTS DOMAIN_DNS_INFORMATION (
    DNS_ROOT_DOMAIN_ID INT AUTO_INCREMENT PRIMARY KEY,
    ROOT_DOMAIN_NAME VARCHAR(255) NOT NULL UNIQUE,
    NODE_AFFINITY INT NOT NULL,              -- Node ID that performs the DNS check, assign randomly across nodes

    DNS_A_RECORDS TEXT,                      -- JSON array of IPv4 addresses
    DNS_AAAA_RECORDS TEXT,                   -- JSON array of IPv6 addresses
    DNS_CNAME_RECORD VARCHAR(255),           -- Canonical name (if applicable)
    DNS_MX_RECORDS TEXT,                     -- JSON array of mail exchange records
    DNS_CAA_RECORDS TEXT,                    -- Certificate Authority Authorization
    DNS_TXT_RECORDS TEXT,                    -- TXT records (SPF, DKIM, verification, etc.)
    DNS_NS_RECORDS TEXT,                     -- Name servers (JSON array)
    DNS_SOA_RECORD TEXT,                     -- Start of Authority (JSON object)

    TS_LAST_DNS_CHECK TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    TS_NEXT_DNS_CHECK TIMESTAMP NOT NULL,
    DNS_CHECK_PRIORITY TINYINT DEFAULT 0     -- Priority of the DNS check, in case we want to schedule a refresh sooner
) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;

CREATE INDEX DOMAIN_DNS_INFORMATION__PRIORITY_NEXT_CHECK_IDX ON DOMAIN_DNS_INFORMATION (NODE_AFFINITY, DNS_CHECK_PRIORITY DESC, TS_NEXT_DNS_CHECK);

CREATE TABLE IF NOT EXISTS DOMAIN_DNS_EVENTS (
    DNS_ROOT_DOMAIN_ID INT NOT NULL,
    NODE_ID INT NOT NULL,

    TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,

    -- DNS change type flags
    CHANGE_A_RECORDS BOOLEAN NOT NULL DEFAULT FALSE,    -- IPv4 address changes
    CHANGE_AAAA_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- IPv6 address changes
    CHANGE_CNAME BOOLEAN NOT NULL DEFAULT FALSE,        -- CNAME changes
    CHANGE_MX_RECORDS BOOLEAN NOT NULL DEFAULT FALSE,   -- Mail server changes
    CHANGE_CAA_RECORDS BOOLEAN NOT NULL DEFAULT FALSE,  -- Certificate authority changes
    CHANGE_TXT_RECORDS BOOLEAN NOT NULL DEFAULT FALSE,  -- TXT record changes (SPF, DKIM, etc.)
    CHANGE_NS_RECORDS BOOLEAN NOT NULL DEFAULT FALSE,   -- Name server changes (big red flag!)
    CHANGE_SOA_RECORD BOOLEAN NOT NULL DEFAULT FALSE,   -- Start of Authority changes

    DNS_SIGNATURE_BEFORE BLOB NOT NULL, -- Compressed JSON snapshot of DNS records before change
    DNS_SIGNATURE_AFTER BLOB NOT NULL,  -- Compressed JSON snapshot of DNS records after change

    DNS_EVENT_ID BIGINT AUTO_INCREMENT,
    P_KEY_MONTH TINYINT NOT NULL DEFAULT MONTH(TS_CHANGE), -- Month of the change for partitioning
    PRIMARY KEY (DNS_EVENT_ID, P_KEY_MONTH)
)
CHARACTER SET utf8mb4 COLLATE utf8mb4_bin
PARTITION BY RANGE (P_KEY_MONTH) (
    PARTITION p0 VALUES LESS THAN (1),   -- January
    PARTITION p1 VALUES LESS THAN (2),   -- February
    PARTITION p2 VALUES LESS THAN (3),   -- March
    PARTITION p3 VALUES LESS THAN (4),   -- April
    PARTITION p4 VALUES LESS THAN (5),   -- May
    PARTITION p5 VALUES LESS THAN (6),   -- June
    PARTITION p6 VALUES LESS THAN (7),   -- July
    PARTITION p7 VALUES LESS THAN (8),   -- August
    PARTITION p8 VALUES LESS THAN (9),   -- September
    PARTITION p9 VALUES LESS THAN (10),  -- October
    PARTITION p10 VALUES LESS THAN (11), -- November
    PARTITION p11 VALUES LESS THAN (12)  -- December
);

CREATE INDEX DOMAIN_DNS_EVENTS__DNS_ROOT_DOMAIN_ID_TS_IDX ON DOMAIN_DNS_EVENTS (DNS_ROOT_DOMAIN_ID, TS_CHANGE);
CREATE INDEX DOMAIN_DNS_EVENTS__TS_CHANGE_IDX ON DOMAIN_DNS_EVENTS (TS_CHANGE);
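The DNS tables follow the same work-queue pattern as the availability table, with an explicit priority knob: the (NODE_AFFINITY, DNS_CHECK_PRIORITY DESC, TS_NEXT_DNS_CHECK) index lines up with a selection query like the following hedged fragment (an assumption about the consumer, not repo code; conn and nodeId as in the earlier sketches):

// Hypothetical selection of due DNS checks for one node, highest priority first.
var stmt = conn.prepareStatement("""
        SELECT DNS_ROOT_DOMAIN_ID, ROOT_DOMAIN_NAME
        FROM DOMAIN_DNS_INFORMATION
        WHERE NODE_AFFINITY = ? AND TS_NEXT_DNS_CHECK <= NOW()
        ORDER BY DNS_CHECK_PRIORITY DESC, TS_NEXT_DNS_CHECK
        LIMIT 100
        """);
stmt.setInt(1, nodeId);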
@@ -0,0 +1,6 @@
-- Add additional summary columns to DOMAIN_SECURITY_EVENTS table
-- to make it easier to make sense of certificate changes

ALTER TABLE DOMAIN_SECURITY_EVENTS ADD COLUMN CHANGE_CERTIFICATE_SERIAL_NUMBER BOOLEAN NOT NULL DEFAULT FALSE;
ALTER TABLE DOMAIN_SECURITY_EVENTS ADD COLUMN CHANGE_CERTIFICATE_ISSUER BOOLEAN NOT NULL DEFAULT FALSE;
OPTIMIZE TABLE DOMAIN_SECURITY_EVENTS;
@@ -0,0 +1,24 @@
package nu.marginalia.model;

public enum DocumentFormat {
    PLAIN(0, 1, "text"),
    PDF(0, 1, "pdf"),
    UNKNOWN(0, 1, "???"),
    HTML123(0, 1, "html"),
    HTML4(-0.1, 1.05, "html"),
    XHTML(-0.1, 1.05, "html"),
    HTML5(0.5, 1.1, "html");

    /** Used to tune quality score */
    public final double offset;
    /** Used to tune quality score */
    public final double scale;
    public final String shortFormat;

    DocumentFormat(double offset, double scale, String shortFormat) {
        this.offset = offset;
        this.scale = scale;
        this.shortFormat = shortFormat;
    }

}
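How offset and scale are applied is not visible in this diff; they are only documented as quality-score tuning knobs. Purely as an assumption, a linear adjustment would look like:

// Assumed shape of the quality adjustment; the real formula lives in the
// ranking code and may differ.
static double tunedQuality(double rawQuality, DocumentFormat format) {
    return rawQuality * format.scale + format.offset;
}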
@@ -112,14 +112,6 @@ public class EdgeDomain implements Serializable {
         return topDomain;
     }
 
-    public String getDomainKey() {
-        int cutPoint = topDomain.indexOf('.');
-        if (cutPoint < 0) {
-            return topDomain;
-        }
-        return topDomain.substring(0, cutPoint).toLowerCase();
-    }
-
     /** If possible, try to provide an alias domain,
      * i.e. a domain name that is very likely to link to this one
      * */
@@ -1,16 +1,14 @@
 package nu.marginalia.model;
 
 import nu.marginalia.util.QueryParams;
+import org.apache.commons.lang3.StringUtils;
 
 import javax.annotation.Nullable;
 import java.io.Serializable;
-import java.net.MalformedURLException;
-import java.net.URI;
-import java.net.URISyntaxException;
-import java.net.URL;
+import java.net.*;
 import java.nio.charset.StandardCharsets;
 import java.util.Objects;
 import java.util.Optional;
 import java.util.regex.Pattern;
 
 public class EdgeUrl implements Serializable {
     public final String proto;
@@ -33,7 +31,7 @@ public class EdgeUrl implements Serializable {
 
     private static URI parseURI(String url) throws URISyntaxException {
         try {
-            return new URI(urlencodeFixer(url));
+            return EdgeUriFactory.parseURILenient(url);
         } catch (URISyntaxException ex) {
             throw new URISyntaxException("Failed to parse URI '" + url + "'", ex.getMessage());
         }
@@ -51,58 +49,6 @@ public class EdgeUrl implements Serializable {
         }
     }
 
-    private static Pattern badCharPattern = Pattern.compile("[ \t\n\"<>\\[\\]()',|]");
-
-    /* Java's URI parser is a bit too strict in throwing exceptions when there's an error.
-
-       Here on the Internet, standards are like the picture on the box of the frozen pizza,
-       and what you get is more like what's on the inside, we try to patch things instead,
-       just give it a best-effort attempt att cleaning out broken or unnecessary constructions
-       like bad or missing URLEncoding
-     */
-    public static String urlencodeFixer(String url) throws URISyntaxException {
-        var s = new StringBuilder();
-        String goodChars = "&.?:/-;+$#";
-        String hexChars = "0123456789abcdefABCDEF";
-
-        int pathIdx = findPathIdx(url);
-        if (pathIdx < 0) { // url looks like http://marginalia.nu
-            return url + "/";
-        }
-        s.append(url, 0, pathIdx);
-
-        // We don't want the fragment, and multiple fragments breaks the Java URIParser for some reason
-        int end = url.indexOf("#");
-        if (end < 0) end = url.length();
-
-        for (int i = pathIdx; i < end; i++) {
-            int c = url.charAt(i);
-
-            if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
-                s.appendCodePoint(c);
-            } else if (c == '%' && i + 2 < end) {
-                int cn = url.charAt(i + 1);
-                int cnn = url.charAt(i + 2);
-                if (hexChars.indexOf(cn) >= 0 && hexChars.indexOf(cnn) >= 0) {
-                    s.appendCodePoint(c);
-                } else {
-                    s.append("%25");
-                }
-            } else {
-                s.append(String.format("%%%02X", c));
-            }
-        }
-
-        return s.toString();
-    }
-
-    private static int findPathIdx(String url) throws URISyntaxException {
-        int colonIdx = url.indexOf(':');
-        if (colonIdx < 0 || colonIdx + 2 >= url.length()) {
-            throw new URISyntaxException(url, "Lacking protocol");
-        }
-        return url.indexOf('/', colonIdx + 2);
-    }
-
     public EdgeUrl(URI URI) {
         try {
@@ -166,11 +112,32 @@ public class EdgeUrl implements Serializable {
             sb.append(port);
         }
 
+        EdgeUriFactory.urlencodePath(sb, path);
+
+        if (param != null) {
+            EdgeUriFactory.urlencodeQuery(sb, param);
+        }
+
+        return sb.toString();
+    }
+
+    public String toDisplayString() {
+        StringBuilder sb = new StringBuilder(256);
+
+        sb.append(proto);
+        sb.append("://");
+        sb.append(domain);
+
+        if (port != null) {
+            sb.append(':');
+            sb.append(port);
+        }
+
         sb.append(path);
 
         if (param != null) {
-            sb.append('?');
-            sb.append(param);
+            sb.append('?').append(param);
         }
 
         return sb.toString();
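The net effect of the hunk above is that toString() now re-encodes the path and query through EdgeUriFactory, while the new toDisplayString() emits them verbatim. In the sketch below, the first expected value is asserted in the tests further down; the display form is inferred from the code above:

// Contrast of the two renderings (new EdgeUrl(String) throws URISyntaxException).
EdgeUrl url = new EdgeUrl("https://en.wikipedia.org/wiki/Sámi");
System.out.println(url.toString());        // https://en.wikipedia.org/wiki/S%C3%A1mi (asserted in EdgeUrlTest below)
System.out.println(url.toDisplayString()); // https://en.wikipedia.org/wiki/Sámi (inferred: path is appended raw)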
@@ -247,3 +214,244 @@ public class EdgeUrl implements Serializable {
    }

}

class EdgeUriFactory {
    public static URI parseURILenient(String url) throws URISyntaxException {

        if (shouldOmitUrlencodeRepair(url)) {
            try {
                return new URI(url);
            }
            catch (URISyntaxException ex) {
                // ignore and run the lenient parser
            }
        }

        var s = new StringBuilder(url.length()+8);

        int pathIdx = findPathIdx(url);
        if (pathIdx < 0) { // url looks like http://marginalia.nu
            return new URI(url + "/");
        }
        s.append(url, 0, pathIdx);

        // We don't want the fragment, and multiple fragments breaks the Java URIParser for some reason
        int end = url.indexOf("#");
        if (end < 0) end = url.length();

        int queryIdx = url.indexOf('?');
        if (queryIdx < 0) queryIdx = end;

        urlencodePath(s, url.substring(pathIdx, queryIdx));
        if (queryIdx < end) {
            urlencodeQuery(s, url.substring(queryIdx + 1, end));
        }
        return new URI(s.toString());
    }

    /** Break apart the path element of an URI into its components, and then
     * urlencode any component that needs it, and recombine it into a single
     * path element again.
     */
    public static void urlencodePath(StringBuilder sb, String path) {
        if (path == null || path.isEmpty()) {
            return;
        }

        String[] pathParts = StringUtils.split(path, '/');
        if (pathParts.length == 0) {
            sb.append('/');
            return;
        }

        boolean shouldUrlEncode = false;
        for (String pathPart : pathParts) {
            if (pathPart.isEmpty()) continue;

            if (needsUrlEncode(pathPart)) {
                shouldUrlEncode = true;
                break;
            }
        }

        for (String pathPart : pathParts) {
            if (pathPart.isEmpty()) continue;

            if (shouldUrlEncode) {
                sb.append('/');
                sb.append(URLEncoder.encode(pathPart, StandardCharsets.UTF_8).replace("+", "%20"));
            } else {
                sb.append('/');
                sb.append(pathPart);
            }
        }

        if (path.endsWith("/")) {
            sb.append('/');
        }

    }

    /** Break apart the query element of a URI into its components, and then
     * urlencode any component that needs it, and recombine it into a single
     * query element again.
     */
    public static void urlencodeQuery(StringBuilder sb, String param) {
        if (param == null || param.isEmpty()) {
            return;
        }

        String[] queryParts = StringUtils.split(param, '&');

        boolean shouldUrlEncode = false;
        for (String queryPart : queryParts) {
            if (queryPart.isEmpty()) continue;

            if (needsUrlEncode(queryPart)) {
                shouldUrlEncode = true;
                break;
            }
        }

        boolean first = true;
        for (String queryPart : queryParts) {
            if (queryPart.isEmpty()) continue;

            if (first) {
                sb.append('?');
                first = false;
            } else {
                sb.append('&');
            }

            if (shouldUrlEncode) {
                int idx = queryPart.indexOf('=');
                if (idx < 0) {
                    sb.append(URLEncoder.encode(queryPart, StandardCharsets.UTF_8));
                } else {
                    sb.append(URLEncoder.encode(queryPart.substring(0, idx), StandardCharsets.UTF_8));
                    sb.append('=');
                    sb.append(URLEncoder.encode(queryPart.substring(idx + 1), StandardCharsets.UTF_8));
                }
            } else {
                sb.append(queryPart);
            }
        }
    }

    /** Test if the url element needs URL encoding.
     * <p></p>
     * Note we may have been given an already encoded path element,
     * so we include % and + in the list of good characters
     */
    static boolean needsUrlEncode(String urlElement) {
        for (int i = 0; i < urlElement.length(); i++) {
            char c = urlElement.charAt(i);

            if (isUrlSafe(c)) continue;
            if ("+".indexOf(c) >= 0) continue;
            if (c == '%' && i + 2 < urlElement.length()) {
                char c1 = urlElement.charAt(i + 1);
                char c2 = urlElement.charAt(i + 2);
                if (isHexDigit(c1) && isHexDigit(c2)) {
                    i += 2;
                    continue;
                }
            }

            return true;
        }

        return false;
    }

    static boolean isUrlSafe(int c) {
        if (c >= 'a' && c <= 'z') return true;
        if (c >= 'A' && c <= 'Z') return true;
        if (c >= '0' && c <= '9') return true;
        if (c == '-' || c == '_' || c == '.' || c == '~') return true;

        return false;
    }

    /** Test if the URL is a valid URL that does not need to be
     * urlencoded.
     * <p></p>
     * This is a very simple heuristic test that does not guarantee
     * that the URL is valid, but it will identify cases where we
     * are fairly certain that the URL does not need encoding,
     * so we can skip a bunch of allocations and string operations
     * that would otherwise be needed to fix the URL.
     */
    static boolean shouldOmitUrlencodeRepair(String url) {
        int idx = 0;
        final int len = url.length();

        // Validate the scheme
        while (idx < len - 2) {
            char c = url.charAt(idx++);
            if (c == ':') break;
            if (!isAsciiAlphabetic(c)) return false;
        }
        if (url.charAt(idx++) != '/') return false;
        if (url.charAt(idx++) != '/') return false;

        // Validate the authority
        while (idx < len) {
            char c = url.charAt(idx++);
            if (c == '/') break;
            if (c == ':') continue;
            if (c == '@') continue;
            if (!isUrlSafe(c)) return false;
        }

        // Validate the path
        if (idx >= len) return true;

        while (idx < len) {
            char c = url.charAt(idx++);
            if (c == '?') break;
            if (c == '/') continue;
            if (c == '#') return true;
            if (!isUrlSafe(c)) return false;
        }

        if (idx >= len) return true;

        // Validate the query
        while (idx < len) {
            char c = url.charAt(idx++);
            if (c == '&') continue;
            if (c == '=') continue;
            if (c == '#') return true;
            if (!isUrlSafe(c)) return false;
        }

        return true;
    }
    private static boolean isAsciiAlphabetic(int c) {
        // Accept the full a-z/A-Z range; restricting this to hex digits would wrongly
        // reject schemes like "https" and defeat the fast path above.
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }
    private static boolean isHexDigit(int c) {
        return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
    }

    /** Find the index of the path element in a URL.
     * <p></p>
     * The path element starts after the scheme and authority part of the URL,
     * which is everything up to and including the first slash after the colon.
     */
    private static int findPathIdx(String url) throws URISyntaxException {
        int colonIdx = url.indexOf(':');
        if (colonIdx < 0 || colonIdx + 3 >= url.length()) {
            throw new URISyntaxException(url, "Lacking scheme");
        }
        return url.indexOf('/', colonIdx + 3);
    }

}
@@ -28,6 +28,8 @@ public enum HtmlFeature {
 
     GA_SPAM("special:gaspam"),
 
+    PDF("format:pdf"),
+
     /** For fingerprinting and ranking */
     OPENGRAPH("special:opengraph"),
     OPENGRAPH_IMAGE("special:opengraph:image"),
@@ -6,11 +6,20 @@ import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 
 import java.net.URISyntaxException;
+import java.time.Instant;
 
 public class GsonFactory {
     public static Gson get() {
         return new GsonBuilder()
                 .registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
+                .registerTypeAdapter(Instant.class, (JsonSerializer<Instant>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toEpochMilli()))
+                .registerTypeAdapter(Instant.class, (JsonDeserializer<Instant>) (json, typeOfT, context) -> {
+                    if (json.isJsonPrimitive() && json.getAsJsonPrimitive().isNumber()) {
+                        return Instant.ofEpochMilli(json.getAsLong());
+                    } else {
+                        throw new JsonParseException("Expected a number for Instant");
+                    }
+                })
                 .registerTypeAdapter(EdgeUrl.class, (JsonSerializer<EdgeUrl>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
                 .registerTypeAdapter(EdgeDomain.class, (JsonSerializer<EdgeDomain>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
                 .registerTypeAdapter(EdgeUrl.class, (JsonDeserializer<EdgeUrl>) (json, typeOfT, context) -> {
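With both adapters registered, Instant values round-trip as epoch-millisecond JSON numbers. A small usage sketch:

Gson gson = GsonFactory.get();

String json = gson.toJson(Instant.ofEpochMilli(1735689600000L)); // "1735689600000"
Instant back = gson.fromJson(json, Instant.class);               // equal to the input

// Note that sub-millisecond precision is dropped: Instant.now() will only
// round-trip to the nearest millisecond.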
@@ -1,22 +0,0 @@
package nu.marginalia.model.html;

// This class really doesn't belong anywhere, but will squat here for now
public enum HtmlStandard {
    PLAIN(0, 1),
    UNKNOWN(0, 1),
    HTML123(0, 1),
    HTML4(-0.1, 1.05),
    XHTML(-0.1, 1.05),
    HTML5(0.5, 1.1);

    /** Used to tune quality score */
    public final double offset;
    /** Used to tune quality score */
    public final double scale;

    HtmlStandard(double offset, double scale) {
        this.offset = offset;
        this.scale = scale;
    }

}
@@ -9,7 +9,7 @@ public enum DocumentFlags {
     GeneratorForum,
     GeneratorWiki,
     Sideloaded,
-    Unused7,
+    PdfFile,
     Unused8,
     ;
 
@@ -8,14 +8,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
 
 class EdgeDomainTest {
 
-    @Test
-    public void testSkepdic() throws URISyntaxException {
-        var domain = new EdgeUrl("http://www.skepdic.com/astrology.html");
-        assertEquals("skepdic", domain.getDomain().getDomainKey());
-        var domain2 = new EdgeUrl("http://skepdic.com/astrology.html");
-        assertEquals("skepdic", domain2.getDomain().getDomainKey());
-    }
-
     @Test
     public void testHkDomain() throws URISyntaxException {
         var domain = new EdgeUrl("http://l7072i3.l7c.net");
@@ -1,6 +1,6 @@
 package nu.marginalia.model;
 
-import nu.marginalia.model.EdgeUrl;
+import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
 
 import java.net.URISyntaxException;
@@ -21,25 +21,70 @@ class EdgeUrlTest {
             new EdgeUrl("https://memex.marginalia.nu/#here")
         );
     }
 
-    @Test
-    public void testParam() throws URISyntaxException {
-        System.out.println(new EdgeUrl("https://memex.marginalia.nu/index.php?id=1").toString());
-        System.out.println(new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
-    }
-
     @Test
-    void urlencodeFixer() throws URISyntaxException {
-        System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/#heredoc"));
-        System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign"));
-        System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%22-sign"));
-        System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/\n \"huh\""));
+    void testUriFromString() throws URISyntaxException {
+        // We test these URLs several times as we perform URLEncode-fixing both when parsing the URL and when
+        // converting it back to a string, we want to ensure there is no changes along the way.
+
+        Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/").getPath());
+        Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/").toString());
+        Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/").toString());
+
+        Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").getPath());
+        Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").toString());
+        Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/#heredoc").toString());
+
+        Assertions.assertEquals("/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").getPath());
+        Assertions.assertEquals("https://www.example.com/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").toString());
+        Assertions.assertEquals("https://www.example.com/trailingslash/", new EdgeUrl("https://www.example.com/trailingslash/").toString());
+
+        Assertions.assertEquals("/%-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").getPath());
+        Assertions.assertEquals("https://www.example.com/%25-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").toString());
+        Assertions.assertEquals("https://www.example.com/%25-sign", new EdgeUrl("https://www.example.com/%-sign").toString());
+
+        Assertions.assertEquals("/%-sign/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com//%-sign/\"-sign").getPath());
+        Assertions.assertEquals("https://www.example.com/%25-sign/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com//%-sign/\"-sign").toString());
+        Assertions.assertEquals("https://www.example.com/%25-sign/%22-sign", new EdgeUrl("https://www.example.com//%-sign/\"-sign").toString());
+
+        Assertions.assertEquals("/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").getPath());
+        Assertions.assertEquals("https://www.example.com/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").toString());
+        Assertions.assertEquals("https://www.example.com/%22-sign", new EdgeUrl("https://www.example.com/%22-sign").toString());
+
+        Assertions.assertEquals("/\n \"huh\"", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").getPath());
+        Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").toString());
+        Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", new EdgeUrl("https://www.example.com/\n \"huh\"").toString());
+
+        Assertions.assertEquals("/wiki/Sámi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").getPath());
+        Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").toString());
+        Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", new EdgeUrl("https://en.wikipedia.org/wiki/Sámi").toString());
+
+        Assertions.assertEquals("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k", new EdgeUrl("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k").toString());
     }
 
     @Test
     void testParms() throws URISyntaxException {
-        System.out.println(new EdgeUrl("https://search.marginalia.nu/?id=123"));
-        System.out.println(new EdgeUrl("https://search.marginalia.nu/?t=123"));
-        System.out.println(new EdgeUrl("https://search.marginalia.nu/?v=123"));
-        System.out.println(new EdgeUrl("https://search.marginalia.nu/?m=123"));
-        System.out.println(new EdgeUrl("https://search.marginalia.nu/?follow=123"));
+        Assertions.assertEquals("id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").param);
+        Assertions.assertEquals("https://search.marginalia.nu/?id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").toString());
+
+        Assertions.assertEquals("t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").param);
+        Assertions.assertEquals("https://search.marginalia.nu/?t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").toString());
+
+        Assertions.assertEquals("v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").param);
+        Assertions.assertEquals("https://search.marginalia.nu/?v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").toString());
+
+        Assertions.assertEquals("id=1", new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").param);
+        Assertions.assertEquals("https://memex.marginalia.nu/showthread.php?id=1",
+                new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
+
+        Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").param);
+        Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").toString());
+
+        Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").param);
+        Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").toString());
+
+        Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?m=123").param);
+        Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?follow=123").param);
     }
 }
@@ -59,17 +59,14 @@ public class ProcessAdHocTaskHeartbeatImpl implements AutoCloseable, ProcessAdHo
     */
    @Override
    public void progress(String step, int stepProgress, int stepCount) {
        int lastProgress = this.progress;
        this.step = step;

        // off by one since we calculate the progress based on the number of steps,
        // and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
        // final progress being 80% and not 100%)
        this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);

        if (this.progress / 10 != lastProgress / 10) {
            logger.info("ProcessTask {} progress: {}%", taskBase, progress);
        }
    }

    /** Wrap a collection to provide heartbeat progress updates as it's iterated through */
    @Override
@@ -0,0 +1,59 @@
package nu.marginalia.process.control;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.process.ProcessConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.sql.SQLException;
import java.util.Objects;
import java.util.UUID;

@Singleton
public class ProcessEventLog {
    private final HikariDataSource dataSource;

    private final Logger logger = LoggerFactory.getLogger(ProcessEventLog.class);

    private final String serviceName;
    private final UUID instanceUuid;
    private final String serviceBase;

    @Inject
    public ProcessEventLog(HikariDataSource dataSource, ProcessConfiguration configuration) {
        this.dataSource = dataSource;

        this.serviceName = configuration.processName() + ":" + configuration.node();
        this.instanceUuid = configuration.instanceUuid();
        this.serviceBase = configuration.processName();

        logger.info("Starting service {} instance {}", serviceName, instanceUuid);

        logEvent("PCS-START", serviceName);
    }

    public void logEvent(Class<?> type, String message) {
        logEvent(type.getSimpleName(), message);
    }

    public void logEvent(String type, String message) {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("""
                     INSERT INTO SERVICE_EVENTLOG(SERVICE_NAME, SERVICE_BASE, INSTANCE, EVENT_TYPE, EVENT_MESSAGE)
                     VALUES (?, ?, ?, ?, ?)
                     """)) {
            stmt.setString(1, serviceName);
            stmt.setString(2, serviceBase);
            stmt.setString(3, instanceUuid.toString());
            stmt.setString(4, type);
            // substitute an empty string rather than NPE:ing on a null message
            stmt.setString(5, Objects.requireNonNullElse(message, ""));

            stmt.executeUpdate();
        }
        catch (SQLException ex) {
            logger.error("Failed to log event {}:{}", type, message, ex);
        }
    }
}
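Usage is then a one-liner from anywhere in a process; the constructor has already written the PCS-START row. The event type below is made up for illustration:

@Inject
ProcessEventLog eventLog;

void onCrawlCycleComplete() {
    eventLog.logEvent("CRAWL-CYCLE-DONE", "finished crawl cycle"); // hypothetical event type
}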
@@ -57,16 +57,13 @@ public class ServiceAdHocTaskHeartbeatImpl implements AutoCloseable, ServiceAdHo
      */
     @Override
     public void progress(String step, int stepProgress, int stepCount) {
+        int lastProgress = this.progress;
         this.step = step;
 
-        // off by one since we calculate the progress based on the number of steps,
-        // and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
-        // final progress being 80% and not 100%)
-
         this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);
 
-        logger.info("ServiceTask {} progress: {}%", taskBase, progress);
+        if (this.progress / 10 != lastProgress / 10) {
+            logger.info("ServiceTask {} progress: {}%", taskBase, progress);
+        }
     }
 
     public void shutDown() {
@@ -1,17 +1,23 @@
 package nu.marginalia.service.discovery;
 
-import nu.marginalia.service.discovery.monitor.*;
+import com.google.inject.ImplementedBy;
+import nu.marginalia.service.discovery.monitor.ServiceChangeMonitor;
+import nu.marginalia.service.discovery.monitor.ServiceMonitorIf;
 import nu.marginalia.service.discovery.property.ServiceEndpoint;
-import static nu.marginalia.service.discovery.property.ServiceEndpoint.*;
-
 import nu.marginalia.service.discovery.property.ServiceKey;
 
+import java.util.Collection;
 import java.util.List;
 import java.util.UUID;
+import java.util.function.BiConsumer;
+import java.util.function.Consumer;
+
+import static nu.marginalia.service.discovery.property.ServiceEndpoint.InstanceAddress;
 
 /** A service registry that allows services to register themselves and
  * be discovered by other services on the network.
  */
+@ImplementedBy(ZkServiceRegistry.class)
 public interface ServiceRegistryIf {
     /**
      * Register a service with the registry.
@@ -57,4 +63,9 @@ public interface ServiceRegistryIf {
      * </ul>
      * */
     void registerMonitor(ServiceMonitorIf monitor) throws Exception;
+
+    void registerProcess(String processName, int nodeId);
+    void deregisterProcess(String processName, int nodeId);
+    void watchProcess(String processName, int nodeId, Consumer<Boolean> callback) throws Exception;
+    void watchProcessAnyNode(String processName, Collection<Integer> nodes, BiConsumer<Boolean, Integer> callback) throws Exception;
 }
@@ -13,11 +13,10 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.nio.charset.StandardCharsets;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Random;
-import java.util.UUID;
+import java.util.*;
 import java.util.concurrent.TimeUnit;
+import java.util.function.BiConsumer;
+import java.util.function.Consumer;
 
 import static nu.marginalia.service.discovery.property.ServiceEndpoint.InstanceAddress;
 
@@ -256,6 +255,90 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
                .forPath("/running-instances");
    }

    @Override
    public void registerProcess(String processName, int nodeId) {
        String path = "/process-locks/" + processName + "/" + nodeId;
        try {
            curatorFramework.create()
                    .creatingParentsIfNeeded()
                    .withMode(CreateMode.EPHEMERAL)
                    .forPath(path);
            livenessPaths.add(path);
        }
        catch (Exception ex) {
            logger.error("Failed to register process {} on node {}", processName, nodeId, ex);
        }
    }

    @Override
    public void deregisterProcess(String processName, int nodeId) {
        String path = "/process-locks/" + processName + "/" + nodeId;
        try {
            curatorFramework.delete().forPath(path);
            livenessPaths.remove(path);
        }
        catch (Exception ex) {
            logger.error("Failed to deregister process {} on node {}", processName, nodeId, ex);
        }
    }

    @Override
    public void watchProcess(String processName, int nodeId, Consumer<Boolean> callback) throws Exception {
        String path = "/process-locks/" + processName + "/" + nodeId;

        // first check if the path exists and call the callback accordingly
        if (curatorFramework.checkExists().forPath(path) != null) {
            callback.accept(true);
        }
        else {
            callback.accept(false);
        }

        curatorFramework.watchers().add()
                .usingWatcher((Watcher) change -> {
                    Watcher.Event.EventType type = change.getType();

                    if (type == Watcher.Event.EventType.NodeCreated) {
                        callback.accept(true);
                    }
                    if (type == Watcher.Event.EventType.NodeDeleted) {
                        callback.accept(false);
                    }
                })
                .forPath(path);
    }

    @Override
    public void watchProcessAnyNode(String processName, Collection<Integer> nodes, BiConsumer<Boolean, Integer> callback) throws Exception {
        for (int node : nodes) {
            String path = "/process-locks/" + processName + "/" + node;

            // first check if the path exists and call the callback accordingly
            if (curatorFramework.checkExists().forPath(path) != null) {
                callback.accept(true, node);
            }
            else {
                callback.accept(false, node);
            }

            curatorFramework.watchers().add()
                    .usingWatcher((Watcher) change -> {
                        Watcher.Event.EventType type = change.getType();

                        if (type == Watcher.Event.EventType.NodeCreated) {
                            callback.accept(true, node);
                        }
                        if (type == Watcher.Event.EventType.NodeDeleted) {
                            callback.accept(false, node);
                        }
                    })
                    .forPath(path);
        }
    }

    /* Exposed for tests */
    public synchronized void shutDown() {
        if (stopped)
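Because the lock nodes are EPHEMERAL, a process's entry disappears automatically if its ZooKeeper session dies, so watchers observe crashes as well as orderly shutdowns. A hedged sketch of both sides of the API introduced above; the process name, node ids and helper methods are illustrative:

// Process side: announce liveness for the duration of the run.
registry.registerProcess("crawler", nodeId);
try {
    runProcess(); // hypothetical main loop
}
finally {
    registry.deregisterProcess("crawler", nodeId);
}

// Observer side: track liveness of the same process across a set of nodes.
registry.watchProcessAnyNode("crawler", List.of(1, 2, 3),
        (alive, node) -> logger.info("crawler on node {}: {}", node, alive ? "up" : "down"));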
@@ -122,6 +122,11 @@ public class JoobyService {
         // single digit percentage difference since HTML already compresses very well with level = 1.
         options.setCompressionLevel(1);
 
+        // Set a cap on the number of worker threads, as Jooby's default value does not seem to consider
+        // multi-tenant servers with high thread counts, and spins up an exorbitant number of threads in that
+        // scenario
+        options.setWorkerThreads(Math.min(128, options.getWorkerThreads()));
+
         jooby.setServerOptions(options);
 
@@ -3,11 +3,18 @@
         <Console name="Console" target="SYSTEM_OUT">
             <PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
             <Filters>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
                 <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
                 <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
                 <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
             </Filters>
         </Console>
+        <Console name="ProcessConsole" target="SYSTEM_OUT">
+            <PatternLayout pattern="%style{P}{FG_Cyan} %msg%n"/>
+            <Filters>
+                <MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
+            </Filters>
+        </Console>
         <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
                      ignoreExceptions="false">
             <JSONLayout compact="true" eventEol="true" properties="true" stacktraceAsString="true" includeTimeMillis="true"/>
@@ -15,6 +22,7 @@
                 <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
                 <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
                 <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
             </Filters>
             <SizeBasedTriggeringPolicy size="10MB" />
         </RollingFile>
@@ -31,9 +39,11 @@
     </Appenders>
     <Loggers>
         <Logger name="org.apache.zookeeper" level="WARN" />
+        <Logger name="org.apache.pdfbox" level="ERROR" />
+        <Logger name="org.apache.fontbox.ttf" level="ERROR" />
         <Root level="info">
             <AppenderRef ref="Console"/>
             <AppenderRef ref="ProcessConsole"/>
             <AppenderRef ref="LogToFile"/>
         </Root>
     </Loggers>
@@ -1,13 +1,51 @@
 <Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
     <Appenders>
-        <Console name="Console" target="SYSTEM_OUT">
-            <PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
+        <Console name="ConsoleInfo" target="SYSTEM_OUT">
+            <PatternLayout pattern="- %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
             <Filters>
+                <LevelMatchFilter level="INFO" onMatch="ALLOW" onMismatch="DENY"/>
                 <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
                 <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
                 <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
                 <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
             </Filters>
         </Console>
+        <Console name="ConsoleWarn" target="SYSTEM_OUT">
+            <PatternLayout pattern="⚠ %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="WARN" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleError" target="SYSTEM_OUT">
+            <PatternLayout pattern="🔥 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="ERROR" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleFatal" target="SYSTEM_OUT">
+            <PatternLayout pattern="💀 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="FATAL" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ProcessConsole" target="SYSTEM_OUT">
+            <PatternLayout pattern="%style{%msg%n}{FG_Cyan}"/>
+            <Filters>
+                <MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
+            </Filters>
+        </Console>
         <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
                      ignoreExceptions="false">
             <PatternLayout>
@@ -34,9 +72,14 @@
     </Appenders>
     <Loggers>
         <Logger name="org.apache.zookeeper" level="WARN" />
+        <Logger name="org.apache.pdfbox" level="ERROR" />
+        <Logger name="org.apache.fontbox.ttf" level="ERROR" />
         <Root level="info">
-            <AppenderRef ref="Console"/>
+            <AppenderRef ref="ConsoleInfo"/>
+            <AppenderRef ref="ConsoleWarn"/>
+            <AppenderRef ref="ConsoleError"/>
+            <AppenderRef ref="ConsoleFatal"/>
             <AppenderRef ref="ProcessConsole"/>
             <AppenderRef ref="LogToFile"/>
         </Root>
     </Loggers>
@@ -1,15 +1,50 @@
 <Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
     <Appenders>
-        <Console name="Console" target="SYSTEM_OUT">
-            <PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
+        <Console name="ConsoleInfo" target="SYSTEM_OUT">
+            <PatternLayout pattern="- %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
             <Filters>
+                <LevelMatchFilter level="INFO" onMatch="ALLOW" onMismatch="DENY"/>
                 <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
             </Filters>
         </Console>
+        <Console name="ConsoleWarn" target="SYSTEM_OUT">
+            <PatternLayout pattern="⚠ %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="WARN" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleError" target="SYSTEM_OUT">
+            <PatternLayout pattern="🔥 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="ERROR" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleFatal" target="SYSTEM_OUT">
+            <PatternLayout pattern="💀 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="FATAL" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ProcessConsole" target="SYSTEM_OUT">
+            <PatternLayout pattern="%style{%msg%n}{FG_Cyan}"/>
+            <Filters>
+                <MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
+            </Filters>
+        </Console>
     </Appenders>
     <Loggers>
         <Logger name="org.apache.zookeeper" level="WARN" />
+        <Logger name="org.apache.pdfbox" level="ERROR" />
+        <Logger name="org.apache.fontbox.ttf" level="ERROR" />
         <Root level="info">
-            <AppenderRef ref="Console"/>
             <AppenderRef ref="LogToFile"/>
+            <AppenderRef ref="ConsoleInfo"/>
+            <AppenderRef ref="ConsoleWarn"/>
+            <AppenderRef ref="ConsoleError"/>
+            <AppenderRef ref="ConsoleFatal"/>
+            <AppenderRef ref="ProcessConsole"/>
         </Root>
     </Loggers>
 </Configuration>
@@ -48,12 +48,13 @@ public class ExecutorExportClient {
        return msgId;
    }

    public void exportSampleData(int node, FileStorageId fid, int size, String name) {
    public void exportSampleData(int node, FileStorageId fid, int size, String ctFilter, String name) {
        channelPool.call(ExecutorExportApiBlockingStub::exportSampleData)
                .forNode(node)
                .run(RpcExportSampleData.newBuilder()
                        .setFileStorageId(fid.id())
                        .setSize(size)
                        .setCtFilter(ctFilter)
                        .setName(name)
                        .build());
    }
@@ -100,6 +100,7 @@ message RpcExportSampleData {
    int64 fileStorageId = 1;
    int32 size = 2;
    string name = 3;
    string ctFilter = 4;
}
message RpcDownloadSampleData {
    string sampleSet = 1;
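For orientation, a minimal sketch of driving the extended export API from the client side. The node id, storage id, sample size, and the "text/html" filter value below are illustrative, not taken from the repository:

    // Hypothetical admin-side helper: exports a 1000-document sample from
    // node 1, keeping only crawl responses whose Content-Type is text/html.
    static void exportHtmlSample(ExecutorExportClient client, FileStorageId crawlData) {
        client.exportSampleData(1, crawlData, 1000, "text/html", "html-sample");
    }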
@@ -19,6 +19,7 @@ dependencies {
    implementation project(':code:processes:crawling-process')
    implementation project(':code:processes:live-crawling-process')
    implementation project(':code:processes:loading-process')
    implementation project(':code:processes:ping-process')
    implementation project(':code:processes:converting-process')
    implementation project(':code:processes:index-constructor-process')

@@ -37,6 +38,7 @@ dependencies {
    implementation project(':code:functions:link-graph:api')
    implementation project(':code:functions:live-capture:api')
    implementation project(':code:functions:search-query')
    implementation project(':code:functions:nsfw-domain-filter')
    implementation project(':code:execution:api')

    implementation project(':code:processes:crawling-process:model')
@@ -6,11 +6,13 @@ import java.util.Set;

public enum ExecutorActor {
    PREC_EXPORT_ALL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
    SYNC_NSFW_LISTS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),

    CRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
    RECRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
    RECRAWL_SINGLE_DOMAIN(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
    PROC_CRAWLER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
    PROC_PING_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.REALTIME),
    PROC_EXPORT_TASKS_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
    ADJACENCY_CALCULATION(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
    EXPORT_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
@@ -35,7 +37,8 @@ public enum ExecutorActor {
    LIVE_CRAWL(NodeProfile.REALTIME),
    PROC_LIVE_CRAWL_SPAWNER(NodeProfile.REALTIME),
    SCRAPE_FEEDS(NodeProfile.REALTIME),
    UPDATE_RSS(NodeProfile.REALTIME);
    UPDATE_RSS(NodeProfile.REALTIME)
    ;

    public String id() {
        return "fsm:" + name().toLowerCase();
@@ -49,6 +49,7 @@ public class ExecutorActorControlService {
            RecrawlSingleDomainActor recrawlSingleDomainActor,
            RestoreBackupActor restoreBackupActor,
            ConverterMonitorActor converterMonitorFSM,
            PingMonitorActor pingMonitorActor,
            CrawlerMonitorActor crawlerMonitorActor,
            LiveCrawlerMonitorActor liveCrawlerMonitorActor,
            LoaderMonitorActor loaderMonitor,
@@ -68,6 +69,7 @@ public class ExecutorActorControlService {
            ExecutorActorStateMachines stateMachines,
            MigrateCrawlDataActor migrateCrawlDataActor,
            ExportAllPrecessionActor exportAllPrecessionActor,
            UpdateNsfwFiltersActor updateNsfwFiltersActor,
            UpdateRssActor updateRssActor) throws SQLException {
        this.messageQueueFactory = messageQueueFactory;
        this.eventLog = baseServiceParams.eventLog;
@@ -88,6 +90,7 @@ public class ExecutorActorControlService {
        register(ExecutorActor.PROC_CONVERTER_SPAWNER, converterMonitorFSM);
        register(ExecutorActor.PROC_LOADER_SPAWNER, loaderMonitor);
        register(ExecutorActor.PROC_CRAWLER_SPAWNER, crawlerMonitorActor);
        register(ExecutorActor.PROC_PING_SPAWNER, pingMonitorActor);
        register(ExecutorActor.PROC_LIVE_CRAWL_SPAWNER, liveCrawlerMonitorActor);
        register(ExecutorActor.PROC_EXPORT_TASKS_SPAWNER, exportTasksMonitorActor);

@@ -109,6 +112,7 @@ public class ExecutorActorControlService {
        register(ExecutorActor.UPDATE_RSS, updateRssActor);

        register(ExecutorActor.MIGRATE_CRAWL_DATA, migrateCrawlDataActor);
        register(ExecutorActor.SYNC_NSFW_LISTS, updateNsfwFiltersActor);

        if (serviceConfiguration.node() == 1) {
            register(ExecutorActor.PREC_EXPORT_ALL, exportAllPrecessionActor);
@@ -0,0 +1,186 @@
package nu.marginalia.actor.proc;

import com.google.gson.Gson;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorResumeBehavior;
import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.actor.state.Resume;
import nu.marginalia.actor.state.Terminal;
import nu.marginalia.mq.MqMessageState;
import nu.marginalia.mq.persistence.MqMessageHandlerRegistry;
import nu.marginalia.mq.persistence.MqPersistence;
import nu.marginalia.mqapi.ProcessInboxNames;
import nu.marginalia.mqapi.ping.PingRequest;
import nu.marginalia.nodecfg.NodeConfigurationService;
import nu.marginalia.nodecfg.model.NodeProfile;
import nu.marginalia.process.ProcessService;
import nu.marginalia.service.module.ServiceConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.sql.SQLException;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;

@Singleton
public class PingMonitorActor extends RecordActorPrototype {

    private final MqPersistence persistence;
    private final ProcessService processService;

    private final Logger logger = LoggerFactory.getLogger(getClass());

    public static final int MAX_ATTEMPTS = 3;
    private final String inboxName;
    private final ProcessService.ProcessId processId;
    private final ExecutorService executorService = Executors.newSingleThreadExecutor();
    private final int node;
    private final boolean isPrimaryNode;
    private final Gson gson;

    public record Initial() implements ActorStep {}
    @Resume(behavior = ActorResumeBehavior.RETRY)
    public record Monitor(int errorAttempts) implements ActorStep {}
    @Resume(behavior = ActorResumeBehavior.RESTART)
    public record Run(int attempts) implements ActorStep {}
    @Terminal
    public record Aborted() implements ActorStep {}

    @Override
    public ActorStep transition(ActorStep self) throws Exception {
        return switch (self) {
            case Initial i -> {
                PingRequest request = new PingRequest(isPrimaryNode ? "primary" : "secondary");

                persistence.sendNewMessage(inboxName, null, null,
                        "PingRequest",
                        gson.toJson(request),
                        null);

                yield new Monitor(0);
            }
            case Monitor(int errorAttempts) -> {
                for (;;) {
                    var messages = persistence.eavesdrop(inboxName, 1);

                    if (messages.isEmpty() && !processService.isRunning(processId)) {
                        synchronized (processId) {
                            processId.wait(5000);
                        }

                        if (errorAttempts > 0) { // Reset the error counter if there is silence in the inbox
                            yield new Monitor(0);
                        }
                        // else continue
                    } else {
                        // Special: Associate this thread with the message so that we can get tracking
                        MqMessageHandlerRegistry.register(messages.getFirst().msgId());

                        yield new Run(0);
                    }
                }
            }
            case Run(int attempts) -> {
                try {
                    long startTime = System.currentTimeMillis();
                    var exec = new TaskExecution();
                    long endTime = System.currentTimeMillis();

                    if (exec.isError()) {
                        if (attempts < MAX_ATTEMPTS)
                            yield new Run(attempts + 1);
                        else
                            yield new Error();
                    }
                    else if (endTime - startTime < TimeUnit.SECONDS.toMillis(1)) {
                        // To avoid boot loops, we transition to error if the process
                        // didn't run for longer than 1 second. This might happen if
                        // the process crashes before it can reach the heartbeat and inbox
                        // stages of execution. In this case it would not report having acted
                        // on its message, and the process would be restarted forever without
                        // the attempts counter incrementing.
                        yield new Error("Process terminated within 1 second of starting");
                    }
                }
                catch (InterruptedException ex) {
                    // We get this exception when the process is cancelled by the user

                    processService.kill(processId);
                    setCurrentMessageToDead();

                    yield new Aborted();
                }

                yield new Monitor(attempts);
            }
            default -> new Error();
        };
    }

    public String describe() {
        return "Spawns a(n) " + processId + " process and monitors its inbox for messages";
    }

    @Inject
    public PingMonitorActor(Gson gson,
                            NodeConfigurationService nodeConfigurationService,
                            ServiceConfiguration configuration,
                            MqPersistence persistence,
                            ProcessService processService) throws SQLException {
        super(gson);
        this.gson = gson;
        this.node = configuration.node();
        this.persistence = persistence;
        this.processService = processService;
        this.inboxName = ProcessInboxNames.PING_INBOX + ":" + node;
        this.processId = ProcessService.ProcessId.PING;

        this.isPrimaryNode = Set.of(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED)
                .contains(nodeConfigurationService.get(node).profile());
    }

    /** Sets the message to dead in the database to avoid
     * the service respawning on the same task when we
     * re-enable this actor */
    private void setCurrentMessageToDead() {
        try {
            var messages = persistence.eavesdrop(inboxName, 1);

            if (messages.isEmpty()) // Possibly a race condition where the task is already finished
                return;

            var theMessage = messages.iterator().next();
            persistence.updateMessageState(theMessage.msgId(), MqMessageState.DEAD);
        }
        catch (SQLException ex) {
            logger.error("Tried but failed to set the message for " + processId + " to dead", ex);
        }
    }

    /** Encapsulates the execution of the process in a separate thread so that
     * we can interrupt the thread if the process is cancelled */
    private class TaskExecution {
        private final AtomicBoolean error = new AtomicBoolean(false);
        public TaskExecution() throws ExecutionException, InterruptedException {
            // Run this call in a separate thread so that this thread can be interrupted waiting for it
            executorService.submit(() -> {
                try {
                    processService.trigger(processId);
                } catch (Exception e) {
                    logger.warn("Error in triggering process", e);
                    error.set(true);
                }
            }).get(); // Wait for the process to start
        }

        public boolean isError() {
            return error.get();
        }
    }
}
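The boot-loop guard in Run above generalizes to any supervised child task: measure wall-clock runtime around the spawn and treat a near-instant exit as a hard failure rather than another retry. A standalone sketch of the same idea, with illustrative names not taken from the codebase:

    import java.util.concurrent.TimeUnit;

    /** Illustrative restart guard (not from the repository): refuses to
     *  respawn a task that exits almost immediately, avoiding the tight
     *  crash/restart loop PingMonitorActor protects against. */
    class RestartGuard {
        static final int MAX_ATTEMPTS = 3;
        static final long MIN_RUNTIME_MS = TimeUnit.SECONDS.toMillis(1);

        static void supervise(Runnable task) {
            for (int attempts = 0; attempts < MAX_ATTEMPTS; attempts++) {
                long start = System.currentTimeMillis();
                try {
                    task.run();
                } catch (RuntimeException e) {
                    continue; // reported failure: bounded retry
                }
                long elapsed = System.currentTimeMillis() - start;
                if (elapsed < MIN_RUNTIME_MS) {
                    // Exited before doing real work, likely a crash before the
                    // heartbeat/inbox stage; give up instead of looping forever
                    throw new IllegalStateException("task terminated within 1 second of starting");
                }
                return; // normal completion
            }
            throw new IllegalStateException("task failed " + MAX_ATTEMPTS + " times");
        }
    }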
@@ -26,32 +26,32 @@ public class ExportSampleDataActor extends RecordActorPrototype {
    private final MqOutbox exportTasksOutbox;
    private final Logger logger = LoggerFactory.getLogger(getClass());

    public record Export(FileStorageId crawlId, int size, String name) implements ActorStep {}
    public record Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) implements ActorStep {
        public Run(FileStorageId crawlId, FileStorageId destId, int size, String name) {
            this(crawlId, destId, size, name, -1);
    public record Export(FileStorageId crawlId, int size, String ctFilter, String name) implements ActorStep {}
    public record Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name, long msgId) implements ActorStep {
        public Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name) {
            this(crawlId, destId, size, ctFilter, name, -1);
        }
    }

    @Override
    public ActorStep transition(ActorStep self) throws Exception {
        return switch(self) {
            case Export(FileStorageId crawlId, int size, String name) -> {
            case Export(FileStorageId crawlId, int size, String ctFilter, String name) -> {
                var storage = storageService.allocateStorage(FileStorageType.EXPORT,
                        "crawl-sample-export",
                        "Crawl Data Sample " + name + "/" + size + " " + LocalDateTime.now()
                );

                if (storage == null) yield new Error("Bad storage id");
                yield new Run(crawlId, storage.id(), size, name);
                yield new Run(crawlId, storage.id(), size, ctFilter, name);
            }
            case Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) when msgId < 0 -> {
            case Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name, long msgId) when msgId < 0 -> {
                storageService.setFileStorageState(destId, FileStorageState.NEW);

                long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, size, name));
                yield new Run(crawlId, destId, size, name, newMsgId);
                long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, ctFilter, size, name));
                yield new Run(crawlId, destId, size, ctFilter, name, newMsgId);
            }
            case Run(_, FileStorageId destId, _, _, _, long msgId) -> {
                var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);

                if (rsp.state() != MqMessageState.OK) {
@@ -70,7 +70,7 @@ public class ExportSampleDataActor extends RecordActorPrototype {

    @Override
    public String describe() {
        return "Export RSS/Atom feeds from crawl data";
        return "Export sample crawl data";
    }

    @Inject
@@ -0,0 +1,53 @@
package nu.marginalia.actor.task;

import com.google.gson.Gson;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.nsfw.NsfwDomainFilter;
import nu.marginalia.service.module.ServiceConfiguration;

@Singleton
public class UpdateNsfwFiltersActor extends RecordActorPrototype {
    private final ServiceConfiguration serviceConfiguration;
    private final NsfwDomainFilter nsfwDomainFilter;

    public record Initial() implements ActorStep {}
    public record Run() implements ActorStep {}

    @Override
    public ActorStep transition(ActorStep self) throws Exception {
        return switch(self) {
            case Initial() -> {
                if (serviceConfiguration.node() != 1) {
                    yield new Error("This actor can only run on node 1");
                }
                else {
                    yield new Run();
                }
            }
            case Run() -> {
                nsfwDomainFilter.fetchLists();
                yield new End();
            }
            default -> new Error();
        };
    }

    @Override
    public String describe() {
        return "Sync NSFW filters";
    }

    @Inject
    public UpdateNsfwFiltersActor(Gson gson,
                                  ServiceConfiguration serviceConfiguration,
                                  NsfwDomainFilter nsfwDomainFilter)
    {
        super(gson);
        this.serviceConfiguration = serviceConfiguration;
        this.nsfwDomainFilter = nsfwDomainFilter;
    }

}
@@ -49,6 +49,7 @@ public class ExecutorExportGrpcService
                new ExportSampleDataActor.Export(
                        FileStorageId.of(request.getFileStorageId()),
                        request.getSize(),
                        request.getCtFilter(),
                        request.getName()
                )
        );
@@ -8,6 +8,7 @@ import nu.marginalia.crawl.CrawlerMain;
import nu.marginalia.index.IndexConstructorMain;
import nu.marginalia.livecrawler.LiveCrawlerMain;
import nu.marginalia.loading.LoaderMain;
import nu.marginalia.ping.PingMain;
import nu.marginalia.service.control.ServiceEventLog;
import nu.marginalia.service.server.BaseServiceParams;
import nu.marginalia.task.ExportTasksMain;
@@ -41,6 +42,7 @@ public class ProcessService {
        return switch (id) {
            case "converter" -> ProcessId.CONVERTER;
            case "crawler" -> ProcessId.CRAWLER;
            case "ping" -> ProcessId.PING;
            case "loader" -> ProcessId.LOADER;
            case "export-tasks" -> ProcessId.EXPORT_TASKS;
            case "index-constructor" -> ProcessId.INDEX_CONSTRUCTOR;
@@ -50,6 +52,7 @@ public class ProcessService {

    public enum ProcessId {
        CRAWLER(CrawlerMain.class),
        PING(PingMain.class),
        LIVE_CRAWLER(LiveCrawlerMain.class),
        CONVERTER(ConverterMain.class),
        LOADER(LoaderMain.class),
@@ -68,6 +71,7 @@ public class ProcessService {
            case LIVE_CRAWLER -> "LIVE_CRAWLER_PROCESS_OPTS";
            case CONVERTER -> "CONVERTER_PROCESS_OPTS";
            case LOADER -> "LOADER_PROCESS_OPTS";
            case PING -> "PING_PROCESS_OPTS";
            case INDEX_CONSTRUCTOR -> "INDEX_CONSTRUCTION_PROCESS_OPTS";
            case EXPORT_TASKS -> "EXPORT_TASKS_PROCESS_OPTS";
        };
@@ -27,10 +27,12 @@ public class DbBrowseDomainsRandom {
    public List<BrowseResult> getRandomDomains(int count, DomainBlacklist blacklist, int set) {

        final String q = """
                SELECT DOMAIN_ID, DOMAIN_NAME, INDEXED
                SELECT EC_RANDOM_DOMAINS.DOMAIN_ID, DOMAIN_NAME, INDEXED
                FROM EC_RANDOM_DOMAINS
                INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID
                LEFT JOIN DOMAIN_AVAILABILITY_INFORMATION DAI ON DAI.DOMAIN_ID=EC_RANDOM_DOMAINS.DOMAIN_ID
                WHERE STATE<2
                AND SERVER_AVAILABLE
                AND DOMAIN_SET=?
                AND DOMAIN_ALIAS IS NULL
                ORDER BY RAND()
@@ -25,9 +25,9 @@ dependencies {

    implementation project(':code:execution:api')
    implementation project(':code:processes:crawling-process:ft-content-type')
    implementation project(':third-party:rssreader')

    implementation libs.jsoup
    implementation project(':third-party:rssreader')
    implementation libs.opencsv
    implementation libs.slop
    implementation libs.sqlite
@@ -57,8 +57,6 @@ dependencies {
    implementation libs.bundles.gson
    implementation libs.bundles.mariadb


    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
@@ -0,0 +1,126 @@
package nu.marginalia.domsample;

import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import jakarta.inject.Named;
import nu.marginalia.domsample.db.DomSampleDb;
import nu.marginalia.livecapture.BrowserlessClient;
import nu.marginalia.service.module.ServiceConfiguration;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.net.URI;
import java.net.URISyntaxException;
import java.time.Duration;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.TimeUnit;

public class DomSampleService {
    private final DomSampleDb db;
    private final HikariDataSource mariadbDataSource;
    private final URI browserlessURI;

    private static final Logger logger = LoggerFactory.getLogger(DomSampleService.class);

    @Inject
    public DomSampleService(DomSampleDb db,
                            HikariDataSource mariadbDataSource,
                            @Named("browserless-uri") String browserlessAddress,
                            ServiceConfiguration serviceConfiguration)
            throws URISyntaxException
    {
        this.db = db;
        this.mariadbDataSource = mariadbDataSource;

        if (StringUtils.isEmpty(browserlessAddress) || serviceConfiguration.node() > 1) {
            logger.warn("Live capture service will not run");
            browserlessURI = null;
        }
        else {
            browserlessURI = new URI(browserlessAddress);
        }
    }

    public void start() {
        if (browserlessURI == null) {
            logger.warn("DomSampleService is not enabled due to missing browserless URI or multi-node configuration");
            return;
        }

        Thread.ofPlatform().daemon().start(this::run);
    }

    public void syncDomains() {
        Set<String> dbDomains = new HashSet<>();

        logger.info("Fetching domains from database...");

        try (var conn = mariadbDataSource.getConnection();
             var stmt = conn.prepareStatement("""
                     SELECT DOMAIN_NAME
                     FROM EC_DOMAIN
                     WHERE NODE_AFFINITY>0
                     """)
        ) {
            var rs = stmt.executeQuery();
            while (rs.next()) {
                dbDomains.add(rs.getString("DOMAIN_NAME"));
            }
        } catch (Exception e) {
            throw new RuntimeException("Failed to sync domains", e);
        }

        logger.info("Found {} domains in database", dbDomains.size());

        db.syncDomains(dbDomains);

        logger.info("Synced domains to sqlite");
    }

    public void run() {

        try (var client = new BrowserlessClient(browserlessURI)) {

            while (!Thread.currentThread().isInterrupted()) {

                try {
                    // Grace sleep in case we're operating on an empty domain list
                    TimeUnit.SECONDS.sleep(15);

                    syncDomains();
                    var domains = db.getScheduledDomains();

                    for (var domain : domains) {
                        updateDomain(client, domain);
                    }
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    logger.info("DomSampleService interrupted, stopping...");
                    return;
                } catch (Exception e) {
                    logger.error("Error in DomSampleService run loop", e);
                }
            }

        }
    }

    private void updateDomain(BrowserlessClient client, String domain) {
        var rootUrl = "https://" + domain + "/";
        try {
            var content = client.annotatedContent(rootUrl, new BrowserlessClient.GotoOptions("load", Duration.ofSeconds(10).toMillis()));

            if (content.isPresent()) {
                db.saveSample(domain, rootUrl, content.get());
            }
        } catch (Exception e) {
            logger.error("Failed to process domain: " + domain, e);
        }
        finally {
            db.flagDomainAsFetched(domain);
        }
    }

}
@@ -0,0 +1,174 @@
package nu.marginalia.domsample.db;

import nu.marginalia.WmsaHome;
import org.jsoup.Jsoup;

import java.nio.file.Path;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.*;

public class DomSampleDb implements AutoCloseable {
    private static final String dbFileName = "dom-sample.db";
    private final Connection connection;

    public DomSampleDb() throws SQLException {
        this(WmsaHome.getDataPath().resolve(dbFileName));
    }

    public DomSampleDb(Path dbPath) throws SQLException {
        String dbUrl = "jdbc:sqlite:" + dbPath.toAbsolutePath();

        connection = DriverManager.getConnection(dbUrl);

        try (var stmt = connection.createStatement()) {
            stmt.executeUpdate("CREATE TABLE IF NOT EXISTS samples (url TEXT PRIMARY KEY, domain TEXT, sample BLOB, requests BLOB, accepted_popover BOOLEAN DEFAULT FALSE)");
            stmt.executeUpdate("CREATE INDEX IF NOT EXISTS domain_index ON samples (domain)");
            stmt.executeUpdate("CREATE TABLE IF NOT EXISTS schedule (domain TEXT PRIMARY KEY, last_fetch TIMESTAMP DEFAULT NULL)");
            stmt.execute("PRAGMA journal_mode=WAL");
        }

    }

    public void syncDomains(Set<String> domains) {
        Set<String> currentDomains = new HashSet<>();
        try (var stmt = connection.prepareStatement("SELECT domain FROM schedule")) {
            var rs = stmt.executeQuery();
            while (rs.next()) {
                currentDomains.add(rs.getString("domain"));
            }
        } catch (SQLException e) {
            throw new RuntimeException("Failed to sync domains", e);
        }

        Set<String> toRemove = new HashSet<>(currentDomains);
        Set<String> toAdd = new HashSet<>(domains);

        toRemove.removeAll(domains);
        toAdd.removeAll(currentDomains);

        try (var removeStmt = connection.prepareStatement("DELETE FROM schedule WHERE domain = ?");
             var addStmt = connection.prepareStatement("INSERT OR IGNORE INTO schedule (domain) VALUES (?)")
        ) {
            for (String domain : toRemove) {
                removeStmt.setString(1, domain);
                removeStmt.executeUpdate();
            }

            for (String domain : toAdd) {
                addStmt.setString(1, domain);
                addStmt.executeUpdate();
            }
        } catch (SQLException e) {
            throw new RuntimeException("Failed to remove domains", e);
        }
    }

    public List<String> getScheduledDomains() {
        List<String> domains = new ArrayList<>();
        try (var stmt = connection.prepareStatement("SELECT domain FROM schedule ORDER BY last_fetch IS NULL DESC, last_fetch ASC")) {
            var rs = stmt.executeQuery();
            while (rs.next()) {
                domains.add(rs.getString("domain"));
            }
        } catch (SQLException e) {
            throw new RuntimeException("Failed to get scheduled domains", e);
        }
        return domains;
    }

    public void flagDomainAsFetched(String domain) {
        try (var stmt = connection.prepareStatement("INSERT OR REPLACE INTO schedule (domain, last_fetch) VALUES (?, CURRENT_TIMESTAMP)")) {
            stmt.setString(1, domain);
            stmt.executeUpdate();
        } catch (SQLException e) {
            throw new RuntimeException("Failed to flag domain as fetched", e);
        }
    }


    public record Sample(String url, String domain, String sample, String requests, boolean acceptedPopover) {}

    public List<Sample> getSamples(String domain) throws SQLException {
        List<Sample> samples = new ArrayList<>();

        try (var stmt = connection.prepareStatement("""
                SELECT url, sample, requests, accepted_popover
                FROM samples
                WHERE domain = ?
                """))
        {
            stmt.setString(1, domain);
            var rs = stmt.executeQuery();
            while (rs.next()) {
                samples.add(
                        new Sample(
                                rs.getString("url"),
                                domain,
                                rs.getString("sample"),
                                rs.getString("requests"),
                                rs.getBoolean("accepted_popover")
                        )
                );
            }
        }
        return samples;
    }

    public void saveSample(String domain, String url, String rawContent) throws SQLException {
        var doc = Jsoup.parse(rawContent);

        var networkRequests = doc.getElementById("marginalia-network-requests");

        boolean acceptedPopover = false;

        StringBuilder requestTsv = new StringBuilder();
        if (networkRequests != null) {

            acceptedPopover = !networkRequests.getElementsByClass("marginalia-agreed-cookies").isEmpty();

            for (var request : networkRequests.getElementsByClass("network-request")) {
                String method = request.attr("data-method");
                String urlAttr = request.attr("data-url");
                String timestamp = request.attr("data-timestamp");

                requestTsv
                        .append(method)
                        .append('\t')
                        .append(timestamp)
                        .append('\t')
                        .append(urlAttr.replace('\n', ' '))
                        .append("\n");
            }

            networkRequests.remove();
        }

        doc.body().removeAttr("id");

        String sample = doc.html();

        saveSampleRaw(domain, url, sample, requestTsv.toString().trim(), acceptedPopover);

    }

    public void saveSampleRaw(String domain, String url, String sample, String requests, boolean acceptedPopover) throws SQLException {
        try (var stmt = connection.prepareStatement("""
                INSERT OR REPLACE
                INTO samples (domain, url, sample, requests, accepted_popover)
                VALUES (?, ?, ?, ?, ?)
                """)) {
            stmt.setString(1, domain);
            stmt.setString(2, url);
            stmt.setString(3, sample);
            stmt.setString(4, requests);
            stmt.setBoolean(5, acceptedPopover);
            stmt.executeUpdate();
        }
    }

    public void close() throws SQLException {
        connection.close();
    }
}
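A minimal usage sketch of the new sample database, using only the API shown above; the domain, URL, and markup values are illustrative:

    import nu.marginalia.domsample.db.DomSampleDb;

    import java.nio.file.Files;
    import java.util.Set;

    class DomSampleDbDemo {
        public static void main(String[] args) throws Exception {
            var dbPath = Files.createTempDirectory("demo").resolve("dom-sample.db");
            try (var db = new DomSampleDb(dbPath)) {
                // Register the domains we want sampled; sync is idempotent
                db.syncDomains(Set.of("example.com"));

                // Store a raw sample and read it back
                db.saveSampleRaw("example.com", "https://example.com/", "<html></html>", "", false);
                for (var sample : db.getSamples("example.com")) {
                    System.out.println(sample.url() + " popover=" + sample.acceptedPopover());
                }

                // Mark the domain as fetched so it moves to the back of the schedule
                db.flagDomainAsFetched("example.com");
            }
        }
    }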
@@ -8,10 +8,13 @@ import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.URI;
import java.net.URLEncoder;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.List;
import java.util.Map;
import java.util.Optional;

@@ -60,6 +63,42 @@ public class BrowserlessClient implements AutoCloseable {
        return Optional.of(rsp.body());
    }

    /** Fetches content with a marginalia hack extension loaded that decorates the DOM with attributes
     * derived from certain CSS properties, to make it easier to identify popovers and other nuisance elements.
     */
    public Optional<String> annotatedContent(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
        Map<String, Object> requestData = Map.of(
                "url", url,
                "userAgent", userAgent,
                "gotoOptions", gotoOptions,
                "waitForSelector", Map.of("selector", "#marginaliahack", "timeout", 15000)
        );

        // Launch parameters for the browserless instance to load the extension
        Map<String, Object> launchParameters = Map.of(
                "args", List.of("--load-extension=/dom-export")
        );

        String launchParametersStr = URLEncoder.encode(gson.toJson(launchParameters), StandardCharsets.UTF_8);

        var request = HttpRequest.newBuilder()
                .uri(browserlessURI.resolve("/content?token="+BROWSERLESS_TOKEN+"&launch="+launchParametersStr))
                .method("POST", HttpRequest.BodyPublishers.ofString(
                        gson.toJson(requestData)
                ))
                .header("Content-type", "application/json")
                .build();

        var rsp = httpClient.send(request, HttpResponse.BodyHandlers.ofString());

        if (rsp.statusCode() >= 300) {
            logger.info("Failed to fetch annotated content for {}, status {}", url, rsp.statusCode());
            return Optional.empty();
        }

        return Optional.of(rsp.body());
    }

    public byte[] screenshot(String url, GotoOptions gotoOptions, ScreenshotOptions screenshotOptions)
            throws IOException, InterruptedException {
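A sketch of how a caller might drive annotatedContent, assuming a browserless container is reachable at the given address and its token is configured the same way as in the integration test further down (the localhost URI is illustrative):

    import nu.marginalia.livecapture.BrowserlessClient;

    import java.net.URI;

    class AnnotatedFetchDemo {
        public static void main(String[] args) throws Exception {
            // Assumes a local marginalia-browserless container, as in BrowserlessClientTest
            try (var client = new BrowserlessClient(URI.create("http://localhost:3000"))) {
                client.annotatedContent("https://www.marginalia.nu/",
                                BrowserlessClient.GotoOptions.defaultValues())
                        .ifPresent(html -> System.out.println(html.length() + " chars of annotated DOM"));
            }
        }
    }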
@@ -126,7 +126,6 @@ public class LiveCaptureGrpcService
        }
        else {
            EdgeDomain domain = domainNameOpt.get();
            String domainNameStr = domain.toString();

            if (!isValidDomainForCapture(domain)) {
                ScreenshotDbOperations.flagDomainAsFetched(conn, domain);
@@ -79,9 +79,17 @@ public class SimpleFeedParser {
            if (!link.isBlank())
                break;
            var tag = element.getElementsByTag(attr).first();

            if (tag != null) {
                link = tag.text();
                String linkText = tag.text();

                if (linkText.isBlank()) {
                    linkText = tag.attr("href");
                }

                link = linkText;
            }

        }

        ret.add(new ItemData(title, description, link, pubDate));
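The href fallback matters because RSS carries the link in the element text (<link>https://…</link>) while Atom carries it in an attribute (<link href="https://…"/>), so tag.text() comes back blank for Atom feeds. A small jsoup illustration of the two shapes:

    import org.jsoup.Jsoup;
    import org.jsoup.parser.Parser;

    class FeedLinkDemo {
        public static void main(String[] args) {
            var rss  = Jsoup.parse("<item><link>https://example.com/a</link></item>", "", Parser.xmlParser());
            var atom = Jsoup.parse("<entry><link href=\"https://example.com/b\"/></entry>", "", Parser.xmlParser());

            var rssTag  = rss.getElementsByTag("link").first();
            var atomTag = atom.getElementsByTag("link").first();

            System.out.println(rssTag.text());        // https://example.com/a
            System.out.println(atomTag.text());       // "" -- empty for Atom-style links
            System.out.println(atomTag.attr("href")); // https://example.com/b
        }
    }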
@@ -0,0 +1,113 @@
package nu.marginalia.domsample.db;

import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.testcontainers.shaded.org.apache.commons.io.FileUtils;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;

import static org.junit.jupiter.api.Assertions.*;

class DomSampleDbTest {
    Path tempDir;

    @BeforeEach
    void setUp() throws Exception {
        tempDir = Files.createTempDirectory("test");
    }

    @AfterEach
    void tearDown() throws IOException {
        FileUtils.deleteDirectory(tempDir.toFile());
    }

    @Test
    public void testSetUp() {
        var dbPath = tempDir.resolve("test.db");
        try (var db = new DomSampleDb(dbPath)) {
        }
        catch (Exception e) {
            fail("Failed to set up database: " + e.getMessage());
        }
    }

    @Test
    public void testSyncDomains() {
        var dbPath = tempDir.resolve("test.db");
        try (var db = new DomSampleDb(dbPath)) {

            db.syncDomains(Set.of("example.com", "test.com", "foobar.com"));
            assertEquals(Set.of("example.com", "test.com", "foobar.com"), new HashSet<>(db.getScheduledDomains()));
            db.syncDomains(Set.of("example.com", "test.com"));
            assertEquals(Set.of("example.com", "test.com"), new HashSet<>(db.getScheduledDomains()));
            db.syncDomains(Set.of("foobar.com", "test.com"));
            assertEquals(Set.of("foobar.com", "test.com"), new HashSet<>(db.getScheduledDomains()));
        }
        catch (Exception e) {
            fail("Failed to sync domains: " + e.getMessage());
        }
    }

    @Test
    public void testFetchDomains() {
        var dbPath = tempDir.resolve("test.db");
        try (var db = new DomSampleDb(dbPath)) {

            db.syncDomains(Set.of("example.com", "test.com", "foobar.com"));
            db.flagDomainAsFetched("example.com");
            db.flagDomainAsFetched("test.com");
            db.flagDomainAsFetched("foobar.com");
            assertEquals(List.of("example.com", "test.com", "foobar.com"), db.getScheduledDomains());
            db.flagDomainAsFetched("test.com");
            assertEquals(List.of("example.com", "foobar.com", "test.com"), db.getScheduledDomains());
        }
        catch (Exception e) {
            fail("Failed to sync domains: " + e.getMessage());
        }
    }

    @Test
    public void saveLoadSingle() {
        var dbPath = tempDir.resolve("test.db");
        try (var db = new DomSampleDb(dbPath)) {
            db.saveSampleRaw("example.com", "http://example.com/sample", "sample data", "requests data", true);
            var samples = db.getSamples("example.com");
            assertEquals(1, samples.size());
            var sample = samples.getFirst();
            assertEquals("example.com", sample.domain());
            assertEquals("http://example.com/sample", sample.url());
            assertEquals("sample data", sample.sample());
            assertEquals("requests data", sample.requests());
            assertTrue(sample.acceptedPopover());
        }
        catch (Exception e) {
            fail("Failed to save/load sample: " + e.getMessage());
        }
    }

    @Test
    public void saveLoadTwo() {
        var dbPath = tempDir.resolve("test.db");
        try (var db = new DomSampleDb(dbPath)) {
            db.saveSampleRaw("example.com", "http://example.com/sample", "sample data", "r1", true);
            db.saveSampleRaw("example.com", "http://example.com/sample2", "sample data2", "r2", false);
            var samples = db.getSamples("example.com");
            assertEquals(2, samples.size());

            Map<String, String> samplesByUrl = new HashMap<>();
            for (var sample : samples) {
                samplesByUrl.put(sample.url(), sample.sample());
            }

            assertEquals("sample data", samplesByUrl.get("http://example.com/sample"));
            assertEquals("sample data2", samplesByUrl.get("http://example.com/sample2"));
        }
        catch (Exception e) {
            fail("Failed to save/load sample: " + e.getMessage());
        }
    }
}
@@ -3,17 +3,21 @@ package nu.marginalia.livecapture;
import com.github.tomakehurst.wiremock.WireMockServer;
import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
import nu.marginalia.WmsaHome;
import nu.marginalia.domsample.db.DomSampleDb;
import nu.marginalia.service.module.ServiceConfigurationModule;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.testcontainers.containers.GenericContainer;
import org.testcontainers.images.PullPolicy;
import org.testcontainers.junit.jupiter.Testcontainers;
import org.testcontainers.utility.DockerImageName;

import java.io.IOException;
import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Map;

import static com.github.tomakehurst.wiremock.client.WireMock.*;
@@ -22,9 +26,14 @@ import static com.github.tomakehurst.wiremock.client.WireMock.*;
@Testcontainers
@Tag("slow")
public class BrowserlessClientTest {
    static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome"))
    // Run gradle docker if this image is not available
    static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("marginalia-browserless"))
            .withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
            .withImagePullPolicy(PullPolicy.defaultPolicy())
            .withNetworkMode("bridge")
            .withLogConsumer(frame -> {
                System.out.print(frame.getUtf8String());
            })
            .withExposedPorts(3000);

    static WireMockServer wireMockServer =
@@ -34,6 +43,7 @@ public class BrowserlessClientTest {
    static String localIp;

    static URI browserlessURI;
    static URI browserlessWssURI;

    @BeforeAll
    public static void setup() throws IOException {
@@ -44,6 +54,12 @@ public class BrowserlessClientTest {
                container.getMappedPort(3000))
        );

        browserlessWssURI = URI.create(String.format("ws://%s:%d/?token=BROWSERLESS_TOKEN",
                container.getHost(),
                container.getMappedPort(3000))
        );

        wireMockServer.start();
        wireMockServer.stubFor(get("/").willReturn(aResponse().withStatus(200).withBody("Ok")));

@@ -85,6 +101,30 @@ public class BrowserlessClientTest {
        }
    }

    @Test
    public void testAnnotatedContent() throws Exception {

        try (var client = new BrowserlessClient(browserlessURI);
             DomSampleDb dbop = new DomSampleDb(Path.of("/tmp/dom-sample.db"))
        ) {
            var content = client.annotatedContent("https://marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues()).orElseThrow();
            dbop.saveSample("marginalia.nu", "https://marginalia.nu/", content);
            System.out.println(content);
            Assertions.assertFalse(content.isBlank(), "Content should not be empty");

            dbop.getSamples("marginalia.nu").forEach(sample -> {
                System.out.println("Sample URL: " + sample.url());
                System.out.println("Sample Content: " + sample.sample());
                System.out.println("Sample Requests: " + sample.requests());
                System.out.println("Accepted Popover: " + sample.acceptedPopover());
            });
        }
        finally {
            Files.deleteIfExists(Path.of("/tmp/dom-sample.db"));
        }

    }

    @Test
    public void testScreenshot() throws Exception {
        try (var client = new BrowserlessClient(browserlessURI)) {
code/functions/nsfw-domain-filter/build.gradle (new file, 43 lines)
@@ -0,0 +1,43 @@
plugins {
    id 'java'
    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
    }
}

apply from: "$rootProject.projectDir/srcsets.gradle"

dependencies {

    implementation project(':code:common:config')
    implementation project(':code:common:model')
    implementation project(':code:common:db')

    implementation libs.bundles.slf4j
    implementation libs.prometheus
    implementation libs.guava
    implementation libs.commons.lang3
    implementation dependencies.create(libs.guice.get()) {
        exclude group: 'com.google.guava'
    }
    implementation libs.notnull
    implementation libs.fastutil
    implementation libs.bundles.mariadb

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito

    testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
    testImplementation libs.commons.codec
    testImplementation project(':code:common:service')
    testImplementation 'org.testcontainers:mariadb:1.17.4'
    testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
    testImplementation project(':code:libraries:test-helpers')
}
@@ -0,0 +1,192 @@
package nu.marginalia.nsfw;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import com.zaxxer.hikari.HikariDataSource;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.InputStreamReader;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.zip.GZIPInputStream;

@Singleton
public class NsfwDomainFilter {
    private final HikariDataSource dataSource;

    private final List<String> dangerLists;
    private final List<String> smutLists;

    private volatile IntOpenHashSet blockedDomainIdsTier1 = new IntOpenHashSet();
    private volatile IntOpenHashSet blockedDomainIdsTier2 = new IntOpenHashSet();

    private static final Logger logger = LoggerFactory.getLogger(NsfwDomainFilter.class);

    public static final int NSFW_DISABLE = 0;
    public static final int NSFW_BLOCK_DANGER = 1;
    public static final int NSFW_BLOCK_SMUT = 2;

    @Inject
    public NsfwDomainFilter(HikariDataSource dataSource,
                            @Named("nsfw.dangerLists") List<String> dangerLists,
                            @Named("nsfw.smutLists") List<String> smutLists
    ) {
        this.dataSource = dataSource;

        this.dangerLists = dangerLists;
        this.smutLists = smutLists;

        Thread.ofPlatform().daemon().name("NsfwDomainFilterSync").start(() -> {
            while (true) {
                sync();
                try {
                    TimeUnit.HOURS.sleep(1);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    break; // Exit the loop if interrupted
                }
            }
        });
    }

    public boolean isBlocked(int domainId, int tier) {
        if (tier == 0)
            return false;

        if (tier >= 1 && blockedDomainIdsTier1.contains(domainId))
            return true;
        if (tier >= 2 && blockedDomainIdsTier2.contains(domainId))
            return true;

        return false;
    }

    private synchronized void sync() {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("SELECT ID, TIER FROM NSFW_DOMAINS")
        ) {
            var rs = stmt.executeQuery();
            IntOpenHashSet tier1 = new IntOpenHashSet();
            IntOpenHashSet tier2 = new IntOpenHashSet();

            while (rs.next()) {
                int domainId = rs.getInt("ID");
                int tier = rs.getInt("TIER");

                switch (tier) {
                    case 1 -> tier1.add(domainId);
                    case 2 -> tier2.add(domainId);
                }
            }

            this.blockedDomainIdsTier1 = tier1;
            this.blockedDomainIdsTier2 = tier2;

            logger.info("NSFW domain filter synced: {} tier 1, {} tier 2", tier1.size(), tier2.size());

        }
        catch (SQLException ex) {
            logger.error("Failed to sync NSFW domain filter", ex);
        }
    }

    public synchronized void fetchLists() {
        try (var conn = dataSource.getConnection();
             HttpClient client = HttpClient.newBuilder()
                     .followRedirects(HttpClient.Redirect.ALWAYS)
                     .build();
             var stmt = conn.createStatement();
             var insertStmt = conn.prepareStatement("INSERT IGNORE INTO NSFW_DOMAINS_TMP (ID, TIER) SELECT ID, ? FROM EC_DOMAIN WHERE DOMAIN_NAME = ?")) {

            stmt.execute("DROP TABLE IF EXISTS NSFW_DOMAINS_TMP");
            stmt.execute("CREATE TABLE NSFW_DOMAINS_TMP LIKE NSFW_DOMAINS");

            List<String> combinedDangerList = new ArrayList<>(10_000);
            for (var dangerListUrl : dangerLists) {
                combinedDangerList.addAll(fetchList(client, dangerListUrl));
            }

            for (String domain : combinedDangerList) {
                insertStmt.setInt(1, NSFW_BLOCK_DANGER);
                insertStmt.setString(2, domain);
                insertStmt.execute();
            }

            List<String> combinedSmutList = new ArrayList<>(10_000);
            for (var smutListUrl : smutLists) {
                combinedSmutList.addAll(fetchList(client, smutListUrl));
            }

            for (String domain : combinedSmutList) {
                insertStmt.setInt(1, NSFW_BLOCK_SMUT);
                insertStmt.setString(2, domain);
                insertStmt.execute();
            }

            stmt.execute("""
                    DROP TABLE IF EXISTS NSFW_DOMAINS
                    """);
            stmt.execute("""
                    RENAME TABLE NSFW_DOMAINS_TMP TO NSFW_DOMAINS
                    """);
            sync();
        }
        catch (SQLException ex) {
            logger.error("Failed to fetch NSFW domain lists", ex);
        }
    }

    public List<String> fetchList(HttpClient client, String url) {

        logger.info("Fetching NSFW domain list from {}", url);

        var request = HttpRequest.newBuilder()
                .uri(java.net.URI.create(url))
                .build();

        try {
            if (url.endsWith(".gz")) {
                var response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());

                byte[] body = response.body();

                try (var reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(new ByteArrayInputStream(body))))) {
                    return reader.lines()
                            .filter(StringUtils::isNotEmpty)
                            .toList();
                } catch (Exception e) {
                    logger.error("Error reading GZIP response from {}", url, e);
                }
            } else {
                var response = client.send(request, HttpResponse.BodyHandlers.ofString());
                if (response.statusCode() == 200) {

                    return Arrays.stream(StringUtils.split(response.body(), "\n"))
                            .filter(StringUtils::isNotEmpty)
                            .toList();
                } else {
                    logger.warn("Failed to fetch list from {}: HTTP {}", url, response.statusCode());
                }
            }
        }
        catch (Exception e) {
            logger.error("Error fetching NSFW domain list from {}", url, e);
        }

        return List.of();
    }
}
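The tier argument to isBlocked is cumulative: tier 1 consults only the danger lists (cryptojacking, malware, phishing), tier 2 additionally consults the adult/gambling lists, and NSFW_DISABLE (0) turns filtering off entirely. A small sketch of how a consumer might interpret the tiers, assuming an injected filter instance as in the test below:

    import nu.marginalia.nsfw.NsfwDomainFilter;

    class FilterTierDemo {
        // filter is assumed to come from Guice, as in NsfwDomainFilterTest
        static String classify(NsfwDomainFilter filter, int domainId) {
            if (filter.isBlocked(domainId, NsfwDomainFilter.NSFW_BLOCK_DANGER))
                return "blocked at every tier (danger list member)";
            if (filter.isBlocked(domainId, NsfwDomainFilter.NSFW_BLOCK_SMUT))
                return "blocked at the strict tier only (adult/gambling list member)";
            return "not blocked (or filtering disabled with NSFW_DISABLE)";
        }
    }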
@@ -0,0 +1,30 @@
package nu.marginalia.nsfw;

import com.google.inject.AbstractModule;
import com.google.inject.Provides;
import jakarta.inject.Named;

import java.util.List;

public class NsfwFilterModule extends AbstractModule {

    @Provides
    @Named("nsfw.dangerLists")
    public List<String> nsfwDomainLists1() {
        return List.of(
                "https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/cryptojacking/domains",
                "https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/malware/domains",
                "https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/phishing/domains"
        );
    }
    @Provides
    @Named("nsfw.smutLists")
    public List<String> nsfwDomainLists2() {
        return List.of(
                "https://github.com/olbat/ut1-blacklists/raw/refs/heads/master/blacklists/adult/domains.gz",
                "https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/gambling/domains"
        );
    }

    public void configure() {}
}
@@ -0,0 +1,108 @@
package nu.marginalia.nsfw;

import com.google.inject.AbstractModule;
import com.google.inject.Guice;
import com.google.inject.Provides;
import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource;
import jakarta.inject.Named;
import nu.marginalia.test.TestMigrationLoader;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.testcontainers.containers.MariaDBContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;

import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;


@Tag("slow")
@Testcontainers
class NsfwDomainFilterTest extends AbstractModule {

    @Container
    static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
            .withDatabaseName("WMSA_prod")
            .withUsername("wmsa")
            .withPassword("wmsa")
            .withNetworkAliases("mariadb");

    static HikariDataSource dataSource;
    static Path tempDir;

    @BeforeAll
    public static void setUpDb() throws IOException {
        tempDir = Files.createTempDirectory(NsfwDomainFilterTest.class.getSimpleName());

        System.setProperty("system.homePath", tempDir.toString());

        HikariConfig config = new HikariConfig();
        config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
        config.setUsername("wmsa");
        config.setPassword("wmsa");

        dataSource = new HikariDataSource(config);

        TestMigrationLoader.flywayMigration(dataSource);

        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES (?, ?, 1)")
        ) {

            // Ensure the database is ready
            conn.createStatement().execute("SELECT 1");

            stmt.setString(1, "www.google.com");
            stmt.setString(2, "google.com");
            stmt.executeUpdate();
            stmt.setString(1, "www.bing.com");
            stmt.setString(2, "bing.com");
            stmt.executeUpdate();
        } catch (Exception e) {
            throw new RuntimeException("Failed to connect to the database", e);
        }
    }

    @Provides
    @Named("nsfw.dangerLists")
    public List<String> nsfwDomainLists1() {
        return List.of(
                "https://downloads.marginalia.nu/test/list1"
        );
    }

    @Provides
    @Named("nsfw.smutLists")
    public List<String> nsfwDomainLists2() {
        return List.of(
                "https://downloads.marginalia.nu/test/list2.gz"
        );
    }

    public void configure() {
        bind(HikariDataSource.class).toInstance(dataSource);
    }

    @Test
    public void test() {
        var filter = Guice
                .createInjector(this)
                .getInstance(NsfwDomainFilter.class);

        filter.fetchLists();

        assertTrue(filter.isBlocked(1, NsfwDomainFilter.NSFW_BLOCK_DANGER));
        assertTrue(filter.isBlocked(1, NsfwDomainFilter.NSFW_BLOCK_SMUT));
        assertFalse(filter.isBlocked(2, NsfwDomainFilter.NSFW_BLOCK_DANGER));
        assertTrue(filter.isBlocked(2, NsfwDomainFilter.NSFW_BLOCK_SMUT));
    }

}
@@ -1,9 +1,6 @@
package nu.marginalia.api.searchquery;

import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.QueryResponse;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.*;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
@@ -32,6 +29,8 @@ public class QueryProtobufCodec {
        builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
        builder.setHumanQuery(request.getHumanQuery());

        builder.setNsfwFilterTierValue(request.getNsfwFilterTierValue());

        builder.setQuality(IndexProtobufCodec.convertSpecLimit(query.specs.quality));
        builder.setYear(IndexProtobufCodec.convertSpecLimit(query.specs.year));
        builder.setSize(IndexProtobufCodec.convertSpecLimit(query.specs.size));
@@ -78,6 +77,8 @@ public class QueryProtobufCodec {
        builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
        builder.setHumanQuery(humanQuery);

        builder.setNsfwFilterTier(RpcIndexQuery.NSFW_FILTER_TIER.DANGER);

        builder.setQuality(IndexProtobufCodec.convertSpecLimit(query.specs.quality));
        builder.setYear(IndexProtobufCodec.convertSpecLimit(query.specs.year));
        builder.setSize(IndexProtobufCodec.convertSpecLimit(query.specs.size));
@@ -112,6 +113,7 @@ public class QueryProtobufCodec {
                request.getSearchSetIdentifier(),
                QueryStrategy.valueOf(request.getQueryStrategy()),
                RpcTemporalBias.Bias.valueOf(request.getTemporalBias().getBias().name()),
                NsfwFilterTier.fromCodedValue(request.getNsfwFilterTierValue()),
                request.getPagination().getPage()
        );
    }
@@ -327,6 +329,7 @@ public class QueryProtobufCodec {
                .setRank(IndexProtobufCodec.convertSpecLimit(params.rank()))
                .setSearchSetIdentifier(params.identifier())
                .setQueryStrategy(params.queryStrategy().name())
                .setNsfwFilterTierValue(params.filterTier().getCodedValue())
                .setTemporalBias(RpcTemporalBias.newBuilder()
                        .setBias(RpcTemporalBias.Bias.valueOf(params.temporalBias().name()))
                        .build())
@@ -0,0 +1,26 @@
+package nu.marginalia.api.searchquery.model.query;
+
+public enum NsfwFilterTier {
+    OFF(0),
+    DANGER(1),
+    PORN_AND_GAMBLING(2);
+
+    private final int codedValue; // same as ordinal() for now, but can be changed later if needed
+
+    NsfwFilterTier(int codedValue) {
+        this.codedValue = codedValue;
+    }
+
+    public static NsfwFilterTier fromCodedValue(int codedValue) {
+        for (NsfwFilterTier tier : NsfwFilterTier.values()) {
+            if (tier.codedValue == codedValue) {
+                return tier;
+            }
+        }
+        throw new IllegalArgumentException("Invalid coded value for NsfwFilterTier: " + codedValue);
+    }
+
+    public int getCodedValue() {
+        return codedValue;
+    }
+}
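A small round-trip sketch for the enum above, assuming nothing beyond the class itself; it illustrates why an explicit codedValue is safer than ordinal() for a wire format, since constants can later be reordered without changing the numbers:

    // Round-trip between tier and wire value.
    NsfwFilterTier tier = NsfwFilterTier.PORN_AND_GAMBLING;
    int coded = tier.getCodedValue();                    // 2, matches the proto enum below
    assert NsfwFilterTier.fromCodedValue(coded) == tier; // linear scan over values()
    // NsfwFilterTier.fromCodedValue(99) throws IllegalArgumentException rather than defaulting to OFF.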
@@ -25,10 +25,11 @@ public record QueryParams(
         String identifier,
         QueryStrategy queryStrategy,
         RpcTemporalBias.Bias temporalBias,
+        NsfwFilterTier filterTier,
         int page
         )
 {
-    public QueryParams(String query, RpcQueryLimits limits, String identifier) {
+    public QueryParams(String query, RpcQueryLimits limits, String identifier, NsfwFilterTier filterTier) {
         this(query, null,
                 List.of(),
                 List.of(),
@@ -43,6 +44,7 @@ public record QueryParams(
                 identifier,
                 QueryStrategy.AUTO,
                 RpcTemporalBias.Bias.NONE,
+                filterTier,
                 1 // page
         );
     }
@@ -1,6 +1,7 @@
 package nu.marginalia.api.searchquery.model.results;
 
 import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import org.jetbrains.annotations.NotNull;
 
@@ -161,4 +162,14 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
     public String toString() {
         return "DecoratedSearchResultItem(rawIndexResult=" + this.getRawIndexResult() + ", url=" + this.getUrl() + ", title=" + this.getTitle() + ", description=" + this.getDescription() + ", urlQuality=" + this.getUrlQuality() + ", format=" + this.getFormat() + ", features=" + this.getFeatures() + ", pubYear=" + this.getPubYear() + ", dataHash=" + this.getDataHash() + ", wordsTotal=" + this.getWordsTotal() + ", bestPositions=" + this.getBestPositions() + ", rankingScore=" + this.getRankingScore() + ", resultsFromDomain=" + this.getResultsFromDomain() + ", rankingDetails=" + this.getRankingDetails() + ")";
     }
+
+    public String getShortFormat() {
+        try {
+            var df = DocumentFormat.valueOf(format);
+            return df.shortFormat;
+        }
+        catch (IllegalArgumentException e) {
+            return DocumentFormat.UNKNOWN.shortFormat;
+        }
+    }
 }
@@ -32,6 +32,14 @@ message RpcQsQuery {
     RpcTemporalBias temporalBias = 16;
 
     RpcQsQueryPagination pagination = 17;
+
+    NSFW_FILTER_TIER nsfwFilterTier = 18;
+
+    enum NSFW_FILTER_TIER {
+        NONE = 0;
+        DANGER = 1;
+        PORN_AND_GAMBLING = 2;
+    };
 }
 
 /* Query service query response */
@@ -78,8 +86,17 @@ message RpcIndexQuery {
     RpcQueryLimits queryLimits = 10;
     string queryStrategy = 11; // Named query configuration
     RpcResultRankingParameters parameters = 12;
+
+    NSFW_FILTER_TIER nsfwFilterTier = 13;
+
+    enum NSFW_FILTER_TIER {
+        NONE = 0;
+        DANGER = 1;
+        PORN_AND_GAMBLING = 2;
+    };
 }
 
 
 /* A tagged union encoding some limit on a field */
 message RpcSpecLimit {
     int32 value = 1;
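The proto enum numbers line up with NsfwFilterTier's coded values (NONE/OFF = 0, DANGER = 1, PORN_AND_GAMBLING = 2), which is what allows the codec to pass the raw integer through setNsfwFilterTierValue and getNsfwFilterTierValue without a mapping table. A sketch of a client setting the tier, using the standard protobuf-java generated builder API; the query text is a made-up example:

    // Assumes the protoc-generated Java classes for the messages above.
    RpcQsQuery query = RpcQsQuery.newBuilder()
            .setHumanQuery("vintage synthesizers")
            .setNsfwFilterTier(RpcQsQuery.NSFW_FILTER_TIER.DANGER)
            .build();

    int coded = query.getNsfwFilterTierValue(); // 1, the integer form the codec forwards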
@@ -19,6 +19,7 @@ dependencies {
     implementation project(':code:common:model')
     implementation project(':code:common:service')
 
+    implementation project(':code:functions:nsfw-domain-filter')
     implementation project(':code:functions:search-query:api')
 
     implementation project(':code:index:query')
@@ -11,6 +11,7 @@ import nu.marginalia.api.searchquery.model.query.QueryParams;
 import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
 import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
 import nu.marginalia.index.api.IndexClient;
+import nu.marginalia.nsfw.NsfwDomainFilter;
 import nu.marginalia.service.server.DiscoverableService;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -34,13 +35,16 @@ public class QueryGRPCService
 
 
     private final QueryFactory queryFactory;
+    private final NsfwDomainFilter nsfwDomainFilter;
     private final IndexClient indexClient;
 
     @Inject
     public QueryGRPCService(QueryFactory queryFactory,
+                            NsfwDomainFilter nsfwDomainFilter,
                             IndexClient indexClient)
     {
         this.queryFactory = queryFactory;
+        this.nsfwDomainFilter = nsfwDomainFilter;
         this.indexClient = indexClient;
     }
 
@@ -3,6 +3,7 @@ package nu.marginalia.query.svc;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.api.searchquery.RpcQueryLimits;
 import nu.marginalia.api.searchquery.RpcTemporalBias;
+import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
 import nu.marginalia.api.searchquery.model.query.QueryParams;
 import nu.marginalia.api.searchquery.model.query.SearchSpecification;
 import nu.marginalia.functions.searchquery.QueryFactory;
@@ -58,6 +59,7 @@ public class QueryFactoryTest {
                 "NONE",
                 QueryStrategy.AUTO,
                 RpcTemporalBias.Bias.NONE,
+                NsfwFilterTier.OFF,
                 0), null).specs;
     }
 
@@ -17,6 +17,7 @@ dependencies {
     implementation project(':code:common:service')
    implementation project(':code:common:db')
     implementation project(':code:libraries:message-queue')
+    implementation project(':code:functions:nsfw-domain-filter')
     implementation project(':code:functions:search-query:api')
 
     implementation libs.bundles.slf4j
@@ -2,11 +2,13 @@ package nu.marginalia.index.api;
 
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
+import io.prometheus.client.Counter;
 import nu.marginalia.api.searchquery.IndexApiGrpc;
 import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
 import nu.marginalia.api.searchquery.RpcIndexQuery;
 import nu.marginalia.db.DomainBlacklistImpl;
 import nu.marginalia.model.id.UrlIdCodec;
+import nu.marginalia.nsfw.NsfwDomainFilter;
 import nu.marginalia.service.client.GrpcChannelPoolFactory;
 import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
 import nu.marginalia.service.discovery.property.ServiceKey;
@@ -28,14 +30,26 @@ public class IndexClient {
     private static final Logger logger = LoggerFactory.getLogger(IndexClient.class);
     private final GrpcMultiNodeChannelPool<IndexApiGrpc.IndexApiBlockingStub> channelPool;
     private final DomainBlacklistImpl blacklist;
+    private final NsfwDomainFilter nsfwDomainFilter;
+
+    Counter wmsa_index_query_count = Counter.build()
+            .name("wmsa_nsfw_filter_result_count")
+            .labelNames("tier")
+            .help("Count of results filtered by NSFW tier")
+            .register();
 
     private static final ExecutorService executor = Executors.newCachedThreadPool();
 
     @Inject
-    public IndexClient(GrpcChannelPoolFactory channelPoolFactory, DomainBlacklistImpl blacklist) {
+    public IndexClient(GrpcChannelPoolFactory channelPoolFactory,
+                       DomainBlacklistImpl blacklist,
+                       NsfwDomainFilter nsfwDomainFilter
+                       ) {
         this.channelPool = channelPoolFactory.createMulti(
                 ServiceKey.forGrpcApi(IndexApiGrpc.class, ServicePartition.multi()),
                 IndexApiGrpc::newBlockingStub);
         this.blacklist = blacklist;
+        this.nsfwDomainFilter = nsfwDomainFilter;
     }
 
     private static final Comparator<RpcDecoratedResultItem> comparator =
@@ -52,7 +66,7 @@ public class IndexClient {
     public AggregateQueryResponse executeQueries(RpcIndexQuery indexRequest, Pagination pagination) {
 
         final int requestedMaxResults = indexRequest.getQueryLimits().getResultsTotal();
-
+        int filterTier = indexRequest.getNsfwFilterTierValue();
         AtomicInteger totalNumResults = new AtomicInteger(0);
 
         List<RpcDecoratedResultItem> results =
@@ -74,7 +88,7 @@ public class IndexClient {
                     }
                 })
                 .flatMap(List::stream)
-                .filter(item -> !isBlacklisted(item))
+                .filter(item -> !isBlacklisted(item, filterTier))
                 .sorted(comparator)
                 .skip(Math.max(0, (pagination.page - 1) * pagination.pageSize))
                 .limit(pagination.pageSize)
@@ -83,8 +97,23 @@ public class IndexClient {
         return new AggregateQueryResponse(results, pagination.page(), totalNumResults.get());
     }
 
-    private boolean isBlacklisted(RpcDecoratedResultItem item) {
-        return blacklist.isBlacklisted(UrlIdCodec.getDomainId(item.getRawItem().getCombinedId()));
+    static String[] tierNames = {
+            "OFF",
+            "DANGER",
+            "NSFW"
+    };
+
+    private boolean isBlacklisted(RpcDecoratedResultItem item, int filterTier) {
+        int domainId = UrlIdCodec.getDomainId(item.getRawItem().getCombinedId());
+
+        if (blacklist.isBlacklisted(domainId)) {
+            return true;
+        }
+        if (nsfwDomainFilter.isBlocked(domainId, filterTier)) {
+            wmsa_index_query_count.labels(tierNames[filterTier]).inc();
+            return true;
+        }
+        return false;
     }
 
 }
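The predicate order above matters: the blacklist check runs first and short-circuits, so the NSFW counter only counts results that survived the blacklist, and both checks run before the sort/skip/limit pagination stage, so pages stay full after filtering. A condensed restatement of the per-item decision (a sketch; the authoritative version is the method above):

    // domainId is decoded once from the combined id, then both checks share it.
    boolean dropped = blacklist.isBlacklisted(domainId)
            || nsfwDomainFilter.isBlocked(domainId, filterTier); // increments the counter when it hits

One practical detail: the counter field is named wmsa_index_query_count in the code, but it registers under the metric name wmsa_nsfw_filter_result_count, so dashboards and alerts should query the latter.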
@@ -84,7 +84,7 @@ public class ForwardIndexConverter {
 
         LongArray docFileData = LongArrayFactory.mmapForWritingConfined(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size());
 
-        ByteBuffer workArea = ByteBuffer.allocate(65536);
+        ByteBuffer workArea = ByteBuffer.allocate(1024*1024*100);
         for (var instance : journal.pages()) {
             try (var slopTable = new SlopTable(instance.baseDir(), instance.page()))
             {
@@ -15,6 +15,10 @@ dependencies {
 
     implementation libs.bundles.slf4j
     implementation libs.opencsv
+    implementation libs.guava
+    implementation dependencies.create(libs.guice.get()) {
+        exclude group: 'com.google.guava'
+    }
 
     testImplementation libs.bundles.slf4j.test
     testImplementation libs.bundles.junit
@@ -1,5 +1,6 @@
 package nu.marginalia.geoip;
 
+import com.google.inject.Singleton;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.geoip.sources.AsnMapping;
 import nu.marginalia.geoip.sources.AsnTable;
@@ -10,6 +11,7 @@ import org.slf4j.LoggerFactory;
 import java.net.InetAddress;
 import java.util.Optional;
 
+@Singleton
 public class GeoIpDictionary {
     private volatile IP2LocationMapping ip2locMapping = null;
     private volatile AsnTable asnTable = null;
@@ -76,7 +78,7 @@ public class GeoIpDictionary {
     }
 
     public Optional<AsnTable.AsnInfo> getAsnInfo(int ipAddress) {
-        if (null == asnTable) { // not loaded yet or failed to load
+        if (null == asnMapping || null == asnTable) { // not loaded yet or failed to load
             return Optional.empty();
         }
 
@@ -62,6 +62,7 @@ dependencies {
     implementation libs.jwarc
 
     implementation libs.jsoup
+    implementation libs.pdfbox
 
     implementation libs.guava
     implementation dependencies.create(libs.guice.get()) {
@@ -1,8 +1,8 @@
 package nu.marginalia.converting.model;
 
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.HtmlFeature;
-import nu.marginalia.model.html.HtmlStandard;
 import nu.marginalia.model.idx.DocumentMetadata;
 
 import javax.annotation.Nullable;
@@ -21,7 +21,7 @@ public class ProcessedDocumentDetails {
     public long hashCode;
 
     public Set<HtmlFeature> features;
-    public HtmlStandard standard;
+    public DocumentFormat format;
 
     public List<EdgeUrl> linksInternal;
     public List<EdgeUrl> linksExternal;
@@ -30,6 +30,6 @@ public class ProcessedDocumentDetails {
     public GeneratorType generator;
 
     public String toString() {
-        return "ProcessedDocumentDetails(title=" + this.title + ", description=" + this.description + ", pubYear=" + this.pubYear + ", length=" + this.length + ", quality=" + this.quality + ", hashCode=" + this.hashCode + ", features=" + this.features + ", standard=" + this.standard + ", linksInternal=" + this.linksInternal + ", linksExternal=" + this.linksExternal + ", metadata=" + this.metadata + ", generator=" + this.generator + ")";
+        return "ProcessedDocumentDetails(title=" + this.title + ", description=" + this.description + ", pubYear=" + this.pubYear + ", length=" + this.length + ", quality=" + this.quality + ", hashCode=" + this.hashCode + ", features=" + this.features + ", standard=" + this.format + ", linksInternal=" + this.linksInternal + ", linksExternal=" + this.linksExternal + ", metadata=" + this.metadata + ", generator=" + this.generator + ")";
     }
 }
@@ -7,6 +7,7 @@ import nu.marginalia.converting.model.DisqualifiedException;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.processor.plugin.AbstractDocumentProcessorPlugin;
 import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
+import nu.marginalia.converting.processor.plugin.PdfDocumentProcessorPlugin;
 import nu.marginalia.converting.processor.plugin.PlainTextDocumentProcessorPlugin;
 import nu.marginalia.keyword.LinkTexts;
 import nu.marginalia.model.EdgeDomain;
@@ -33,7 +34,8 @@ public class DocumentProcessor {
     private static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
             "application/xhtml",
             "text/html",
-            "text/plain");
+            "text/plain",
+            "application/pdf");
 
 
     private final List<AbstractDocumentProcessorPlugin> processorPlugins = new ArrayList<>();
@@ -42,12 +44,14 @@ public class DocumentProcessor {
     @Inject
     public DocumentProcessor(HtmlDocumentProcessorPlugin htmlDocumentProcessorPlugin,
                              PlainTextDocumentProcessorPlugin plainTextDocumentProcessorPlugin,
+                             PdfDocumentProcessorPlugin pdfDocumentProcessorPlugin,
                              AnchorTextKeywords anchorTextKeywords)
     {
         this.anchorTextKeywords = anchorTextKeywords;
 
         processorPlugins.add(htmlDocumentProcessorPlugin);
         processorPlugins.add(plainTextDocumentProcessorPlugin);
+        processorPlugins.add(pdfDocumentProcessorPlugin);
    }
 
     public ProcessedDocument process(CrawledDocument crawledDocument,
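With the PDF plugin registered alongside the HTML and plain-text ones, dispatch is unchanged: each plugin reports whether it handles a document's content type via isApplicable, and the processor picks a matching plugin. A hypothetical sketch of that selection step (the real logic lives elsewhere in DocumentProcessor and may differ in detail):

    // Sketch under stated assumptions; 'crawledDocument' as in process() above.
    AbstractDocumentProcessorPlugin selected = null;
    for (AbstractDocumentProcessorPlugin plugin : processorPlugins) {
        if (plugin.isApplicable(crawledDocument)) {
            selected = plugin;
            break; // first match wins
        }
    }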
@@ -2,9 +2,9 @@ package nu.marginalia.converting.processor.logic;
 
 import crawlercommons.utils.Strings;
 import nu.marginalia.converting.model.DisqualifiedException;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jetbrains.annotations.NotNull;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
@@ -17,7 +17,7 @@ import java.util.Set;
 public class DocumentValuator {
 
     public double getQuality(CrawledDocument crawledDocument,
-                             HtmlStandard htmlStandard,
+                             DocumentFormat htmlStandard,
                              Document parsedDocument,
                              int textLength) throws DisqualifiedException {
 
@@ -1,7 +1,7 @@
 package nu.marginalia.converting.processor.logic;
 
 import com.google.common.base.Strings;
-import nu.marginalia.model.html.HtmlStandard;
+import nu.marginalia.model.DocumentFormat;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.DocumentType;
 import org.slf4j.Logger;
@@ -12,54 +12,54 @@ public class HtmlStandardExtractor {
 
     private static final Logger logger = LoggerFactory.getLogger(HtmlStandardExtractor.class);
 
-    public static HtmlStandard parseDocType(DocumentType docType) {
+    public static DocumentFormat parseDocType(DocumentType docType) {
         if (null == docType) {
-            return HtmlStandard.UNKNOWN;
+            return DocumentFormat.UNKNOWN;
         }
 
         String publicId = docType.publicId();
         if (Strings.isNullOrEmpty(publicId))
-            return HtmlStandard.HTML5;
+            return DocumentFormat.HTML5;
 
         publicId = publicId.toUpperCase();
         if (publicId.startsWith("-//SOFTQUAD SOFTWARE//DTD") && publicId.contains("HTML 4")) {
-            return HtmlStandard.HTML4;
+            return DocumentFormat.HTML4;
         }
         if (publicId.startsWith("-//SOFTQUAD SOFTWARE//DTD") && publicId.contains("HTML 3")) {
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         }
         if (publicId.startsWith("-//INTERNET/RFC XXXX//EN"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-//NETSCAPE COMM. CORP"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-//SQ//DTD HTML 2"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-//SOFTQUAD//DTD HTML 2"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-//W3O//DTD W3 HTML 2"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-//IETF//DTD HTML 2"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-//IETF//DTD HTML//EN"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-/W3C//DTD HTML 3"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-/W3C/DTD HTML 3"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-//IETF//DTD HTML 3"))
-            return HtmlStandard.HTML123;
+            return DocumentFormat.HTML123;
         if (publicId.startsWith("-//W3C//DTD XHTML"))
-            return HtmlStandard.XHTML;
+            return DocumentFormat.XHTML;
         if (publicId.startsWith("ISO/IEC 15445:2000//DTD"))
-            return HtmlStandard.XHTML;
+            return DocumentFormat.XHTML;
         if (publicId.startsWith("-//W3C//DTD HTML"))
-            return HtmlStandard.HTML4;
+            return DocumentFormat.HTML4;
 
         logger.debug("Unknown publicID standard {}", publicId);
-        return HtmlStandard.UNKNOWN;
+        return DocumentFormat.UNKNOWN;
     }
 
-    public static HtmlStandard sniffHtmlStandard(Document parsed) {
+    public static DocumentFormat sniffHtmlStandard(Document parsed) {
         int html4Attributes = 0;
         int html5Attributes = 0;
 
@@ -73,11 +73,11 @@ public class HtmlStandardExtractor {
             html4Attributes++;
         }
         if (html5Attributes > 0) {
-            return HtmlStandard.HTML5;
+            return DocumentFormat.HTML5;
        }
         if (html4Attributes > 0) {
-            return HtmlStandard.HTML4;
+            return DocumentFormat.HTML4;
         }
-        return HtmlStandard.HTML123;
+        return DocumentFormat.HTML123;
     }
 }
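A short usage sketch of the two-stage classification, assuming only Jsoup (already a dependency here) and the class above; the doctype path resolves first, and attribute sniffing is the fallback for documents with no usable doctype:

    Document doc = Jsoup.parse(
            "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">" +
            "<html><body>hello</body></html>");

    DocumentFormat format = HtmlStandardExtractor.parseDocType(doc.documentType());
    if (DocumentFormat.UNKNOWN.equals(format)) {
        format = HtmlStandardExtractor.sniffHtmlStandard(doc);
    }
    // format == DocumentFormat.HTML4: "-//W3C//DTD HTML" is the last prefix checked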
@@ -7,11 +7,11 @@ import nu.marginalia.keyword.LinkTexts;
 import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
 import nu.marginalia.language.filter.LanguageFilter;
 import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.html.HtmlStandard;
 
 import javax.annotation.Nullable;
 import java.io.IOException;
@@ -73,7 +73,7 @@ public abstract class AbstractDocumentProcessorPlugin {
             return this;
         }
 
-        public MetaTagsBuilder addFormat(HtmlStandard standard) {
+        public MetaTagsBuilder addFormat(DocumentFormat standard) {
 
             add("format", standard);
 
@@ -25,12 +25,12 @@ import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
 import nu.marginalia.link_parser.FeedExtractor;
 import nu.marginalia.link_parser.LinkParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.html.HtmlStandard;
 import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import org.jsoup.nodes.Document;
@@ -137,8 +137,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
 
 
         final int length = getLength(doc);
-        final HtmlStandard standard = getHtmlStandard(doc);
-        final double quality = documentValuator.getQuality(crawledDocument, standard, doc, length);
+        final DocumentFormat format = getDocumentFormat(doc);
+        final double quality = documentValuator.getQuality(crawledDocument, format, doc, length);
 
         if (isDisqualified(documentClass, url, quality, doc.title())) {
             throw new DisqualifiedException(DisqualificationReason.QUALITY);
@@ -152,7 +152,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         var ret = new ProcessedDocumentDetails();
 
         ret.length = length;
-        ret.standard = standard;
+        ret.format = format;
         ret.title = specialization.getTitle(doc, dld, crawledDocument.url);
 
         final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, documentHeaders, dld);
@@ -161,7 +161,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         ret.quality = documentValuator.adjustQuality(quality, features);
         ret.hashCode = dld.localitySensitiveHashCode();
 
-        PubDate pubDate = pubDateSniffer.getPubDate(documentHeaders, url, doc, standard, true);
+        PubDate pubDate = pubDateSniffer.getPubDate(documentHeaders, url, doc, format, true);
 
         EnumSet<DocumentFlags> documentFlags = documentFlags(features, generatorParts.type());
 
@@ -180,7 +180,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
                 .addPubDate(pubDate)
                 .addUrl(url)
                 .addFeatures(features)
-                .addFormat(standard)
+                .addFormat(format)
                 .addGenerator(generatorParts.keywords())
                 .build();
 
@@ -316,12 +316,12 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         return linkTerms;
     }
 
-    private HtmlStandard getHtmlStandard(Document doc) {
-        HtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(doc.documentType());
-        if (HtmlStandard.UNKNOWN.equals(htmlStandard)) {
+    private DocumentFormat getDocumentFormat(Document doc) {
+        DocumentFormat format = HtmlStandardExtractor.parseDocType(doc.documentType());
+        if (DocumentFormat.UNKNOWN.equals(format)) {
             return HtmlStandardExtractor.sniffHtmlStandard(doc);
         }
-        return htmlStandard;
+        return format;
     }
 
     private int getLength(Document doc) {
@@ -0,0 +1,286 @@
+package nu.marginalia.converting.processor.plugin;
+
+import com.google.inject.Inject;
+import com.google.inject.name.Named;
+import nu.marginalia.converting.model.DisqualifiedException;
+import nu.marginalia.converting.model.ProcessedDocumentDetails;
+import nu.marginalia.converting.processor.DocumentClass;
+import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
+import nu.marginalia.converting.processor.plugin.specialization.DefaultSpecialization;
+import nu.marginalia.keyword.DocumentKeywordExtractor;
+import nu.marginalia.keyword.LinkTexts;
+import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
+import nu.marginalia.language.filter.LanguageFilter;
+import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
+import nu.marginalia.model.DocumentFormat;
+import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.crawl.HtmlFeature;
+import nu.marginalia.model.crawl.PubDate;
+import nu.marginalia.model.crawldata.CrawledDocument;
+import nu.marginalia.model.idx.DocumentFlags;
+import nu.marginalia.model.idx.DocumentMetadata;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.text.HeadingAwarePDFTextStripper;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.time.LocalDate;
+import java.util.*;
+
+
+public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {
+
+    private final int maxTitleLength;
+    private final DocumentKeywordExtractor keywordExtractor;
+    private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
+    private final DocumentLengthLogic documentLengthLogic;
+    private final DefaultSpecialization defaultSpecialization;
+
+    private static final Logger logger = LoggerFactory.getLogger(PdfDocumentProcessorPlugin.class);
+
+    @Inject
+    public PdfDocumentProcessorPlugin(@Named("max-title-length") Integer maxTitleLength,
+                                      LanguageFilter languageFilter,
+                                      ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
+                                      DocumentKeywordExtractor keywordExtractor,
+                                      DocumentLengthLogic documentLengthLogic,
+                                      DefaultSpecialization defaultSpecialization)
+
+    {
+        super(languageFilter);
+        this.sentenceExtractorProvider = sentenceExtractorProvider;
+        this.documentLengthLogic = documentLengthLogic;
+        this.maxTitleLength = maxTitleLength;
+        this.keywordExtractor = keywordExtractor;
+        this.defaultSpecialization = defaultSpecialization;
+    }
+
+    @Override
+    public boolean isApplicable(CrawledDocument doc) {
+        String contentType = doc.contentType.toLowerCase();
+
+        if (contentType.equals("application/pdf"))
+            return true;
+        if (contentType.startsWith("application/pdf;")) // charset=blabla
+            return true;
+
+        return false;
+    }
+
+    @Override
+    public DetailsWithWords createDetails(CrawledDocument crawledDocument,
+                                          LinkTexts linkTexts,
+                                          DocumentClass documentClass)
+            throws DisqualifiedException, URISyntaxException, IOException {
+
+        String documentBody = crawledDocument.documentBody();
+
+        if (languageFilter.isBlockedUnicodeRange(documentBody)) {
+            throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE);
+        }
+
+        final EdgeUrl url = new EdgeUrl(crawledDocument.url);
+
+
+        Document doc;
+        try {
+            doc = convertPdfToHtml(crawledDocument.documentBodyBytes);
+        } catch (IOException e) {
+            logger.error("Failed to convert PDF file {} - {}", url, e.getMessage());
+            throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.ERROR);
+        }
+
+        DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(doc);
+
+        checkDocumentLanguage(dld);
+
+        documentLengthLogic.validateLength(dld, 1.0);
+
+        var ret = new ProcessedDocumentDetails();
+
+        ret.length = documentBody.length();
+
+        ret.format = DocumentFormat.PDF;
+        ret.title = StringUtils.truncate(defaultSpecialization.getTitle(doc, dld, url.toString()), maxTitleLength);
+
+        ret.quality = -5;
+
+        ret.features = Set.of(HtmlFeature.PDF);
+        ret.description = getDescription(doc);
+        ret.hashCode = dld.localitySensitiveHashCode();
+
+        final PubDate pubDate = new PubDate(LocalDate.ofYearDay(1993, 1));
+
+        EnumSet<DocumentFlags> documentFlags = EnumSet.of(DocumentFlags.PdfFile);
+
+        ret.metadata = new DocumentMetadata(
+                documentLengthLogic.getEncodedAverageLength(dld),
+                pubDate.yearByte(),
+                (int) -ret.quality,
+                documentFlags);
+
+        DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, linkTexts, url);
+
+        var tagWords = new MetaTagsBuilder()
+                .addPubDate(pubDate)
+                .addUrl(url)
+                .addFeatures(ret.features)
+                .addFormat(ret.format)
+                .build();
+
+        words.addAllSyntheticTerms(tagWords);
+
+        if (pubDate.hasYear()) {
+            ret.pubYear = pubDate.year();
+        }
+
+        /* These are assumed to be populated */
+        ret.linksInternal = new ArrayList<>();
+        ret.linksExternal = new ArrayList<>();
+
+        return new DetailsWithWords(ret, words);
+    }
+
+    private String getDescription(Document doc) {
+        int cnt = 0;
+        boolean useNext = false;
+        for (var ptag : doc.getElementsByTag("p")) {
+            String text = ptag.text();
+
+            // Many academic documents have an abstract at the start of the document,
+            // which makes a nice summary. Though they tend to bleed into the text,
+            // so we check for the word "Abstract" at the start of the paragraph.
+
+            if (text.startsWith("Abstract ")) {
+                return StringUtils.abbreviate(text.substring("Abstract ".length()), "...", 255);
+            }
+            else if (text.equals("Abstract")) {
+                useNext = true;
+            }
+            else if (useNext) {
+                return StringUtils.abbreviate(text, "...", 255);
+            }
+
+            if (++cnt > 15) { // Don't scan the entire document
+                break;
+            }
+        }
+
+        // Fall back to the default specialization
+        return defaultSpecialization.getSummary(doc, Set.of());
+
+    }
+
+    /** Convert the provided PDF bytes into a HTML rendering that can be fed
+     * to the HTML processor.
+     */
+    Document convertPdfToHtml(byte[] pdfBytes) throws IOException {
+        try (var doc = Loader.loadPDF(pdfBytes)) {
+            String docMetaTitle = Objects.requireNonNullElse(doc.getDocumentInformation().getTitle(), "");
+
+            var stripper = new HeadingAwarePDFTextStripper();
+            stripper.setStartPage(1);
+            stripper.setSortByPosition(true);
+            stripper.setWordSeparator(" ");
+
+            // Increase the tolerance for line spacing to deal better with paragraphs.
+            stripper.setDropThreshold(5f);
+
+            stripper.setPageStart("<div>");
+            stripper.setParagraphStart("<p>");
+            stripper.setParagraphEnd("</p>\n");
+            stripper.setPageEnd("</div>\n");
+            stripper.setHeadingStart("<h1>");
+            stripper.setHeadingEnd("</h1>\n");
+            stripper.setLineSeparator("\n");
+
+            String text = stripper.getText(doc);
+
+            StringBuilder htmlBuilder = new StringBuilder(text.length() + 1024);
+            htmlBuilder.append("<html><body>")
+                    .append(text)
+                    .append("</body></html>");
+
+            var parsed = Jsoup.parse(htmlBuilder.toString());
+
+            repairDOM(parsed);
+
+            for (var heading : parsed.getElementsByTag("h1")) {
+                String headingText = heading.text();
+                if (headingText.length() > 2) {
+                    parsed.title(headingText);
+                    break;
+                }
+            }
+
+
+            if (parsed.title().isEmpty()) {
+                // Prefer setting the title to the first paragraph in the
+                // document, as this is almost always correct. Otherwise,
+                // we fall back on the metadata title, which is almost always
+                // useless
+
+                var firstP = parsed.getElementsByTag("p").first();
+                if (firstP != null) parsed.title(firstP.text());
+                else parsed.title(docMetaTitle);
+            }
+            return parsed;
+        }
+
+
+    }
+
+    /** Repair the DOM to remove some common issues with PDF conversion,
+     * including empty paragraphs, and multiline headers that are split into multiple
+     * consecutive h1 tags.
+     */
+    private void repairDOM(Document parsed) {
+
+        // <p><h1>...</h1></p> -> <h1>...</h1>
+        parsed.getElementsByTag("h1").forEach(h1 -> {
+            var parent = h1.parent();
+            if (parent == null || !"p".equals(parent.tagName())) {
+                return;
+            }
+
+            if (parent.childrenSize() == 1) {
+                parent.replaceWith(h1);
+            }
+        });
+
+        // Remove empty <p> tags
+        parsed.getElementsByTag("p").forEach(p -> {
+            if (p.childrenSize() == 0 && !p.hasText()) {
+                p.remove();
+            }
+        });
+
+        // <h1>...</h1><h1>...</h1> -> <h1>...</h1>
+        parsed.getElementsByTag("h1").forEach(h1 -> {
+            var nextSibling = h1.nextElementSibling();
+            if (nextSibling == null || !"h1".equals(nextSibling.tagName())) {
+                return; // Short-circuit to avoid unnecessary work
+            }
+
+            StringJoiner joiner = new StringJoiner(" ");
+            joiner.add(h1.text());
+
+            for (var sibling : h1.nextElementSiblings()) {
+                if (!"h1".equals(sibling.tagName()))
+                    break;
+                joiner.add(sibling.text());
+                sibling.remove();
+            }
+
+            h1.text(joiner.toString());
+        });
+
+    }
+
+}
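A minimal harness for the conversion step in isolation; convertPdfToHtml is package-private, so a sketch like this would sit in the same package, e.g. a test. The injector line is an assumption about how the plugin would be obtained; PDFBox's Loader is the real API, while HeadingAwarePDFTextStripper appears to be a project-local extension of PDFBox's text stripper:

    // Sketch: feed a PDF from disk through the converter and inspect the result.
    byte[] pdfBytes = Files.readAllBytes(Path.of("paper.pdf")); // hypothetical input file

    PdfDocumentProcessorPlugin plugin =
            injector.getInstance(PdfDocumentProcessorPlugin.class); // assumed Guice wiring

    Document html = plugin.convertPdfToHtml(pdfBytes);
    System.out.println(html.title());                      // merged h1 / first paragraph / metadata title
    System.out.println(html.getElementsByTag("p").size()); // paragraphs recovered via the drop threshold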
@@ -13,10 +13,10 @@ import nu.marginalia.keyword.LinkTexts;
 import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
 import nu.marginalia.language.filter.LanguageFilter;
 import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.html.HtmlStandard;
 import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import org.apache.commons.lang3.StringUtils;
@@ -91,7 +91,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
 
         ret.length = documentBody.length();
 
-        ret.standard = HtmlStandard.PLAIN;
+        ret.format = DocumentFormat.PLAIN;
         ret.title = StringUtils.truncate(plainTextLogic.getTitle(url, firstFewLines), maxTitleLength);
 
         ret.quality = -1;
@@ -113,7 +113,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
                 .addPubDate(pubDate)
                 .addUrl(url)
                 .addFeatures(ret.features)
-                .addFormat(ret.standard)
+                .addFormat(ret.format)
                 .build();
 
         words.addAllSyntheticTerms(tagWords);
@@ -1,12 +1,13 @@
 package nu.marginalia.converting.processor.pubdate;
 
-import nu.marginalia.model.html.HtmlStandard;
+import nu.marginalia.model.DocumentFormat;
 
 public class PubDateFromHtmlStandard {
     /** Used to bias pub date heuristics */
-    public static int blindGuess(HtmlStandard standard) {
-        return switch (standard) {
+    public static int blindGuess(DocumentFormat format) {
+        return switch (format) {
             case PLAIN -> 1993;
+            case PDF -> 2010;
             case HTML123 -> 1997;
             case HTML4, XHTML -> 2006;
             case HTML5 -> 2018;
@@ -21,8 +22,8 @@ public class PubDateFromHtmlStandard {
      * Discovering publication year involves a lot of guesswork, this helps
      * keep the guesses relatively sane.
      */
-    public static boolean isGuessPlausible(HtmlStandard standard, int year) {
-        switch (standard) {
+    public static boolean isGuessPlausible(DocumentFormat format, int year) {
+        switch (format) {
             case HTML123:
                 return year <= 2000;
             case XHTML:
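The switch gives each format family a center-of-mass year for the guess, with PDF biased toward 2010. A quick check of the behavior, assuming JUnit-style assertions on the classpath:

    assertEquals(2010, PubDateFromHtmlStandard.blindGuess(DocumentFormat.PDF));
    assertTrue(PubDateFromHtmlStandard.isGuessPlausible(DocumentFormat.HTML123, 1998)); // <= 2000 passes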
@@ -1,14 +1,14 @@
 package nu.marginalia.converting.processor.pubdate;
 
 import nu.marginalia.converting.model.DocumentHeaders;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
 
 public interface PubDateHeuristic {
 
-    Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard);
+    Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard);
 }
@@ -1,7 +1,7 @@
 package nu.marginalia.converting.processor.pubdate;
 
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 
 import java.time.DateTimeException;
 import java.time.LocalDate;
@@ -26,7 +26,7 @@ public class PubDateParser {
                 .filter(PubDateParser::validateDate);
     }
 
-    public static Optional<PubDate> attemptParseDate(String date, HtmlStandard standard) {
+    public static Optional<PubDate> attemptParseDate(String date, DocumentFormat standard) {
         return Optional.ofNullable(date)
                 .filter(str -> str.length() >= 4 && str.length() < 32)
                 .flatMap(str ->
@@ -81,7 +81,7 @@ public class PubDateParser {
     }
 
 
-    public static Optional<PubDate> dateFromHighestYearLookingSubstringWithGuess(String maybe, HtmlStandard standard) {
+    public static Optional<PubDate> dateFromHighestYearLookingSubstringWithGuess(String maybe, DocumentFormat standard) {
         int guess = PubDateFromHtmlStandard.blindGuess(standard);
 
         var matcher = yearPattern.matcher(maybe);
@@ -135,7 +135,7 @@ public class PubDateParser {
         return (max + min) / 2;
     }
 
-    public static int guessYear(HtmlStandard standard) {
+    public static int guessYear(DocumentFormat standard) {
         // Create some jitter to avoid having documents piling up in the same four years
         // as this would make searching in those years disproportionately useless
 
@@ -2,9 +2,9 @@ package nu.marginalia.converting.processor.pubdate;
 
 import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.heuristic.*;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.ArrayList;
@@ -38,7 +38,7 @@ public class PubDateSniffer {
         heuristics.add(new PubDateHeuristicGuessFromHtmlStandard());
     }
 
-    public PubDate getPubDate(DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard, boolean runExpensive) {
+    public PubDate getPubDate(DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard, boolean runExpensive) {
         final PubDateEffortLevel effortLevel = runExpensive ? PubDateEffortLevel.HIGH : PubDateEffortLevel.LOW;
 
         for (var heuristic : heuristics) {
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jetbrains.annotations.NotNull;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
@@ -19,7 +19,7 @@ import java.util.Optional;
 public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
         if (effortLevel == PubDateEffortLevel.LOW)
             return Optional.empty();
 
@@ -33,9 +33,9 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
 
     private static class DateExtractingNodeVisitorPass implements NodeFilter {
         public PubDate pubDate;
-        private final HtmlStandard htmlStandard;
+        private final DocumentFormat htmlStandard;
 
-        private DateExtractingNodeVisitorPass(HtmlStandard htmlStandard) {
+        private DateExtractingNodeVisitorPass(DocumentFormat htmlStandard) {
            this.htmlStandard = htmlStandard;
         }
 
@@ -135,7 +135,7 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
         }
 
         private void parse(String text) {
-            if (htmlStandard == HtmlStandard.UNKNOWN) {
+            if (htmlStandard == DocumentFormat.UNKNOWN) {
                 PubDateParser
                         .dateFromHighestYearLookingSubstring(text)
                         .ifPresent(this::setPubDate);
@@ -5,9 +5,9 @@ import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateFromHtmlStandard;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jetbrains.annotations.NotNull;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Node;
@@ -19,7 +19,7 @@ import java.util.Optional;
 public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
         if (effortLevel == PubDateEffortLevel.LOW)
             return Optional.empty();
 
@@ -33,9 +33,9 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
 
     private static class DateExtractingNodeVisitor implements NodeFilter {
         public PubDate pubDate;
-        private final HtmlStandard htmlStandard;
+        private final DocumentFormat htmlStandard;
 
-        private DateExtractingNodeVisitor(HtmlStandard htmlStandard) {
+        private DateExtractingNodeVisitor(DocumentFormat htmlStandard) {
             this.htmlStandard = htmlStandard;
         }
 
@@ -73,7 +73,7 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
         }
 
         private void parse(String text) {
-            if (htmlStandard == HtmlStandard.UNKNOWN) {
+            if (htmlStandard == DocumentFormat.UNKNOWN) {
                 PubDateParser
                         .dateFromHighestYearLookingSubstring(text)
                         .ifPresent(this::setPubDate);
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -14,8 +14,8 @@ import java.util.Optional;
 public class PubDateHeuristicGuessFromHtmlStandard implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
-        if (htmlStandard == HtmlStandard.UNKNOWN)
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
+        if (htmlStandard == DocumentFormat.UNKNOWN)
             return Optional.empty();
 
         return Optional.of(new PubDate(null, PubDateParser.guessYear(htmlStandard)));
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
 public class PubDateHeuristicHtml5AnyTimeTag implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
         // HTML5, alternative approach
         for (var tag : document.select("time")) {
             var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
 public class PubDateHeuristicHtml5ArticleDateTag implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
         // HTML5
         for (var tag : document.select("time[pubdate=\"pubdate\"]")) {
             var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
 public class PubDateHeuristicHtml5ItempropDateTag implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
         for (var tag : document.select("time[itemprop=\"datePublished\"]")) {
             var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
             if (maybeDate.isPresent()) {
@@ -8,9 +8,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.Collections;
@@ -21,7 +21,7 @@ import java.util.Optional;
 public class PubDateHeuristicJSONLD implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
         for (var tag : document.select("script[type=\"application/ld+json\"]")) {
             var maybeDate = parseLdJson(tag.data())
                     .flatMap(PubDateParser::attemptParseDate);
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.List;
@@ -15,7 +15,7 @@ import java.util.Optional;
 public class PubDateHeuristicLastModified implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
         List<String> lastModified = headers.get("last-modified");
         if (lastModified.isEmpty())
             return Optional.empty();
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
 public class PubDateHeuristicMicrodata implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
 
         for (var tag : document.select("meta[itemprop=\"datePublished\"]")) {
             var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
 public class PubDateHeuristicOpenGraph implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
         // OG
         for (var tag : document.select("meta[property=\"article:published_time\"]")) {
             var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -14,7 +14,7 @@ import java.util.Optional;
 public class PubDateHeuristicRDFaTag implements PubDateHeuristic {
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
         for (var tag : document.select("meta[property=\"datePublished\"]")) {
             var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
             if (maybeDate.isPresent()) {
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -21,7 +21,7 @@ public class PubDateHeuristicUrlPatternPass1 implements PubDateHeuristic {
     private static final int MIN_URL_PATTERN_YEAR = 2000;
 
     @Override
-    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
+    public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
         final String urlString = url.path;
 
         var matcher = yearUrlPattern.matcher(urlString);
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
 import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
 import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.nodes.Document;
 
 import java.util.Optional;
@@ -19,7 +19,7 @@ public class PubDateHeuristicUrlPatternPass2 implements PubDateHeuristic {
 
     @Override
     public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url,
-                                   Document document, HtmlStandard htmlStandard) {
+                                   Document document, DocumentFormat htmlStandard) {
         final String urlString = url.path;
 
         var matcher = yearUrlPattern.matcher(urlString);
@@ -8,12 +8,12 @@ import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.processor.DocumentClass;
 import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
 import nu.marginalia.keyword.LinkTexts;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.model.crawl.UrlIndexingState;
 import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.html.HtmlStandard;
 import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.model.idx.WordFlags;
@@ -53,6 +53,7 @@ public class SideloaderProcessing {
                 "",
                 body.getBytes(StandardCharsets.UTF_8),
                 false,
+                -1,
                 null,
                 null
         );
@@ -83,7 +84,7 @@
         // that we can't get from the sideloaded data since it's
         // so stripped down
 
-        ret.details.standard = HtmlStandard.HTML5;
+        ret.details.format = DocumentFormat.HTML5;
         ret.details.pubYear = pubYear;
         ret.details.features.add(HtmlFeature.JS);
         ret.details.features.add(HtmlFeature.TRACKING);
@@ -9,13 +9,13 @@ import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb;
 import nu.marginalia.keyword.DocumentKeywordExtractor;
 import nu.marginalia.keyword.LinkTexts;
 import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.model.crawl.UrlIndexingState;
-import nu.marginalia.model.html.HtmlStandard;
 import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.model.idx.WordFlags;
@@ -165,7 +165,7 @@ public class StackexchangeSideloader implements SideloadSource {
         ret.details.description = StringUtils.truncate(doc.body().text(), 255);
         ret.details.length = 128;
 
-        ret.details.standard = HtmlStandard.HTML5;
+        ret.details.format = DocumentFormat.HTML5;
         ret.details.linksExternal = List.of();
         ret.details.linksInternal = List.of();
         ret.state = UrlIndexingState.OK;
@@ -124,7 +124,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
                 document.details.title,
                 document.details.description,
                 HtmlFeature.encode(document.details.features),
-                document.details.standard.name(),
+                document.details.format.name(),
                 document.details.length,
                 document.details.hashCode,
                 (float) document.details.quality,
File diff suppressed because it is too large
@@ -6,6 +6,7 @@ import com.google.inject.Injector;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.io.SerializableCrawlDataStream;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.model.crawl.PubDate;
@@ -13,7 +14,6 @@ import nu.marginalia.model.crawl.UrlIndexingState;
 import nu.marginalia.model.crawldata.CrawledDocument;
 import nu.marginalia.model.crawldata.CrawledDomain;
 import nu.marginalia.model.crawldata.SerializableCrawlData;
-import nu.marginalia.model.html.HtmlStandard;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Tag;
 import org.junit.jupiter.api.Test;
@@ -91,7 +91,7 @@ public class ConvertingIntegrationTest {
 
             assertTrue(details.title.length() > 4);
             assertTrue(details.description.length() > 4);
-            assertEquals(HtmlStandard.HTML5, details.standard);
+            assertEquals(DocumentFormat.HTML5, details.format);
 
         }
     }
@@ -125,7 +125,7 @@ public class ConvertingIntegrationTest {
             assertTrue(details.metadata.size() > 0);
             assertTrue(details.title.length() > 4);
             assertTrue(details.description.length() > 4);
-            assertEquals(HtmlStandard.HTML5, details.standard);
+            assertEquals(DocumentFormat.HTML5, details.format);
         }
     }
 
@@ -148,6 +148,7 @@ public class ConvertingIntegrationTest {
                 "",
                 readClassPathFile(p.toString()).getBytes(),
                 false,
+                -1,
                 null,
                 null
         );
@@ -0,0 +1,95 @@
+package nu.marginalia.converting.processor.plugin;
+
+import nu.marginalia.WmsaHome;
+import nu.marginalia.converting.processor.DocumentClass;
+import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
+import nu.marginalia.converting.processor.logic.TitleExtractor;
+import nu.marginalia.converting.processor.plugin.specialization.DefaultSpecialization;
+import nu.marginalia.converting.processor.summary.SummaryExtractor;
+import nu.marginalia.converting.processor.summary.heuristic.*;
+import nu.marginalia.keyword.DocumentKeywordExtractor;
+import nu.marginalia.keyword.LinkTexts;
+import nu.marginalia.language.filter.LanguageFilter;
+import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
+import nu.marginalia.model.crawldata.CrawledDocument;
+import nu.marginalia.term_frequency_dict.TermFrequencyDict;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Tag;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.net.HttpURLConnection;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.time.Instant;
+
+@Tag("flaky")
+class PdfDocumentProcessorPluginTest {
+    static PdfDocumentProcessorPlugin plugin;
+
+    @BeforeAll
+    static void setUpBeforeClass() throws Exception {
+        var lm = WmsaHome.getLanguageModels();
+        plugin = new PdfDocumentProcessorPlugin(255,
+                new LanguageFilter(lm),
+                new ThreadLocalSentenceExtractorProvider(lm),
+                new DocumentKeywordExtractor(new TermFrequencyDict(lm)),
+                new DocumentLengthLogic(100),
+                new DefaultSpecialization(new SummaryExtractor(
+                        255,
+                        new DomFilterHeuristic(255),
+                        new TagDensityHeuristic(255),
+                        new OpenGraphDescriptionHeuristic(),
+                        new MetaDescriptionHeuristic(),
+                        new FallbackHeuristic()
+                ),
+                new TitleExtractor(255)
+                ));
+    }
+
+    public AbstractDocumentProcessorPlugin.DetailsWithWords testPdfFile(byte[] pdfBytes) throws Exception {
+        var doc = new CrawledDocument("test", "https://www.example.com/sample.pdf", "application/pdf", Instant.now().toString(), 200, "OK", "OK", "", pdfBytes, false, -1, null, null);
+        return plugin.createDetails(doc, new LinkTexts(), DocumentClass.NORMAL);
+    }
+
+    public AbstractDocumentProcessorPlugin.DetailsWithWords testPdfFile(Path file) throws Exception {
+        return testPdfFile(Files.readAllBytes(file));
+    }
+
+    private byte[] downloadPDF(String url) throws IOException, URISyntaxException {
+        HttpURLConnection conn = (HttpURLConnection) new URI(url).toURL().openConnection();
+        try {
+            return conn.getInputStream().readAllBytes();
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        } finally {
+            conn.disconnect();
+        }
+    }
+
+    @Disabled
+    @Test
+    void testingTool() throws Exception {
+        System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample.pdf")).details().title);
+        System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample2.pdf")).details().title);
+        System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample3.pdf")).details().title);
+        System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample4.pdf")).details().title);
+        System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample5.pdf")).details().title);
+        System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample6.pdf")).details().title);
+    }
+
+    @Disabled
+    @Test
+    void testingTool2() throws Exception {
+        System.out.println(plugin.convertPdfToHtml(Files.readAllBytes(Path.of("/home/st_work/Work/sample6.pdf"))));
+    }
+
+    @Test
+    void testMarginaliaSample() throws Exception {
+        var doc = plugin.convertPdfToHtml(downloadPDF("https://www.marginalia.nu/junk/test.pdf"));
+        System.out.println(doc.html());
+    }
+}
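The test above drives PdfDocumentProcessorPlugin.convertPdfToHtml, whose implementation is not part of this excerpt. A minimal sketch of what such a conversion step could look like, assuming Apache PDFBox as the underlying parser (an assumption; the plugin's actual dependency and layout handling are not shown here):

    // Hedged sketch only: flattens the PDF to text paragraphs wrapped in a
    // minimal HTML shell, so HTML-oriented title/summary/keyword logic can
    // run unchanged. The real plugin is presumably more layout-aware.
    import org.apache.pdfbox.Loader;
    import org.apache.pdfbox.pdmodel.PDDocument;
    import org.apache.pdfbox.text.PDFTextStripper;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;

    static Document convertPdfToHtmlSketch(byte[] pdfBytes) throws java.io.IOException {
        try (PDDocument pdf = Loader.loadPDF(pdfBytes)) {
            String text = new PDFTextStripper().getText(pdf);
            String title = java.util.Objects.toString(
                    pdf.getDocumentInformation().getTitle(), "");

            Document doc = Jsoup.parse("<html><head></head><body></body></html>");
            doc.title(title);
            for (String para : text.split("\\n\\n+")) {
                doc.body().appendElement("p").text(para);
            }
            return doc;
        }
    }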
@@ -3,8 +3,8 @@ package nu.marginalia.converting.processor.pubdate;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.converting.model.DocumentHeaders;
 import nu.marginalia.converting.processor.pubdate.heuristic.PubDateHeuristicDOMParsingPass2;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.model.html.HtmlStandard;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.Test;
 
@@ -74,7 +74,7 @@
                 <time pubdate="pubdate" datetime="2022-08-24">time</time>
                 Wow, sure lor 'em boss
                 </article>
-                """), HtmlStandard.UNKNOWN, true);
+                """), DocumentFormat.UNKNOWN, true);
 
         assertFalse(ret.isEmpty());
         assertEquals("2022-08-24", ret.dateIso8601());
@@ -90,7 +90,7 @@
                 <time>2022-08-24</time>
                 Wow, sure lor 'em boss
                 </article>
-                """), HtmlStandard.UNKNOWN, true);
+                """), DocumentFormat.UNKNOWN, true);
 
         assertFalse(ret.isEmpty());
         assertEquals("2022-08-24", ret.dateIso8601());
@@ -106,7 +106,7 @@
                 <time class="published" datetime="July 13, 2006">July 13, 2006</time>
                 Wow, sure lor 'em boss
                 </article>
-                """), HtmlStandard.UNKNOWN, true);
+                """), DocumentFormat.UNKNOWN, true);
 
         assertFalse(ret.isEmpty());
         assertEquals(2006, ret.year());
@@ -116,14 +116,14 @@
     public void testProblemCases() throws IOException, URISyntaxException {
         var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
                 new EdgeUrl("https://www.example.com/"),
-                Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/The Switch to Linux Begins .html"))), HtmlStandard.HTML5, true);
+                Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/The Switch to Linux Begins .html"))), DocumentFormat.HTML5, true);
 
         assertFalse(ret.isEmpty());
         assertEquals(2006, ret.year());
 
         ret = dateSniffer.getPubDate(new DocumentHeaders(""),
                 new EdgeUrl("https://www.example.com/"),
-                Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), HtmlStandard.XHTML, true);
+                Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), DocumentFormat.XHTML, true);
 
         assertFalse(ret.isEmpty());
         assertEquals(2010, ret.year());
@@ -146,7 +146,7 @@
                 <!doctype html>
                 <html>
                 <meta itemprop="datePublished" content="2022-08-24" />
-                """), HtmlStandard.UNKNOWN, true);
+                """), DocumentFormat.UNKNOWN, true);
 
         assertFalse(ret.isEmpty());
         assertEquals("2022-08-24", ret.dateIso8601());
@@ -160,7 +160,7 @@
                 <!doctype html>
                 <html>
                 <meta property="datePublished" content="2022-08-24" />
-                """), HtmlStandard.UNKNOWN, true);
+                """), DocumentFormat.UNKNOWN, true);
 
         assertFalse(ret.isEmpty());
         assertEquals("2022-08-24", ret.dateIso8601());
@@ -174,7 +174,7 @@
                 <!doctype html>
                 <html>
                 <script type="application/ld+json">{"@context":"https:\\/\\/schema.org","@type":"Article","name":"In the Year 2525","url":"https:\\/\\/en.wikipedia.org\\/wiki\\/In_the_Year_2525","sameAs":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","mainEntity":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2004-08-24T14:39:14Z","dateModified":"2022-10-20T11:54:37Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/4\\/4a\\/In_the_Year_2525_by_Zager_and_Evans_US_vinyl_Side-A_RCA_release.png","headline":"song written and compsoed by Rick Evans, originally recorded by Zager and Evans and released in 1969"}</script><script type="application/ld+json">{"@context":"https:\\/\\/schema.org","@type":"Article","name":"In the Year 2525","url":"https:\\/\\/en.wikipedia.org\\/wiki\\/In_the_Year_2525","sameAs":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","mainEntity":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2004-08-24T14:39:14Z","dateModified":"2022-10-20T11:54:37Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/4\\/4a\\/In_the_Year_2525_by_Zager_and_Evans_US_vinyl_Side-A_RCA_release.png","headline":"song written and compsoed by Rick Evans, originally recorded by Zager and Evans and released in 1969"}</script>
-                """), HtmlStandard.UNKNOWN, true);
+                """), DocumentFormat.UNKNOWN, true);
 
         assertFalse(ret.isEmpty());
         assertEquals("2004-08-24", ret.dateIso8601());
@@ -188,7 +188,7 @@
                 <!doctype html>
                 <html>
                 <script type="application/ld+json" class="aioseop-schema">{"@context":"https://schema.org","@graph":[{"@type":"Organization","@id":"https://socialnomics.net/#organization","url":"https://socialnomics.net/","name":"Socialnomics","sameAs":[]},{"@type":"WebSite","@id":"https://socialnomics.net/#website","url":"https://socialnomics.net/","name":"Socialnomics","publisher":{"@id":"https://socialnomics.net/#organization"}},{"@type":"WebPage","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage","url":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","inLanguage":"en-US","name":"3 Reasons Why You Should Adopt Java-based Technology For Your Business","isPartOf":{"@id":"https://socialnomics.net/#website"},"breadcrumb":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#breadcrumblist"},"datePublished":"2016-12-27T21:01:36-06:00","dateModified":"2016-12-22T21:02:32-06:00"},{"@type":"Article","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#article","isPartOf":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage"},"author":{"@id":"https://socialnomics.net/author/rahis-saifi/#author"},"headline":"3 Reasons Why You Should Adopt Java-based Technology For Your Business","datePublished":"2016-12-27T21:01:36-06:00","dateModified":"2016-12-22T21:02:32-06:00","commentCount":0,"mainEntityOfPage":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage"},"publisher":{"@id":"https://socialnomics.net/#organization"},"articleSection":"Business, business, java, Java Developers, programming languages"},{"@type":"Person","@id":"https://socialnomics.net/author/rahis-saifi/#author","name":"Rahis Saifi","sameAs":["https://www.facebook.com/RahisSaifiOfficial","https://www.twitter.com/57rahis"],"image":{"@type":"ImageObject","@id":"https://socialnomics.net/#personlogo","url":"https://secure.gravatar.com/avatar/e67f630f0b8bc87e59e111d5e955961d?s=96&d=mm&r=g","width":96,"height":96,"caption":"Rahis Saifi"}},{"@type":"BreadcrumbList","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#breadcrumblist","itemListElement":[{"@type":"ListItem","position":1,"item":{"@type":"WebPage","@id":"https://socialnomics.net/","url":"https://socialnomics.net/","name":"Socialnomics Blog"}},{"@type":"ListItem","position":2,"item":{"@type":"WebPage","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","url":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","name":"3 Reasons Why You Should Adopt Java-based Technology For Your Business"}}]}]}</script>
-                """), HtmlStandard.UNKNOWN, true);
+                """), DocumentFormat.UNKNOWN, true);
 
         assertFalse(ret.isEmpty());
         assertEquals("2016-12-27", ret.dateIso8601());
@@ -202,7 +202,7 @@
                 <!doctype html>
                 <html>
                 <title>No date in the HTML</title>
-                """), HtmlStandard.UNKNOWN, true);
+                """), DocumentFormat.UNKNOWN, true);
 
         assertFalse(ret.isEmpty());
         assertNull(ret.dateIso8601());
@@ -217,7 +217,7 @@
                 <!doctype html>
                 <html>
                 <title>No date in the HTML</title>
-                """), HtmlStandard.UNKNOWN, true);
+                """), DocumentFormat.UNKNOWN, true);
 
         assertFalse(ret.isEmpty());
         assertEquals("2022-02-03", ret.dateIso8601());
@@ -232,7 +232,7 @@
                 <!doctype html>
                 <html>
                 <p>Published 2003, updated 2022</p>
-                """), HtmlStandard.HTML5, true);
+                """), DocumentFormat.HTML5, true);
 
         assertFalse(ret.isEmpty());
         assertNull(ret.dateIso8601());
@@ -258,7 +258,7 @@
                 <!doctype html>
                 <html>
                 <div style="float: left;"> <b>Post subject:</b> Keyboards.</div><div style="float: right;"><span class="postdetails"><b><img src="./styles/subsilver2/imageset/icon_post_target.gif" width="12" height="9" alt="Post" title="Post" /> <a href="./viewtopic.php?p=34580&sid=cf0c13dedebb4fea1f03fa73e510cd9f#p34580">#1</a></b></span> <b>Posted:</b> Sun Oct 03, 2010 5:37 pm </div>
-                """), HtmlStandard.UNKNOWN, true);
+                """), DocumentFormat.UNKNOWN, true);
 
         assertFalse(ret.isEmpty());
         assertNull(ret.dateIso8601());
@@ -58,6 +58,7 @@ dependencies {
     implementation libs.jsoup
     implementation libs.opencsv
    implementation libs.fastutil
+    implementation libs.bundles.curator

    implementation libs.bundles.mariadb
    implementation libs.bundles.httpcomponents
@@ -67,8 +68,6 @@
     testImplementation libs.mockito
     testImplementation libs.wiremock
-
-
     testImplementation project(':code:processes:test-data')
 }
@@ -25,9 +25,12 @@ import nu.marginalia.mq.MessageQueueFactory;
 import nu.marginalia.process.ProcessConfiguration;
 import nu.marginalia.process.ProcessConfigurationModule;
 import nu.marginalia.process.ProcessMainClass;
+import nu.marginalia.process.control.ProcessEventLog;
 import nu.marginalia.process.control.ProcessHeartbeatImpl;
 import nu.marginalia.process.log.WorkLog;
+import nu.marginalia.service.discovery.ServiceRegistryIf;
 import nu.marginalia.service.module.DatabaseModule;
+import nu.marginalia.service.module.ServiceDiscoveryModule;
 import nu.marginalia.slop.SlopCrawlDataRecord;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorageId;
@@ -54,6 +57,7 @@ public class CrawlerMain extends ProcessMainClass {
 
     private final UserAgent userAgent;
     private final ProcessHeartbeatImpl heartbeat;
+    private final ProcessEventLog eventLog;
     private final DomainProber domainProber;
     private final FileStorageService fileStorageService;
     private final AnchorTagsSourceFactory anchorTagsSourceFactory;
@@ -61,6 +65,7 @@ public class CrawlerMain extends ProcessMainClass {
     private final HikariDataSource dataSource;
     private final DomainBlacklist blacklist;
     private final int node;
+    private final ServiceRegistryIf serviceRegistry;
     private final SimpleBlockingThreadPool pool;
 
     private final DomainLocks domainLocks = new DomainLocks();
@@ -84,6 +89,7 @@ public class CrawlerMain extends ProcessMainClass {
     public CrawlerMain(UserAgent userAgent,
                        HttpFetcherImpl httpFetcher,
                        ProcessHeartbeatImpl heartbeat,
+                       ProcessEventLog eventLog,
                        MessageQueueFactory messageQueueFactory, DomainProber domainProber,
                        FileStorageService fileStorageService,
                        ProcessConfiguration processConfiguration,
@@ -91,6 +97,7 @@ public class CrawlerMain extends ProcessMainClass {
                        WarcArchiverFactory warcArchiverFactory,
                        HikariDataSource dataSource,
                        DomainBlacklist blacklist,
+                       ServiceRegistryIf serviceRegistry,
                        Gson gson) throws InterruptedException {
 
         super(messageQueueFactory, processConfiguration, gson, CRAWLER_INBOX);
@@ -98,6 +105,7 @@ public class CrawlerMain extends ProcessMainClass {
         this.userAgent = userAgent;
         this.fetcher = httpFetcher;
         this.heartbeat = heartbeat;
+        this.eventLog = eventLog;
         this.domainProber = domainProber;
         this.fileStorageService = fileStorageService;
         this.anchorTagsSourceFactory = anchorTagsSourceFactory;
@@ -105,6 +113,7 @@ public class CrawlerMain extends ProcessMainClass {
         this.dataSource = dataSource;
         this.blacklist = blacklist;
         this.node = processConfiguration.node();
+        this.serviceRegistry = serviceRegistry;
 
         SimpleBlockingThreadPool.ThreadType threadType;
         if (Boolean.getBoolean("crawler.useVirtualThreads")) {
@@ -147,12 +156,17 @@ public class CrawlerMain extends ProcessMainClass {
         Injector injector = Guice.createInjector(
                 new CrawlerModule(),
                 new ProcessConfigurationModule("crawler"),
+                new ServiceDiscoveryModule(),
                 new DatabaseModule(false)
         );
         var crawler = injector.getInstance(CrawlerMain.class);
 
         var instructions = crawler.fetchInstructions(nu.marginalia.mqapi.crawling.CrawlRequest.class);
+
+        crawler.serviceRegistry.registerProcess("crawler", crawler.node);
+
         try {
+            crawler.eventLog.logEvent("CRAWLER-INFO", "Crawling started");
             var req = instructions.value();
             if (req.targetDomainName != null) {
                 crawler.runForSingleDomain(req.targetDomainName, req.crawlStorage);
@@ -160,11 +174,15 @@ public class CrawlerMain extends ProcessMainClass {
             else {
                 crawler.runForDatabaseDomains(req.crawlStorage);
             }
+            crawler.eventLog.logEvent("CRAWLER-INFO", "Crawl completed successfully");
             instructions.ok();
         } catch (Exception ex) {
             logger.error("Crawler failed", ex);
             instructions.err();
         }
+        finally {
+            crawler.serviceRegistry.deregisterProcess("crawler", crawler.node);
+        }
 
         TimeUnit.SECONDS.sleep(5);
     }
@@ -448,13 +466,7 @@ public class CrawlerMain extends ProcessMainClass {
                 // We don't have a lock, so we can't run this task
                 // we return to avoid blocking the pool for too long
                 if (lock.isEmpty()) {
-                    if (retryQueue.remainingCapacity() > 0) {
-                        // Sleep a moment to avoid busy looping via the retry queue
-                        // in the case when few tasks remain and almost all are ineligible for
-                        // immediate restart
-                        Thread.sleep(5);
-                    }
-
                     pendingCrawlTasks.remove(domain);
                     retryQueue.put(this);
                     return;
                 }
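Taken together, the CrawlerMain changes make a crawl run announce itself: the process registers in the service registry before the crawl starts, logs start and completion events, and deregisters in a finally block. Isolated from the surrounding code, the lifecycle pattern is (names as in the diff above; runCrawl is a hypothetical stand-in for runForSingleDomain/runForDatabaseDomains):

    serviceRegistry.registerProcess("crawler", node);
    try {
        eventLog.logEvent("CRAWLER-INFO", "Crawling started");
        runCrawl();
        eventLog.logEvent("CRAWLER-INFO", "Crawl completed successfully");
    }
    finally {
        // guarantees the registry entry is cleaned up even if the crawl throws
        serviceRegistry.deregisterProcess("crawler", node);
    }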
@@ -36,6 +36,7 @@ import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
 import org.apache.hc.core5.http.message.MessageSupport;
 import org.apache.hc.core5.http.protocol.HttpContext;
 import org.apache.hc.core5.pool.PoolStats;
+import org.apache.hc.core5.ssl.SSLContextBuilder;
 import org.apache.hc.core5.util.TimeValue;
 import org.apache.hc.core5.util.Timeout;
 import org.jsoup.Jsoup;
@@ -48,10 +49,15 @@ import org.slf4j.MarkerFactory;
 
 import javax.net.ssl.SSLContext;
 import javax.net.ssl.SSLException;
+import javax.net.ssl.TrustManager;
+import javax.net.ssl.X509TrustManager;
 import java.io.IOException;
 import java.net.SocketTimeoutException;
 import java.net.URISyntaxException;
+import java.net.UnknownHostException;
+import java.security.KeyManagementException;
 import java.security.NoSuchAlgorithmException;
+import java.security.cert.X509Certificate;
 import java.time.Duration;
 import java.time.Instant;
 import java.util.*;
@@ -86,18 +92,49 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
         return connectionManager.getTotalStats();
     }
 
-    private CloseableHttpClient createClient() throws NoSuchAlgorithmException {
+    private CloseableHttpClient createClient() throws NoSuchAlgorithmException, KeyManagementException {
         final ConnectionConfig connectionConfig = ConnectionConfig.custom()
                 .setSocketTimeout(10, TimeUnit.SECONDS)
                 .setConnectTimeout(30, TimeUnit.SECONDS)
                 .setValidateAfterInactivity(TimeValue.ofSeconds(5))
                 .build();
 
+        // No-op up front validation of server certificates.
+        //
+        // We will validate certificates later, after the connection is established
+        // as we want to store the certificate chain and validation
+        // outcome to the database.
+
+        var trustMeBro = new X509TrustManager() {
+            private X509Certificate[] lastServerCertChain;
+
+            @Override
+            public void checkClientTrusted(X509Certificate[] chain, String authType) {
+            }
+
+            @Override
+            public void checkServerTrusted(X509Certificate[] chain, String authType) {
+                this.lastServerCertChain = chain.clone();
+            }
+
+            @Override
+            public X509Certificate[] getAcceptedIssuers() {
+                return new X509Certificate[0];
+            }
+
+            public X509Certificate[] getLastServerCertChain() {
+                return lastServerCertChain != null ? lastServerCertChain.clone() : null;
+            }
+        };
+
+        SSLContext sslContext = SSLContextBuilder.create().build();
+        sslContext.init(null, new TrustManager[]{trustMeBro}, null);
+
         connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
                 .setMaxConnPerRoute(2)
                 .setMaxConnTotal(5000)
                 .setDefaultConnectionConfig(connectionConfig)
-                .setTlsSocketStrategy(new DefaultClientTlsStrategy(SSLContext.getDefault()))
+                .setTlsSocketStrategy(new DefaultClientTlsStrategy(sslContext))
                 .build();
 
         connectionManager.setDefaultSocketConfig(SocketConfig.custom()
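Because the trust manager above accepts every chain during the handshake, actual certificate validation has to happen out of band, using the chain captured by getLastServerCertChain(). That validation is not part of this hunk; a sketch of what an after-the-fact check against the JVM's default trust store might look like (illustrative only, not the project's actual validation code):

    // Hedged sketch: re-validate a captured chain against the platform
    // trust store, returning the failure reason instead of throwing.
    import javax.net.ssl.TrustManager;
    import javax.net.ssl.TrustManagerFactory;
    import javax.net.ssl.X509TrustManager;
    import java.security.KeyStore;
    import java.security.cert.X509Certificate;
    import java.util.Optional;

    static Optional<String> validateChain(X509Certificate[] chain, String authType) {
        try {
            TrustManagerFactory tmf = TrustManagerFactory.getInstance(
                    TrustManagerFactory.getDefaultAlgorithm());
            tmf.init((KeyStore) null); // null selects the JVM's default cacerts store
            for (TrustManager tm : tmf.getTrustManagers()) {
                if (tm instanceof X509TrustManager x509) {
                    x509.checkServerTrusted(chain, authType); // throws if untrusted
                    return Optional.empty();
                }
            }
            return Optional.of("no X509TrustManager available");
        }
        catch (Exception e) {
            return Optional.of(e.toString());
        }
    }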
@@ -182,6 +219,8 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
             this.client = createClient();
         } catch (NoSuchAlgorithmException e) {
             throw new RuntimeException(e);
+        } catch (KeyManagementException e) {
+            throw new RuntimeException(e);
         }
         this.userAgentString = userAgent.uaString();
         this.userAgentIdentifier = userAgent.uaIdentifier();
@@ -192,6 +231,8 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
             this.client = createClient();
         } catch (NoSuchAlgorithmException e) {
             throw new RuntimeException(e);
+        } catch (KeyManagementException e) {
+            throw new RuntimeException(e);
         }
         this.userAgentString = userAgent;
         this.userAgentIdentifier = userAgent;
@@ -635,14 +676,12 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
 
     @Override
     public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
-        if (exception instanceof SocketTimeoutException) { // Timeouts are not recoverable
-            return false;
-        }
-        if (exception instanceof SSLException) { // SSL exceptions are unlikely to be recoverable
-            return false;
-        }
-
-        return executionCount <= 3;
+        return switch (exception) {
+            case SocketTimeoutException ste -> false;
+            case SSLException ssle -> false;
+            case UnknownHostException uhe -> false;
+            default -> executionCount <= 3;
+        };
     }
 
     @Override
@@ -57,6 +57,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
             return new ErrorBuffer();
         }
 
+        Instant start = Instant.now();
         InputStream is = null;
         try {
             is = entity.getContent();
@@ -71,8 +72,25 @@
             }
         }
         finally {
+            // We're required to consume the stream to avoid leaking connections,
+            // but we also don't want to get stuck on slow or malicious connections
+            // forever, so we set a time limit on this phase and call abort() if it's exceeded.
             try {
-                is.skip(Long.MAX_VALUE);
+                while (is != null) {
+                    // Consume some data
+                    if (is.skip(65536) == 0) {
+                        // Note that skip may return 0 if the stream is empty
+                        // or for other unspecified reasons, so we need to check
+                        // with read() as well to determine if the stream is done
+                        if (is.read() == -1)
+                            is = null;
+                    }
+                    // Check if the time limit has been exceeded
+                    else if (Duration.between(start, Instant.now()).compareTo(timeLimit) > 0) {
+                        request.abort();
+                        is = null;
+                    }
+                }
            }
            catch (IOException e) {
                // Ignore the exception
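The motivation for the loop: the old is.skip(Long.MAX_VALUE) drained the remainder of the response in a single blocking call, so a slow or deliberately trickling server could pin a crawler thread for as long as it liked. Draining 64 KiB at a time makes the elapsed-time check possible, and request.abort() severs the connection once the budget runs out, at the cost of not returning that connection cleanly to the pool.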
@@ -10,6 +10,7 @@ import java.net.http.HttpClient;
 import java.net.http.HttpHeaders;
 import java.net.http.HttpResponse;
 import java.nio.charset.StandardCharsets;
+import java.time.Duration;
 import java.util.*;
 import java.util.stream.Collectors;
 
@@ -90,8 +91,8 @@ public class WarcProtocolReconstructor {
         return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
     }
 
-    static String getResponseHeader(ClassicHttpResponse response, long size) {
-        String headerString = getHeadersAsString(response.getHeaders(), size);
+    static String getResponseHeader(ClassicHttpResponse response, Duration responseDuration, long size) {
+        String headerString = getHeadersAsString(response.getHeaders(), responseDuration, size);
 
         return response.getVersion().format() + " " + response.getCode() + " " + response.getReasonPhrase() + "\r\n" + headerString + "\r\n\r\n";
     }
@@ -160,7 +161,7 @@ public class WarcProtocolReconstructor {
 
-    static private String getHeadersAsString(Header[] headers, long responseSize) {
+    static private String getHeadersAsString(Header[] headers, Duration responseDuration, long responseSize) {
         StringJoiner joiner = new StringJoiner("\r\n");
 
         for (var header : headers) {
@@ -176,6 +177,7 @@ public class WarcProtocolReconstructor {
             if (headerCapitalized.equals("Content-Encoding"))
                 continue;
 
+
             // Since we're transparently decoding gzip, we need to update the Content-Length header
             // to reflect the actual size of the response body. We'll do this at the end.
             if (headerCapitalized.equals("Content-Length"))
@@ -184,6 +186,7 @@ public class WarcProtocolReconstructor {
             joiner.add(headerCapitalized + ": " + header.getValue());
         }
 
+        joiner.add("X-Marginalia-Response-Time: " + responseDuration.toMillis());
         joiner.add("Content-Length: " + responseSize);
 
         return joiner.toString();
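The synthetic X-Marginalia-Response-Time header bakes the request-to-response wall time into the reconstructed HTTP header block of each WARC response record, so fetch latency survives into the crawl data without a side channel. A downstream reader could recover it along these lines (illustrative sketch; 'headers' stands in for whatever header map the WARC consumer exposes):

    // Hedged sketch: parse the synthetic latency header written above.
    static java.time.Duration responseTime(java.util.Map<String, String> headers) {
        try {
            long ms = Long.parseLong(
                    headers.getOrDefault("X-Marginalia-Response-Time", ""));
            return java.time.Duration.ofMillis(Math.max(0, ms));
        }
        catch (NumberFormatException e) {
            return java.time.Duration.ZERO; // absent or malformed (e.g. older crawl data)
        }
    }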
@@ -93,7 +93,7 @@ public class WarcRecorder implements AutoCloseable {
         WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
         WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
 
-        Instant date = Instant.now();
+        Instant requestDate = Instant.now();
 
         // Not entirely sure why we need to do this, but keeping it due to Chesterton's Fence
         Map<String, List<String>> extraHeaders = new HashMap<>(request.getHeaders().length);
@@ -108,6 +108,8 @@ public class WarcRecorder implements AutoCloseable {
         try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, request, timeout);
              InputStream inputStream = inputBuffer.read()) {
 
+            Instant responseDate = Instant.now();
+
             cookies.updateCookieStore(response);
 
             // Build and write the request
@@ -126,7 +128,7 @@ public class WarcRecorder implements AutoCloseable {
 
             WarcRequest warcRequest = new WarcRequest.Builder(requestUri)
                     .blockDigest(requestDigestBuilder.build())
-                    .date(date)
+                    .date(requestDate)
                     .body(MediaType.HTTP_REQUEST, httpRequestString)
                     .build();
 
@@ -138,7 +140,9 @@ public class WarcRecorder implements AutoCloseable {
                 response.addHeader("X-Has-Cookies", 1);
             }
 
-            byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
+            byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response,
+                    Duration.between(requestDate, responseDate),
+                    inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
 
             ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);
 
@@ -169,7 +173,7 @@ public class WarcRecorder implements AutoCloseable {
 
             WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
                     .blockDigest(responseDigestBuilder.build())
-                    .date(date)
+                    .date(responseDate)
                     .concurrentTo(warcRequest.id())
                     .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
 
@@ -184,7 +188,7 @@ public class WarcRecorder implements AutoCloseable {
             warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
             writer.write(warcResponse);
 
-            if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
+            if (Duration.between(requestDate, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
                     && inputBuffer.size() < 2048
                     && !requestUri.getPath().endsWith("robots.txt")) // don't bail on robots.txt
             {
@@ -196,7 +200,7 @@ public class WarcRecorder implements AutoCloseable {
 
                 logger.warn("URL {} took too long to fetch ({}s) and was too small for the effort ({}b)",
                         requestUri,
-                        Duration.between(date, Instant.now()).getSeconds(),
+                        Duration.between(requestDate, Instant.now()).getSeconds(),
                         inputBuffer.size()
                 );
@@ -74,7 +74,7 @@ public class CrawlerRevisitor {
 
             // If the reference document is empty or the HTTP status is not 200, we'll skip it since it's
             // unlikely to produce anything meaningful for us.
-            if (doc.httpStatus != 200)
+            if (doc.httpStatus != 200 && doc.httpStatus != 206)
                 continue;
             if (!doc.hasBody())
                 continue;
@@ -58,7 +58,7 @@ public record DocumentWithReference(
         if (null == doc)
             return ContentTags.empty();
 
-        if (doc.documentBodyBytes.length == 0 || doc.httpStatus != 200)
+        if (doc.documentBodyBytes.length == 0 || (doc.httpStatus != 200 && doc.httpStatus != 206))
            return ContentTags.empty();
 
         String lastmod = doc.getLastModified();
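Both hunks above relax the same check: 206 Partial Content is now accepted alongside 200 when deciding whether a stored document is usable for revisits and conditional fetching (plausibly because range-limited fetches of large documents yield 206; the diff itself does not say). Factored out, the predicate both call sites now encode is simply:

    // Conceptual factoring of the repeated condition; not code from the diff.
    static boolean hasUsableStatus(CrawledDocument doc) {
        return doc.httpStatus == 200 || doc.httpStatus == 206;
    }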
Some files were not shown because too many files have changed in this diff