mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
205 Commits
deploy-009
...
deploy-020
Author | SHA1 | Date | |
---|---|---|---|
|
abe9da0fc6 | ||
|
56d0128b0a | ||
|
840b68ac55 | ||
|
c34ff6d6c3 | ||
|
32780967d8 | ||
|
7330bc489d | ||
|
ea23f33738 | ||
|
4a8a028118 | ||
|
a25bc647be | ||
|
a720dba3a2 | ||
|
284f382867 | ||
|
a80717f138 | ||
|
d6da715fa4 | ||
|
c1ec7aa491 | ||
|
3daf37e283 | ||
|
44a774d3a8 | ||
|
597aeaf496 | ||
|
06df7892c2 | ||
|
dc26854268 | ||
|
9f16326cba | ||
|
ed66d0b3a7 | ||
|
c3afc82dad | ||
|
08e25e539e | ||
|
4946044dd0 | ||
|
edf382e1c5 | ||
|
644cba32e4 | ||
|
34b76390b2 | ||
|
43cd507971 | ||
|
cc40e99fdc | ||
|
8a944cf4c6 | ||
|
1c128e6d82 | ||
|
be039d1a8c | ||
|
4edc0d3267 | ||
|
890f521d0d | ||
|
b1814a30f7 | ||
|
f59a9eb025 | ||
|
599534806b | ||
|
7e8253dac7 | ||
|
97a6780ea3 | ||
|
eb634beec8 | ||
|
269ebd1654 | ||
|
39ce40bfeb | ||
|
c187b2e1c1 | ||
|
42eaa4588b | ||
|
4f40a5fbeb | ||
|
3f3d42bc01 | ||
|
61c8d53e1b | ||
|
a7a3d85be9 | ||
|
306232fb54 | ||
|
5aef844f0d | ||
|
d56b5c828a | ||
|
ab58a4636f | ||
|
00be269238 | ||
|
879e6a9424 | ||
|
fba3455732 | ||
|
14283da7f5 | ||
|
93df4d1fc0 | ||
|
b12a0b998c | ||
|
3b6f4e321b | ||
|
8428111771 | ||
|
e9fd4415ef | ||
|
4c95c3dcad | ||
|
c5281536fb | ||
|
4431dae7ac | ||
|
4df4d0a7a8 | ||
|
9f05083b94 | ||
|
fc92e9b9c0 | ||
|
328fb5d927 | ||
|
36889950e8 | ||
|
c96a94878b | ||
|
1c57d7d73a | ||
|
a443d22356 | ||
|
aa59d4afa4 | ||
|
df0f18d0e7 | ||
|
0819d46f97 | ||
|
5e2b63473e | ||
|
f9590703f1 | ||
|
f12fc11337 | ||
|
c309030184 | ||
|
fd5af01629 | ||
|
d4c43c7a79 | ||
|
18700e1919 | ||
|
120b431998 | ||
|
71dad99326 | ||
|
c1e8afdf86 | ||
|
fa32dddc24 | ||
|
a266fcbf30 | ||
|
6e47e58e0e | ||
|
9dc43d8b4a | ||
|
83967e3305 | ||
|
4db980a291 | ||
|
089b177868 | ||
|
9c8e9a68d5 | ||
|
413d5cc788 | ||
|
58539b92ac | ||
|
fe72f16df1 | ||
|
b49a244a2e | ||
|
3f0b4c010f | ||
|
c6e0cd93f7 | ||
|
80a7ccb080 | ||
|
54dec347c4 | ||
|
d6ee3f0785 | ||
|
8be88afcf3 | ||
|
0e3c00d3e1 | ||
|
4279a7f1aa | ||
|
251006d4f9 | ||
|
c3e99dc12a | ||
|
aaaa2de022 | ||
|
fc1388422a | ||
|
b07080db16 | ||
|
e9d86dca4a | ||
|
1d693f0efa | ||
|
5874a163dc | ||
|
5ec7a1deab | ||
|
7fea2808ed | ||
|
8da74484f0 | ||
|
923d5a7234 | ||
|
58f88749b8 | ||
|
77f727a5ba | ||
|
667cfb53dc | ||
|
fe36d4ed20 | ||
|
acf4bef98d | ||
|
2a737c34bb | ||
|
90a577af82 | ||
|
f0c9b935d8 | ||
|
7b5493dd51 | ||
|
c246a59158 | ||
|
0b99781d24 | ||
|
39db9620c1 | ||
|
1781599363 | ||
|
6b2d18fb9b | ||
|
59b1d200ab | ||
|
897010a2cf | ||
|
602af7a77e | ||
|
a7d91c8527 | ||
|
7151602124 | ||
|
884e33bd4a | ||
|
e84d5c497a | ||
|
2d2d3e2466 | ||
|
647dd9b12f | ||
|
de4e2849ce | ||
|
3c43f1954e | ||
|
fa2462ec39 | ||
|
f4ad7145db | ||
|
068b450180 | ||
|
05b909a21f | ||
|
3d179cddce | ||
|
1a2aae496a | ||
|
353cdffb3f | ||
|
2e3f1313c7 | ||
|
58e6f141ce | ||
|
500f63e921 | ||
|
6dfbedda1e | ||
|
9715ddb105 | ||
|
1fc6313a77 | ||
|
b1249d5b8a | ||
|
ef95d59b07 | ||
|
acdd8664f5 | ||
|
6b12eac58a | ||
|
bb3f1f395a | ||
|
b661beef41 | ||
|
9888c47f19 | ||
|
dcef7e955b | ||
|
b3973a1dd7 | ||
|
8bd05d6d90 | ||
|
59df8e356e | ||
|
7161162a35 | ||
|
d7c4c5141f | ||
|
88e9b8fb05 | ||
|
b6265cee11 | ||
|
c91af247e9 | ||
|
7a31227de1 | ||
|
4f477604c5 | ||
|
2970f4395b | ||
|
d1ec909b36 | ||
|
c67c5bbf42 | ||
|
ecb0e57a1a | ||
|
8c61f61b46 | ||
|
662a18c933 | ||
|
1c2426a052 | ||
|
34df7441ac | ||
|
5387e2bd80 | ||
|
0f3b24d0f8 | ||
|
a732095d2a | ||
|
6607f0112f | ||
|
4913730de9 | ||
|
1db64f9d56 | ||
|
4dcff14498 | ||
|
426658f64e | ||
|
2181b22f05 | ||
|
42bd79a609 | ||
|
b91c1e528a | ||
|
b1130d7a04 | ||
|
8364bcdc97 | ||
|
626cab5fab | ||
|
cfd4712191 | ||
|
9f18ced73d | ||
|
18e91269ab | ||
|
e315ca5758 | ||
|
3ceea17c1d | ||
|
b34527c1a3 | ||
|
185bf28fca | ||
|
78cc25584a | ||
|
62ba30bacf | ||
|
3bb84eb206 |
@@ -5,7 +5,7 @@ plugins {
|
|||||||
|
|
||||||
// This is a workaround for a bug in the Jib plugin that causes it to stall randomly
|
// This is a workaround for a bug in the Jib plugin that causes it to stall randomly
|
||||||
// https://github.com/GoogleContainerTools/jib/issues/3347
|
// https://github.com/GoogleContainerTools/jib/issues/3347
|
||||||
id 'com.google.cloud.tools.jib' version '3.4.4' apply(false)
|
id 'com.google.cloud.tools.jib' version '3.4.5' apply(false)
|
||||||
}
|
}
|
||||||
|
|
||||||
group 'marginalia'
|
group 'marginalia'
|
||||||
@@ -43,12 +43,11 @@ subprojects.forEach {it ->
|
|||||||
}
|
}
|
||||||
|
|
||||||
ext {
|
ext {
|
||||||
jvmVersion=23
|
jvmVersion = 24
|
||||||
dockerImageBase='container-registry.oracle.com/graalvm/jdk:23'
|
dockerImageBase='container-registry.oracle.com/graalvm/jdk:24'
|
||||||
dockerImageTag='latest'
|
dockerImageTag='latest'
|
||||||
dockerImageRegistry='marginalia'
|
dockerImageRegistry='marginalia'
|
||||||
jibVersion = '3.4.4'
|
jibVersion = '3.4.5'
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
idea {
|
idea {
|
||||||
|
@@ -22,6 +22,7 @@ public class DbDomainQueries {
|
|||||||
private static final Logger logger = LoggerFactory.getLogger(DbDomainQueries.class);
|
private static final Logger logger = LoggerFactory.getLogger(DbDomainQueries.class);
|
||||||
|
|
||||||
private final Cache<EdgeDomain, Integer> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
private final Cache<EdgeDomain, Integer> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||||
|
private final Cache<EdgeDomain, DomainIdWithNode> domainWithNodeCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||||
private final Cache<Integer, EdgeDomain> domainNameCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
private final Cache<Integer, EdgeDomain> domainNameCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||||
private final Cache<String, List<DomainWithNode>> siblingsCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
private final Cache<String, List<DomainWithNode>> siblingsCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||||
|
|
||||||
@@ -59,6 +60,34 @@ public class DbDomainQueries {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public DomainIdWithNode getDomainIdWithNode(EdgeDomain domain) throws NoSuchElementException {
|
||||||
|
try {
|
||||||
|
return domainWithNodeCache.get(domain, () -> {
|
||||||
|
try (var connection = dataSource.getConnection();
|
||||||
|
var stmt = connection.prepareStatement("SELECT ID, NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||||
|
|
||||||
|
stmt.setString(1, domain.toString());
|
||||||
|
var rsp = stmt.executeQuery();
|
||||||
|
if (rsp.next()) {
|
||||||
|
return new DomainIdWithNode(rsp.getInt(1), rsp.getInt(2));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
throw new RuntimeException(ex);
|
||||||
|
}
|
||||||
|
|
||||||
|
throw new NoSuchElementException();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
catch (UncheckedExecutionException ex) {
|
||||||
|
throw new NoSuchElementException();
|
||||||
|
}
|
||||||
|
catch (ExecutionException ex) {
|
||||||
|
throw new RuntimeException(ex.getCause());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public OptionalInt tryGetDomainId(EdgeDomain domain) {
|
public OptionalInt tryGetDomainId(EdgeDomain domain) {
|
||||||
|
|
||||||
Integer maybeId = domainIdCache.getIfPresent(domain);
|
Integer maybeId = domainIdCache.getIfPresent(domain);
|
||||||
@@ -145,4 +174,6 @@ public class DbDomainQueries {
|
|||||||
return nodeAffinity > 0;
|
return nodeAffinity > 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public record DomainIdWithNode (int domainId, int nodeAffinity) { }
|
||||||
}
|
}
|
||||||
|
@@ -0,0 +1,24 @@
|
|||||||
|
package nu.marginalia.model;
|
||||||
|
|
||||||
|
public enum DocumentFormat {
|
||||||
|
PLAIN(0, 1, "text"),
|
||||||
|
PDF(0, 1, "pdf"),
|
||||||
|
UNKNOWN(0, 1, "???"),
|
||||||
|
HTML123(0, 1, "html"),
|
||||||
|
HTML4(-0.1, 1.05, "html"),
|
||||||
|
XHTML(-0.1, 1.05, "html"),
|
||||||
|
HTML5(0.5, 1.1, "html");
|
||||||
|
|
||||||
|
/** Used to tune quality score */
|
||||||
|
public final double offset;
|
||||||
|
/** Used to tune quality score */
|
||||||
|
public final double scale;
|
||||||
|
public final String shortFormat;
|
||||||
|
|
||||||
|
DocumentFormat(double offset, double scale, String shortFormat) {
|
||||||
|
this.offset = offset;
|
||||||
|
this.scale = scale;
|
||||||
|
this.shortFormat = shortFormat;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@@ -14,7 +14,7 @@ public class EdgeDomain implements Serializable {
|
|||||||
@Nonnull
|
@Nonnull
|
||||||
public final String topDomain;
|
public final String topDomain;
|
||||||
|
|
||||||
public EdgeDomain(String host) {
|
public EdgeDomain(@Nonnull String host) {
|
||||||
Objects.requireNonNull(host, "domain name must not be null");
|
Objects.requireNonNull(host, "domain name must not be null");
|
||||||
|
|
||||||
host = host.toLowerCase();
|
host = host.toLowerCase();
|
||||||
@@ -61,6 +61,10 @@ public class EdgeDomain implements Serializable {
|
|||||||
this.topDomain = topDomain;
|
this.topDomain = topDomain;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static String getTopDomain(String host) {
|
||||||
|
return new EdgeDomain(host).topDomain;
|
||||||
|
}
|
||||||
|
|
||||||
private boolean looksLikeGovTld(String host) {
|
private boolean looksLikeGovTld(String host) {
|
||||||
if (host.length() < 8)
|
if (host.length() < 8)
|
||||||
return false;
|
return false;
|
||||||
@@ -108,32 +112,6 @@ public class EdgeDomain implements Serializable {
|
|||||||
return topDomain;
|
return topDomain;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getDomainKey() {
|
|
||||||
int cutPoint = topDomain.indexOf('.');
|
|
||||||
if (cutPoint < 0) {
|
|
||||||
return topDomain;
|
|
||||||
}
|
|
||||||
return topDomain.substring(0, cutPoint).toLowerCase();
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getLongDomainKey() {
|
|
||||||
StringBuilder ret = new StringBuilder();
|
|
||||||
|
|
||||||
int cutPoint = topDomain.indexOf('.');
|
|
||||||
if (cutPoint < 0) {
|
|
||||||
ret.append(topDomain);
|
|
||||||
} else {
|
|
||||||
ret.append(topDomain, 0, cutPoint);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!subDomain.isEmpty() && !"www".equals(subDomain)) {
|
|
||||||
ret.append(":");
|
|
||||||
ret.append(subDomain);
|
|
||||||
}
|
|
||||||
|
|
||||||
return ret.toString().toLowerCase();
|
|
||||||
}
|
|
||||||
|
|
||||||
/** If possible, try to provide an alias domain,
|
/** If possible, try to provide an alias domain,
|
||||||
* i.e. a domain name that is very likely to link to this one
|
* i.e. a domain name that is very likely to link to this one
|
||||||
* */
|
* */
|
||||||
|
@@ -1,16 +1,14 @@
|
|||||||
package nu.marginalia.model;
|
package nu.marginalia.model;
|
||||||
|
|
||||||
import nu.marginalia.util.QueryParams;
|
import nu.marginalia.util.QueryParams;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import javax.annotation.Nullable;
|
import javax.annotation.Nullable;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.net.MalformedURLException;
|
import java.net.*;
|
||||||
import java.net.URI;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.net.URISyntaxException;
|
|
||||||
import java.net.URL;
|
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.regex.Pattern;
|
|
||||||
|
|
||||||
public class EdgeUrl implements Serializable {
|
public class EdgeUrl implements Serializable {
|
||||||
public final String proto;
|
public final String proto;
|
||||||
@@ -33,7 +31,7 @@ public class EdgeUrl implements Serializable {
|
|||||||
|
|
||||||
private static URI parseURI(String url) throws URISyntaxException {
|
private static URI parseURI(String url) throws URISyntaxException {
|
||||||
try {
|
try {
|
||||||
return new URI(urlencodeFixer(url));
|
return EdgeUriFactory.parseURILenient(url);
|
||||||
} catch (URISyntaxException ex) {
|
} catch (URISyntaxException ex) {
|
||||||
throw new URISyntaxException("Failed to parse URI '" + url + "'", ex.getMessage());
|
throw new URISyntaxException("Failed to parse URI '" + url + "'", ex.getMessage());
|
||||||
}
|
}
|
||||||
@@ -51,58 +49,6 @@ public class EdgeUrl implements Serializable {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Pattern badCharPattern = Pattern.compile("[ \t\n\"<>\\[\\]()',|]");
|
|
||||||
|
|
||||||
/* Java's URI parser is a bit too strict in throwing exceptions when there's an error.
|
|
||||||
|
|
||||||
Here on the Internet, standards are like the picture on the box of the frozen pizza,
|
|
||||||
and what you get is more like what's on the inside, we try to patch things instead,
|
|
||||||
just give it a best-effort attempt att cleaning out broken or unnecessary constructions
|
|
||||||
like bad or missing URLEncoding
|
|
||||||
*/
|
|
||||||
public static String urlencodeFixer(String url) throws URISyntaxException {
|
|
||||||
var s = new StringBuilder();
|
|
||||||
String goodChars = "&.?:/-;+$#";
|
|
||||||
String hexChars = "0123456789abcdefABCDEF";
|
|
||||||
|
|
||||||
int pathIdx = findPathIdx(url);
|
|
||||||
if (pathIdx < 0) { // url looks like http://marginalia.nu
|
|
||||||
return url + "/";
|
|
||||||
}
|
|
||||||
s.append(url, 0, pathIdx);
|
|
||||||
|
|
||||||
// We don't want the fragment, and multiple fragments breaks the Java URIParser for some reason
|
|
||||||
int end = url.indexOf("#");
|
|
||||||
if (end < 0) end = url.length();
|
|
||||||
|
|
||||||
for (int i = pathIdx; i < end; i++) {
|
|
||||||
int c = url.charAt(i);
|
|
||||||
|
|
||||||
if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
|
|
||||||
s.appendCodePoint(c);
|
|
||||||
} else if (c == '%' && i + 2 < end) {
|
|
||||||
int cn = url.charAt(i + 1);
|
|
||||||
int cnn = url.charAt(i + 2);
|
|
||||||
if (hexChars.indexOf(cn) >= 0 && hexChars.indexOf(cnn) >= 0) {
|
|
||||||
s.appendCodePoint(c);
|
|
||||||
} else {
|
|
||||||
s.append("%25");
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
s.append(String.format("%%%02X", c));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return s.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static int findPathIdx(String url) throws URISyntaxException {
|
|
||||||
int colonIdx = url.indexOf(':');
|
|
||||||
if (colonIdx < 0 || colonIdx + 2 >= url.length()) {
|
|
||||||
throw new URISyntaxException(url, "Lacking protocol");
|
|
||||||
}
|
|
||||||
return url.indexOf('/', colonIdx + 2);
|
|
||||||
}
|
|
||||||
|
|
||||||
public EdgeUrl(URI URI) {
|
public EdgeUrl(URI URI) {
|
||||||
try {
|
try {
|
||||||
@@ -166,11 +112,32 @@ public class EdgeUrl implements Serializable {
|
|||||||
sb.append(port);
|
sb.append(port);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
EdgeUriFactory.urlencodePath(sb, path);
|
||||||
|
|
||||||
|
if (param != null) {
|
||||||
|
EdgeUriFactory.urlencodeQuery(sb, param);
|
||||||
|
}
|
||||||
|
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public String toDisplayString() {
|
||||||
|
StringBuilder sb = new StringBuilder(256);
|
||||||
|
|
||||||
|
sb.append(proto);
|
||||||
|
sb.append("://");
|
||||||
|
sb.append(domain);
|
||||||
|
|
||||||
|
if (port != null) {
|
||||||
|
sb.append(':');
|
||||||
|
sb.append(port);
|
||||||
|
}
|
||||||
|
|
||||||
sb.append(path);
|
sb.append(path);
|
||||||
|
|
||||||
if (param != null) {
|
if (param != null) {
|
||||||
sb.append('?');
|
sb.append('?').append(param);
|
||||||
sb.append(param);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return sb.toString();
|
return sb.toString();
|
||||||
@@ -247,3 +214,244 @@ public class EdgeUrl implements Serializable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Lenient URI parsing and URL-encoding repair helpers.
 * <p></p>
 * Java's URI parser rejects many URLs that are common in the wild, so the
 * helpers here patch up missing or broken URL encoding before handing the
 * string to {@link URI}.
 */
class EdgeUriFactory {

    /** Parse a URL into a URI, repairing the URL encoding of the path and query where needed.
     * <p></p>
     * The fragment is always discarded.  A URL without a path gets the path "/".
     * Empty path segments and empty query parameters are dropped.
     *
     * @param url the URL to parse, must not be null
     * @return the parsed URI, without fragment
     * @throws URISyntaxException if the URL lacks a scheme, or is still unparseable after repair
     */
    public static URI parseURILenient(String url) throws URISyntaxException {

        // We don't want the fragment, and multiple fragments break the Java URI parser.
        // Dropping it up front makes the fast path and the repairing path agree.
        int fragmentIdx = url.indexOf('#');
        String body = fragmentIdx < 0 ? url : url.substring(0, fragmentIdx);

        int pathIdx = findPathIdx(body);
        int queryIdx = body.indexOf('?');
        int pathEnd = queryIdx < 0 ? body.length() : queryIdx;

        // A slash found beyond the '?' belongs to the query, not to a path
        boolean hasPath = pathIdx >= 0 && pathIdx < pathEnd;

        if (hasPath && shouldOmitUrlencodeRepair(body)) {
            try {
                return new URI(body);
            }
            catch (URISyntaxException ignored) {
                // ignore and run the lenient parser
            }
        }

        var s = new StringBuilder(body.length() + 8);

        if (hasPath) {
            s.append(body, 0, pathIdx);
            urlencodePath(s, body.substring(pathIdx, pathEnd));
        }
        else { // url looks like http://marginalia.nu
            s.append(body, 0, pathEnd).append('/');
        }

        if (queryIdx >= 0) {
            urlencodeQuery(s, body.substring(queryIdx + 1));
        }

        return new URI(s.toString());
    }

    /** Break apart the path element of an URI into its components, and then
     * urlencode any component that needs it, and recombine it into a single
     * path element again.
     * <p></p>
     * Components that are already validly encoded are appended untouched, so
     * existing %XX escapes are not encoded a second time.  Empty components
     * are dropped.  Nothing is appended for a null or empty path.
     *
     * @param sb   builder the encoded path is appended to
     * @param path raw path, may be null
     */
    public static void urlencodePath(StringBuilder sb, String path) {
        if (path == null || path.isEmpty()) {
            return;
        }

        boolean anySegment = false;

        for (String pathPart : path.split("/")) {
            if (pathPart.isEmpty()) continue;

            anySegment = true;
            sb.append('/');

            if (needsUrlEncode(pathPart)) {
                // URLEncoder is a form encoder and writes spaces as '+',
                // in a path they need to be %20
                sb.append(URLEncoder.encode(pathPart, StandardCharsets.UTF_8).replace("+", "%20"));
            } else {
                sb.append(pathPart);
            }
        }

        // A path of nothing but slashes collapses to a single slash
        if (!anySegment || path.endsWith("/")) {
            sb.append('/');
        }
    }

    /** Break apart the query element of a URI into its components, and then
     * urlencode any component that needs it, and recombine it into a single
     * query element again.
     * <p></p>
     * The leading '?' is appended by this method.  Keys and values are tested
     * and encoded independently, so a valid %XX escape in one of them is not
     * encoded again.  Empty parameters are dropped.  Nothing is appended for a
     * null or empty query, or one consisting only of '&amp;'.
     *
     * @param sb    builder the encoded query is appended to
     * @param param raw query without the leading '?', may be null
     */
    public static void urlencodeQuery(StringBuilder sb, String param) {
        if (param == null || param.isEmpty()) {
            return;
        }

        char separator = '?';

        for (String queryPart : param.split("&")) {
            if (queryPart.isEmpty()) continue;

            sb.append(separator);
            separator = '&';

            int idx = queryPart.indexOf('=');
            if (idx < 0) {
                appendQueryElement(sb, queryPart);
            } else {
                appendQueryElement(sb, queryPart.substring(0, idx));
                sb.append('=');
                appendQueryElement(sb, queryPart.substring(idx + 1));
            }
        }
    }

    /** Append a query key or value, urlencoding it only if it needs it */
    private static void appendQueryElement(StringBuilder sb, String element) {
        if (needsUrlEncode(element)) {
            sb.append(URLEncoder.encode(element, StandardCharsets.UTF_8));
        } else {
            sb.append(element);
        }
    }

    /** Test if the url element needs URL encoding.
     * <p></p>
     * Note we may have been given an already encoded path element,
     * so we include % and + in the list of good characters
     */
    static boolean needsUrlEncode(String urlElement) {
        for (int i = 0; i < urlElement.length(); i++) {
            char c = urlElement.charAt(i);

            if (isUrlSafe(c)) continue;
            if (c == '+') continue;
            if (c == '%' && i + 2 < urlElement.length()) {
                char c1 = urlElement.charAt(i + 1);
                char c2 = urlElement.charAt(i + 2);
                if (isHexDigit(c1) && isHexDigit(c2)) {
                    i += 2;
                    continue;
                }
            }

            return true;
        }

        return false;
    }

    /** Test if the character is an ASCII letter, digit, or one of - _ . ~ */
    static boolean isUrlSafe(int c) {
        if (c >= 'a' && c <= 'z') return true;
        if (c >= 'A' && c <= 'Z') return true;
        if (c >= '0' && c <= '9') return true;
        if (c == '-' || c == '_' || c == '.' || c == '~') return true;

        return false;
    }

    /** Test if the URL is a valid URL that does not need to be
     * urlencoded.
     * <p></p>
     * This is a very simple heuristic test that does not guarantee
     * that the URL is valid, but it will identify cases where we
     * are fairly certain that the URL does not need encoding,
     * so we can skip a bunch of allocations and string operations
     * that would otherwise be needed to fix the URL.
     * <p></p>
     * URLs with empty path segments, empty query parameters, or more than
     * one '=' in a query parameter are rejected, since the repair step
     * rewrites those and skipping it would give a different result.
     * Anything after a '#' is not inspected.
     */
    static boolean shouldOmitUrlencodeRepair(String url) {
        final int len = url.length();
        int idx = 0;

        // Validate the scheme
        while (idx < len && isAsciiAlphabetic(url.charAt(idx))) {
            idx++;
        }
        if (idx == 0 || !url.startsWith("://", idx)) return false;
        idx += 3;

        // Validate the authority
        while (idx < len) {
            char c = url.charAt(idx++);
            if (c == '/') break;
            if (c == ':') continue;
            if (c == '@') continue;
            if (!isUrlSafe(c)) return false;
        }

        // Validate the path
        boolean hasQuery = false;
        char prev = '/';
        while (idx < len) {
            char c = url.charAt(idx++);
            if (c == '#') return true;
            if (c == '?') {
                hasQuery = true;
                break;
            }
            if (c == '/') {
                if (prev == '/') return false; // empty segment, the repair step drops these
            }
            else if (!isUrlSafe(c)) return false;
            prev = c;
        }

        if (!hasQuery) return true;

        // Validate the query; prev starts as '&' so that an empty query
        // or a leading '&' counts as an empty parameter
        prev = '&';
        boolean sawEquals = false;
        while (idx < len) {
            char c = url.charAt(idx++);
            if (c == '#') break;
            if (c == '&') {
                if (prev == '&') return false; // empty parameter, the repair step drops these
                sawEquals = false;
            }
            else if (c == '=') {
                if (sawEquals) return false; // a second '=' is encoded by the repair step
                sawEquals = true;
            }
            else if (!isUrlSafe(c)) return false;
            prev = c;
        }

        return prev != '&';
    }

    private static boolean isAsciiAlphabetic(int c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }

    private static boolean isHexDigit(int c) {
        return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
    }

    /** Find the index of the path element in a URL.
     * <p></p>
     * The search for the first slash starts three characters past the first colon,
     * i.e. past the "://" of a hierarchical URL.
     *
     * @return index of that slash, or -1 if there is none
     * @throws URISyntaxException if there is no colon, or nothing follows the "://"
     */
    private static int findPathIdx(String url) throws URISyntaxException {
        int colonIdx = url.indexOf(':');
        if (colonIdx < 0 || colonIdx + 3 >= url.length()) {
            throw new URISyntaxException(url, "Lacking scheme");
        }
        return url.indexOf('/', colonIdx + 3);
    }

}
|
@@ -28,6 +28,8 @@ public enum HtmlFeature {
|
|||||||
|
|
||||||
GA_SPAM("special:gaspam"),
|
GA_SPAM("special:gaspam"),
|
||||||
|
|
||||||
|
PDF("format:pdf"),
|
||||||
|
|
||||||
/** For fingerprinting and ranking */
|
/** For fingerprinting and ranking */
|
||||||
OPENGRAPH("special:opengraph"),
|
OPENGRAPH("special:opengraph"),
|
||||||
OPENGRAPH_IMAGE("special:opengraph:image"),
|
OPENGRAPH_IMAGE("special:opengraph:image"),
|
||||||
|
@@ -1,22 +0,0 @@
|
|||||||
package nu.marginalia.model.html;
|
|
||||||
|
|
||||||
// This class really doesn't belong anywhere, but will squat here for now
|
|
||||||
public enum HtmlStandard {
|
|
||||||
PLAIN(0, 1),
|
|
||||||
UNKNOWN(0, 1),
|
|
||||||
HTML123(0, 1),
|
|
||||||
HTML4(-0.1, 1.05),
|
|
||||||
XHTML(-0.1, 1.05),
|
|
||||||
HTML5(0.5, 1.1);
|
|
||||||
|
|
||||||
/** Used to tune quality score */
|
|
||||||
public final double offset;
|
|
||||||
/** Used to tune quality score */
|
|
||||||
public final double scale;
|
|
||||||
|
|
||||||
HtmlStandard(double offset, double scale) {
|
|
||||||
this.offset = offset;
|
|
||||||
this.scale = scale;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@@ -9,7 +9,7 @@ public enum DocumentFlags {
|
|||||||
GeneratorForum,
|
GeneratorForum,
|
||||||
GeneratorWiki,
|
GeneratorWiki,
|
||||||
Sideloaded,
|
Sideloaded,
|
||||||
Unused7,
|
PdfFile,
|
||||||
Unused8,
|
Unused8,
|
||||||
;
|
;
|
||||||
|
|
||||||
|
@@ -8,14 +8,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
|
|||||||
|
|
||||||
class EdgeDomainTest {
|
class EdgeDomainTest {
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testSkepdic() throws URISyntaxException {
|
|
||||||
var domain = new EdgeUrl("http://www.skepdic.com/astrology.html");
|
|
||||||
assertEquals("skepdic", domain.getDomain().getDomainKey());
|
|
||||||
var domain2 = new EdgeUrl("http://skepdic.com/astrology.html");
|
|
||||||
assertEquals("skepdic", domain2.getDomain().getDomainKey());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testHkDomain() throws URISyntaxException {
|
public void testHkDomain() throws URISyntaxException {
|
||||||
var domain = new EdgeUrl("http://l7072i3.l7c.net");
|
var domain = new EdgeUrl("http://l7072i3.l7c.net");
|
||||||
|
@@ -1,6 +1,6 @@
|
|||||||
package nu.marginalia.model;
|
package nu.marginalia.model;
|
||||||
|
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import org.junit.jupiter.api.Assertions;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
@@ -21,25 +21,70 @@ class EdgeUrlTest {
|
|||||||
new EdgeUrl("https://memex.marginalia.nu/#here")
|
new EdgeUrl("https://memex.marginalia.nu/#here")
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testParam() throws URISyntaxException {
|
void testUriFromString() throws URISyntaxException {
|
||||||
System.out.println(new EdgeUrl("https://memex.marginalia.nu/index.php?id=1").toString());
|
// We test these URLs several times as we perform URLEncode-fixing both when parsing the URL and when
|
||||||
System.out.println(new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
|
// converting it back to a string, we want to ensure there is no changes along the way.
|
||||||
}
|
|
||||||
@Test
|
Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/").getPath());
|
||||||
void urlencodeFixer() throws URISyntaxException {
|
Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/").toString());
|
||||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/#heredoc"));
|
Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/").toString());
|
||||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign"));
|
|
||||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%22-sign"));
|
Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").getPath());
|
||||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/\n \"huh\""));
|
Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").toString());
|
||||||
|
Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/#heredoc").toString());
|
||||||
|
|
||||||
|
Assertions.assertEquals("/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").getPath());
|
||||||
|
Assertions.assertEquals("https://www.example.com/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").toString());
|
||||||
|
Assertions.assertEquals("https://www.example.com/trailingslash/", new EdgeUrl("https://www.example.com/trailingslash/").toString());
|
||||||
|
|
||||||
|
Assertions.assertEquals("/%-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").getPath());
|
||||||
|
Assertions.assertEquals("https://www.example.com/%25-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").toString());
|
||||||
|
Assertions.assertEquals("https://www.example.com/%25-sign", new EdgeUrl("https://www.example.com/%-sign").toString());
|
||||||
|
|
||||||
|
Assertions.assertEquals("/%-sign/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com//%-sign/\"-sign").getPath());
|
||||||
|
Assertions.assertEquals("https://www.example.com/%25-sign/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com//%-sign/\"-sign").toString());
|
||||||
|
Assertions.assertEquals("https://www.example.com/%25-sign/%22-sign", new EdgeUrl("https://www.example.com//%-sign/\"-sign").toString());
|
||||||
|
|
||||||
|
Assertions.assertEquals("/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").getPath());
|
||||||
|
Assertions.assertEquals("https://www.example.com/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").toString());
|
||||||
|
Assertions.assertEquals("https://www.example.com/%22-sign", new EdgeUrl("https://www.example.com/%22-sign").toString());
|
||||||
|
|
||||||
|
Assertions.assertEquals("/\n \"huh\"", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").getPath());
|
||||||
|
Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").toString());
|
||||||
|
Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", new EdgeUrl("https://www.example.com/\n \"huh\"").toString());
|
||||||
|
|
||||||
|
Assertions.assertEquals("/wiki/Sámi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").getPath());
|
||||||
|
Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").toString());
|
||||||
|
Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", new EdgeUrl("https://en.wikipedia.org/wiki/Sámi").toString());
|
||||||
|
|
||||||
|
Assertions.assertEquals("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k", new EdgeUrl("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k").toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void testParms() throws URISyntaxException {
|
void testParms() throws URISyntaxException {
|
||||||
System.out.println(new EdgeUrl("https://search.marginalia.nu/?id=123"));
|
Assertions.assertEquals("id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").param);
|
||||||
System.out.println(new EdgeUrl("https://search.marginalia.nu/?t=123"));
|
Assertions.assertEquals("https://search.marginalia.nu/?id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").toString());
|
||||||
System.out.println(new EdgeUrl("https://search.marginalia.nu/?v=123"));
|
|
||||||
System.out.println(new EdgeUrl("https://search.marginalia.nu/?m=123"));
|
Assertions.assertEquals("t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").param);
|
||||||
System.out.println(new EdgeUrl("https://search.marginalia.nu/?follow=123"));
|
Assertions.assertEquals("https://search.marginalia.nu/?t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").toString());
|
||||||
|
|
||||||
|
Assertions.assertEquals("v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").param);
|
||||||
|
Assertions.assertEquals("https://search.marginalia.nu/?v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").toString());
|
||||||
|
|
||||||
|
Assertions.assertEquals("id=1", new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").param);
|
||||||
|
Assertions.assertEquals("https://memex.marginalia.nu/showthread.php?id=1",
|
||||||
|
new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
|
||||||
|
|
||||||
|
|
||||||
|
Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").param);
|
||||||
|
Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").toString());
|
||||||
|
|
||||||
|
Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").param);
|
||||||
|
Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").toString());
|
||||||
|
|
||||||
|
Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?m=123").param);
|
||||||
|
Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?follow=123").param);
|
||||||
}
|
}
|
||||||
}
|
}
|
@@ -59,16 +59,13 @@ public class ProcessAdHocTaskHeartbeatImpl implements AutoCloseable, ProcessAdHo
|
|||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public void progress(String step, int stepProgress, int stepCount) {
|
public void progress(String step, int stepProgress, int stepCount) {
|
||||||
|
int lastProgress = this.progress;
|
||||||
this.step = step;
|
this.step = step;
|
||||||
|
|
||||||
|
|
||||||
// off by one since we calculate the progress based on the number of steps,
|
|
||||||
// and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
|
|
||||||
// final progress being 80% and not 100%)
|
|
||||||
|
|
||||||
this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);
|
this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);
|
||||||
|
|
||||||
logger.info("ProcessTask {} progress: {}%", taskBase, progress);
|
if (this.progress / 10 != lastProgress / 10) {
|
||||||
|
logger.info("ProcessTask {} progress: {}%", taskBase, progress);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Wrap a collection to provide heartbeat progress updates as it's iterated through */
|
/** Wrap a collection to provide heartbeat progress updates as it's iterated through */
|
||||||
|
@@ -57,16 +57,13 @@ public class ServiceAdHocTaskHeartbeatImpl implements AutoCloseable, ServiceAdHo
|
|||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public void progress(String step, int stepProgress, int stepCount) {
|
public void progress(String step, int stepProgress, int stepCount) {
|
||||||
|
int lastProgress = this.progress;
|
||||||
this.step = step;
|
this.step = step;
|
||||||
|
|
||||||
|
|
||||||
// off by one since we calculate the progress based on the number of steps,
|
|
||||||
// and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
|
|
||||||
// final progress being 80% and not 100%)
|
|
||||||
|
|
||||||
this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);
|
this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);
|
||||||
|
|
||||||
logger.info("ServiceTask {} progress: {}%", taskBase, progress);
|
if (this.progress / 10 != lastProgress / 10) {
|
||||||
|
logger.info("ServiceTask {} progress: {}%", taskBase, progress);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void shutDown() {
|
public void shutDown() {
|
||||||
|
@@ -121,6 +121,7 @@ public class ServiceConfigurationModule extends AbstractModule {
|
|||||||
|
|
||||||
while (nets.hasMoreElements()) {
|
while (nets.hasMoreElements()) {
|
||||||
NetworkInterface netif = nets.nextElement();
|
NetworkInterface netif = nets.nextElement();
|
||||||
|
logger.info("Considering network interface {}: Up? {}, Loopback? {}", netif.getDisplayName(), netif.isUp(), netif.isLoopback());
|
||||||
if (!netif.isUp() || netif.isLoopback()) {
|
if (!netif.isUp() || netif.isLoopback()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -128,6 +129,7 @@ public class ServiceConfigurationModule extends AbstractModule {
|
|||||||
Enumeration<InetAddress> inetAddresses = netif.getInetAddresses();
|
Enumeration<InetAddress> inetAddresses = netif.getInetAddresses();
|
||||||
while (inetAddresses.hasMoreElements()) {
|
while (inetAddresses.hasMoreElements()) {
|
||||||
InetAddress addr = inetAddresses.nextElement();
|
InetAddress addr = inetAddresses.nextElement();
|
||||||
|
logger.info("Considering address {}: SiteLocal? {}, Loopback? {}", addr.getHostAddress(), addr.isSiteLocalAddress(), addr.isLoopbackAddress());
|
||||||
if (addr.isSiteLocalAddress() && !addr.isLoopbackAddress()) {
|
if (addr.isSiteLocalAddress() && !addr.isLoopbackAddress()) {
|
||||||
return addr.getHostAddress();
|
return addr.getHostAddress();
|
||||||
}
|
}
|
||||||
|
@@ -122,6 +122,11 @@ public class JoobyService {
|
|||||||
// single digit percentage difference since HTML already compresses very well with level = 1.
|
// single digit percentage difference since HTML already compresses very well with level = 1.
|
||||||
options.setCompressionLevel(1);
|
options.setCompressionLevel(1);
|
||||||
|
|
||||||
|
// Set a cap on the number of worker threads, as Jooby's default value does not seem to consider
|
||||||
|
// multi-tenant servers with high thread counts, and spins up an exorbitant number of threads in that
|
||||||
|
// scenario
|
||||||
|
options.setWorkerThreads(Math.min(128, options.getWorkerThreads()));
|
||||||
|
|
||||||
|
|
||||||
jooby.setServerOptions(options);
|
jooby.setServerOptions(options);
|
||||||
|
|
||||||
|
@@ -13,7 +13,7 @@ import java.net.InetSocketAddress;
|
|||||||
|
|
||||||
public class MetricsServer {
|
public class MetricsServer {
|
||||||
|
|
||||||
private static Logger logger = LoggerFactory.getLogger(MetricsServer.class);
|
private static final Logger logger = LoggerFactory.getLogger(MetricsServer.class);
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public MetricsServer(ServiceConfiguration configuration) {
|
public MetricsServer(ServiceConfiguration configuration) {
|
||||||
@@ -30,6 +30,8 @@ public class MetricsServer {
|
|||||||
|
|
||||||
context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");
|
context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");
|
||||||
|
|
||||||
|
logger.info("MetricsServer listening on {}:{}", configuration.bindAddress(), configuration.metricsPort());
|
||||||
|
|
||||||
server.start();
|
server.start();
|
||||||
}
|
}
|
||||||
catch (Exception|NoSuchMethodError ex) {
|
catch (Exception|NoSuchMethodError ex) {
|
||||||
|
@@ -35,21 +35,8 @@ public class RateLimiter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static RateLimiter forExpensiveRequest() {
|
|
||||||
return new RateLimiter(5, 10);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static RateLimiter custom(int perMinute) {
|
public static RateLimiter custom(int perMinute) {
|
||||||
return new RateLimiter(perMinute, 60);
|
return new RateLimiter(4 * perMinute, perMinute);
|
||||||
}
|
|
||||||
|
|
||||||
public static RateLimiter forSpamBots() {
|
|
||||||
return new RateLimiter(120, 3600);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public static RateLimiter forLogin() {
|
|
||||||
return new RateLimiter(3, 15);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void cleanIdleBuckets() {
|
private void cleanIdleBuckets() {
|
||||||
@@ -62,7 +49,7 @@ public class RateLimiter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private Bucket createBucket() {
|
private Bucket createBucket() {
|
||||||
var refill = Refill.greedy(1, Duration.ofSeconds(refillRate));
|
var refill = Refill.greedy(refillRate, Duration.ofSeconds(60));
|
||||||
var bw = Bandwidth.classic(capacity, refill);
|
var bw = Bandwidth.classic(capacity, refill);
|
||||||
return Bucket.builder().addLimit(bw).build();
|
return Bucket.builder().addLimit(bw).build();
|
||||||
}
|
}
|
||||||
|
@@ -3,8 +3,16 @@
|
|||||||
<Console name="Console" target="SYSTEM_OUT">
|
<Console name="Console" target="SYSTEM_OUT">
|
||||||
<PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
|
<PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
|
||||||
<Filters>
|
<Filters>
|
||||||
|
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
</Filters>
|
||||||
|
</Console>
|
||||||
|
<Console name="ProcessConsole" target="SYSTEM_OUT">
|
||||||
|
<PatternLayout pattern="%style{P}{FG_Cyan} %msg%n"/>
|
||||||
|
<Filters>
|
||||||
|
<MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
|
||||||
</Filters>
|
</Filters>
|
||||||
</Console>
|
</Console>
|
||||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||||
@@ -13,15 +21,29 @@
|
|||||||
<Filters>
|
<Filters>
|
||||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
</Filters>
|
</Filters>
|
||||||
<SizeBasedTriggeringPolicy size="10MB" />
|
<SizeBasedTriggeringPolicy size="10MB" />
|
||||||
</RollingFile>
|
</RollingFile>
|
||||||
|
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||||
|
ignoreExceptions="false">
|
||||||
|
<PatternLayout>
|
||||||
|
<Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
|
||||||
|
</PatternLayout>
|
||||||
|
<SizeBasedTriggeringPolicy size="100MB" />
|
||||||
|
<Filters>
|
||||||
|
<MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
|
||||||
|
</Filters>
|
||||||
|
</RollingFile>
|
||||||
</Appenders>
|
</Appenders>
|
||||||
<Loggers>
|
<Loggers>
|
||||||
<Logger name="org.apache.zookeeper" level="WARN" />
|
<Logger name="org.apache.zookeeper" level="WARN" />
|
||||||
|
<Logger name="org.apache.pdfbox" level="ERROR" />
|
||||||
|
<Logger name="org.apache.fontbox.ttf" level="ERROR" />
|
||||||
<Root level="info">
|
<Root level="info">
|
||||||
<AppenderRef ref="Console"/>
|
<AppenderRef ref="Console"/>
|
||||||
|
<AppenderRef ref="ProcessConsole"/>
|
||||||
<AppenderRef ref="LogToFile"/>
|
<AppenderRef ref="LogToFile"/>
|
||||||
</Root>
|
</Root>
|
||||||
</Loggers>
|
</Loggers>
|
||||||
|
@@ -1,10 +1,49 @@
|
|||||||
<Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
|
<Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
|
||||||
<Appenders>
|
<Appenders>
|
||||||
<Console name="Console" target="SYSTEM_OUT">
|
<Console name="ConsoleInfo" target="SYSTEM_OUT">
|
||||||
<PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
|
<PatternLayout pattern="- %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||||
<Filters>
|
<Filters>
|
||||||
|
<LevelMatchFilter level="INFO" onMatch="ALLOW" onMismatch="DENY"/>
|
||||||
|
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
</Filters>
|
||||||
|
</Console>
|
||||||
|
<Console name="ConsoleWarn" target="SYSTEM_OUT">
|
||||||
|
<PatternLayout pattern="⚠ %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||||
|
<Filters>
|
||||||
|
<LevelMatchFilter level="WARN" onMatch="ALLOW" onMismatch="DENY"/>
|
||||||
|
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
</Filters>
|
||||||
|
</Console>
|
||||||
|
<Console name="ConsoleError" target="SYSTEM_OUT">
|
||||||
|
<PatternLayout pattern="🔥 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||||
|
<Filters>
|
||||||
|
<LevelMatchFilter level="ERROR" onMatch="ALLOW" onMismatch="DENY"/>
|
||||||
|
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
</Filters>
|
||||||
|
</Console>
|
||||||
|
<Console name="ConsoleFatal" target="SYSTEM_OUT">
|
||||||
|
<PatternLayout pattern="💀 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||||
|
<Filters>
|
||||||
|
<LevelMatchFilter level="FATAL" onMatch="ALLOW" onMismatch="DENY"/>
|
||||||
|
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
</Filters>
|
||||||
|
</Console>
|
||||||
|
<Console name="ProcessConsole" target="SYSTEM_OUT">
|
||||||
|
<PatternLayout pattern="%style{%msg%n}{FG_Cyan}"/>
|
||||||
|
<Filters>
|
||||||
|
<MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
|
||||||
</Filters>
|
</Filters>
|
||||||
</Console>
|
</Console>
|
||||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||||
@@ -17,14 +56,30 @@
|
|||||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
</Filters>
|
||||||
|
</RollingFile>
|
||||||
|
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||||
|
ignoreExceptions="false">
|
||||||
|
<PatternLayout>
|
||||||
|
<Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
|
||||||
|
</PatternLayout>
|
||||||
|
<SizeBasedTriggeringPolicy size="100MB" />
|
||||||
|
<Filters>
|
||||||
|
<MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
|
||||||
</Filters>
|
</Filters>
|
||||||
</RollingFile>
|
</RollingFile>
|
||||||
</Appenders>
|
</Appenders>
|
||||||
<Loggers>
|
<Loggers>
|
||||||
<Logger name="org.apache.zookeeper" level="WARN" />
|
<Logger name="org.apache.zookeeper" level="WARN" />
|
||||||
|
<Logger name="org.apache.pdfbox" level="ERROR" />
|
||||||
|
<Logger name="org.apache.fontbox.ttf" level="ERROR" />
|
||||||
<Root level="info">
|
<Root level="info">
|
||||||
<AppenderRef ref="Console"/>
|
<AppenderRef ref="ConsoleInfo"/>
|
||||||
|
<AppenderRef ref="ConsoleWarn"/>
|
||||||
|
<AppenderRef ref="ConsoleError"/>
|
||||||
|
<AppenderRef ref="ConsoleFatal"/>
|
||||||
|
<AppenderRef ref="ProcessConsole"/>
|
||||||
<AppenderRef ref="LogToFile"/>
|
<AppenderRef ref="LogToFile"/>
|
||||||
</Root>
|
</Root>
|
||||||
</Loggers>
|
</Loggers>
|
||||||
|
@@ -1,15 +1,50 @@
|
|||||||
<Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
|
<Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
|
||||||
<Appenders>
|
<Appenders>
|
||||||
<Console name="Console" target="SYSTEM_OUT">
|
<Console name="ConsoleInfo" target="SYSTEM_OUT">
|
||||||
<PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
|
<PatternLayout pattern="- %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||||
|
<Filters>
|
||||||
|
<LevelMatchFilter level="INFO" onMatch="ALLOW" onMismatch="DENY"/>
|
||||||
|
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
</Filters>
|
||||||
|
</Console>
|
||||||
|
<Console name="ConsoleWarn" target="SYSTEM_OUT">
|
||||||
|
<PatternLayout pattern="⚠ %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||||
|
<Filters>
|
||||||
|
<LevelMatchFilter level="WARN" onMatch="ALLOW" onMismatch="DENY"/>
|
||||||
|
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
</Filters>
|
||||||
|
</Console>
|
||||||
|
<Console name="ConsoleError" target="SYSTEM_OUT">
|
||||||
|
<PatternLayout pattern="🔥 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||||
|
<Filters>
|
||||||
|
<LevelMatchFilter level="ERROR" onMatch="ALLOW" onMismatch="DENY"/>
|
||||||
|
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
</Filters>
|
||||||
|
</Console>
|
||||||
|
<Console name="ConsoleFatal" target="SYSTEM_OUT">
|
||||||
|
<PatternLayout pattern="💀 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||||
|
<Filters>
|
||||||
|
<LevelMatchFilter level="FATAL" onMatch="ALLOW" onMismatch="DENY"/>
|
||||||
|
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
</Filters>
|
||||||
|
</Console>
|
||||||
|
<Console name="ProcessConsole" target="SYSTEM_OUT">
|
||||||
|
<PatternLayout pattern="%style{%msg%n}{FG_Cyan}"/>
|
||||||
|
<Filters>
|
||||||
|
<MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
|
||||||
|
</Filters>
|
||||||
</Console>
|
</Console>
|
||||||
</Appenders>
|
</Appenders>
|
||||||
<Loggers>
|
<Loggers>
|
||||||
<Logger name="org.apache.zookeeper" level="WARN" />
|
<Logger name="org.apache.zookeeper" level="WARN" />
|
||||||
|
<Logger name="org.apache.pdfbox" level="ERROR" />
|
||||||
|
<Logger name="org.apache.fontbox.ttf" level="ERROR" />
|
||||||
<Root level="info">
|
<Root level="info">
|
||||||
<AppenderRef ref="Console"/>
|
<AppenderRef ref="ConsoleInfo"/>
|
||||||
<AppenderRef ref="LogToFile"/>
|
<AppenderRef ref="ConsoleWarn"/>
|
||||||
|
<AppenderRef ref="ConsoleError"/>
|
||||||
|
<AppenderRef ref="ConsoleFatal"/>
|
||||||
|
<AppenderRef ref="ProcessConsole"/>
|
||||||
</Root>
|
</Root>
|
||||||
</Loggers>
|
</Loggers>
|
||||||
</Configuration>
|
</Configuration>
|
@@ -25,7 +25,7 @@ import static org.mockito.Mockito.when;
|
|||||||
class ZkServiceRegistryTest {
|
class ZkServiceRegistryTest {
|
||||||
private static final int ZOOKEEPER_PORT = 2181;
|
private static final int ZOOKEEPER_PORT = 2181;
|
||||||
private static final GenericContainer<?> zookeeper =
|
private static final GenericContainer<?> zookeeper =
|
||||||
new GenericContainer<>("zookeeper:3.8.0")
|
new GenericContainer<>("zookeeper:3.8")
|
||||||
.withExposedPorts(ZOOKEEPER_PORT);
|
.withExposedPorts(ZOOKEEPER_PORT);
|
||||||
|
|
||||||
List<ZkServiceRegistry> registries = new ArrayList<>();
|
List<ZkServiceRegistry> registries = new ArrayList<>();
|
||||||
|
@@ -48,12 +48,13 @@ public class ExecutorExportClient {
|
|||||||
return msgId;
|
return msgId;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void exportSampleData(int node, FileStorageId fid, int size, String name) {
|
public void exportSampleData(int node, FileStorageId fid, int size, String ctFilter, String name) {
|
||||||
channelPool.call(ExecutorExportApiBlockingStub::exportSampleData)
|
channelPool.call(ExecutorExportApiBlockingStub::exportSampleData)
|
||||||
.forNode(node)
|
.forNode(node)
|
||||||
.run(RpcExportSampleData.newBuilder()
|
.run(RpcExportSampleData.newBuilder()
|
||||||
.setFileStorageId(fid.id())
|
.setFileStorageId(fid.id())
|
||||||
.setSize(size)
|
.setSize(size)
|
||||||
|
.setCtFilter(ctFilter)
|
||||||
.setName(name)
|
.setName(name)
|
||||||
.build());
|
.build());
|
||||||
}
|
}
|
||||||
|
@@ -100,6 +100,7 @@ message RpcExportSampleData {
|
|||||||
int64 fileStorageId = 1;
|
int64 fileStorageId = 1;
|
||||||
int32 size = 2;
|
int32 size = 2;
|
||||||
string name = 3;
|
string name = 3;
|
||||||
|
string ctFilter = 4;
|
||||||
}
|
}
|
||||||
message RpcDownloadSampleData {
|
message RpcDownloadSampleData {
|
||||||
string sampleSet = 1;
|
string sampleSet = 1;
|
||||||
|
@@ -8,6 +8,7 @@ import nu.marginalia.actor.state.ActorResumeBehavior;
|
|||||||
import nu.marginalia.actor.state.ActorStep;
|
import nu.marginalia.actor.state.ActorStep;
|
||||||
import nu.marginalia.actor.state.Resume;
|
import nu.marginalia.actor.state.Resume;
|
||||||
import nu.marginalia.service.control.ServiceEventLog;
|
import nu.marginalia.service.control.ServiceEventLog;
|
||||||
|
import nu.marginalia.service.control.ServiceHeartbeat;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import nu.marginalia.storage.model.FileStorage;
|
import nu.marginalia.storage.model.FileStorage;
|
||||||
import nu.marginalia.storage.model.FileStorageId;
|
import nu.marginalia.storage.model.FileStorageId;
|
||||||
@@ -19,6 +20,7 @@ import org.slf4j.Logger;
|
|||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
|
import java.net.HttpURLConnection;
|
||||||
import java.net.MalformedURLException;
|
import java.net.MalformedURLException;
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
import java.net.URL;
|
import java.net.URL;
|
||||||
@@ -32,6 +34,7 @@ public class DownloadSampleActor extends RecordActorPrototype {
|
|||||||
|
|
||||||
private final FileStorageService storageService;
|
private final FileStorageService storageService;
|
||||||
private final ServiceEventLog eventLog;
|
private final ServiceEventLog eventLog;
|
||||||
|
private final ServiceHeartbeat heartbeat;
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
@Resume(behavior = ActorResumeBehavior.ERROR)
|
@Resume(behavior = ActorResumeBehavior.ERROR)
|
||||||
@@ -66,15 +69,39 @@ public class DownloadSampleActor extends RecordActorPrototype {
|
|||||||
|
|
||||||
Files.deleteIfExists(Path.of(tarFileName));
|
Files.deleteIfExists(Path.of(tarFileName));
|
||||||
|
|
||||||
try (var is = new BufferedInputStream(new URI(downloadURI).toURL().openStream());
|
HttpURLConnection urlConnection = (HttpURLConnection) new URI(downloadURI).toURL().openConnection();
|
||||||
var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
|
|
||||||
is.transferTo(os);
|
try (var hb = heartbeat.createServiceAdHocTaskHeartbeat("Downloading sample")) {
|
||||||
|
long size = urlConnection.getContentLengthLong();
|
||||||
|
byte[] buffer = new byte[8192];
|
||||||
|
|
||||||
|
try (var is = new BufferedInputStream(urlConnection.getInputStream());
|
||||||
|
var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
|
||||||
|
long copiedSize = 0;
|
||||||
|
|
||||||
|
// getContentLengthLong() reports -1 when the server does not announce a length;
// in that case copy until end of stream instead of skipping the loop entirely
// and leaving an empty file behind.
while (size < 0 || copiedSize < size) {
    int read = is.read(buffer);

    if (read < 0) {
        if (size < 0)
            break; // length unknown: end of stream is the normal termination

        // We've been promised a file of length 'size'
        throw new IOException("Unexpected end of stream");
    }

    os.write(buffer, 0, read);
    copiedSize += read;

    // Update progress bar; only possible when the total size is known.
    // The step count is clamped to at least 1 so that files smaller than 1 KB
    // do not produce a zero step count in the percentage calculation.
    if (size > 0) {
        hb.progress(String.format("%d MB", copiedSize / 1024 / 1024), (int) (copiedSize / 1024), Math.max(1, (int) (size / 1024)));
    }
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
eventLog.logEvent(DownloadSampleActor.class, "Error downloading sample");
|
eventLog.logEvent(DownloadSampleActor.class, "Error downloading sample");
|
||||||
logger.error("Error downloading sample", ex);
|
logger.error("Error downloading sample", ex);
|
||||||
yield new Error();
|
yield new Error();
|
||||||
}
|
}
|
||||||
|
finally {
|
||||||
|
urlConnection.disconnect();
|
||||||
|
}
|
||||||
|
|
||||||
eventLog.logEvent(DownloadSampleActor.class, "Download complete");
|
eventLog.logEvent(DownloadSampleActor.class, "Download complete");
|
||||||
yield new Extract(fileStorageId, tarFileName);
|
yield new Extract(fileStorageId, tarFileName);
|
||||||
@@ -170,11 +197,12 @@ public class DownloadSampleActor extends RecordActorPrototype {
|
|||||||
@Inject
|
@Inject
|
||||||
public DownloadSampleActor(Gson gson,
|
public DownloadSampleActor(Gson gson,
|
||||||
FileStorageService storageService,
|
FileStorageService storageService,
|
||||||
ServiceEventLog eventLog)
|
ServiceEventLog eventLog, ServiceHeartbeat heartbeat)
|
||||||
{
|
{
|
||||||
super(gson);
|
super(gson);
|
||||||
this.storageService = storageService;
|
this.storageService = storageService;
|
||||||
this.eventLog = eventLog;
|
this.eventLog = eventLog;
|
||||||
|
this.heartbeat = heartbeat;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@@ -26,32 +26,32 @@ public class ExportSampleDataActor extends RecordActorPrototype {
|
|||||||
private final MqOutbox exportTasksOutbox;
|
private final MqOutbox exportTasksOutbox;
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
public record Export(FileStorageId crawlId, int size, String name) implements ActorStep {}
|
public record Export(FileStorageId crawlId, int size, String ctFilter, String name) implements ActorStep {}
|
||||||
public record Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) implements ActorStep {
|
public record Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name, long msgId) implements ActorStep {
|
||||||
public Run(FileStorageId crawlId, FileStorageId destId, int size, String name) {
|
public Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name) {
|
||||||
this(crawlId, destId, size, name, -1);
|
// Parameter names follow the canonical component order (ctFilter, then name); msgId -1 marks "not yet sent"
this(crawlId, destId, size, ctFilter, name, -1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ActorStep transition(ActorStep self) throws Exception {
|
public ActorStep transition(ActorStep self) throws Exception {
|
||||||
return switch(self) {
|
return switch(self) {
|
||||||
case Export(FileStorageId crawlId, int size, String name) -> {
|
case Export(FileStorageId crawlId, int size, String ctFilter, String name) -> {
|
||||||
var storage = storageService.allocateStorage(FileStorageType.EXPORT,
|
var storage = storageService.allocateStorage(FileStorageType.EXPORT,
|
||||||
"crawl-sample-export",
|
"crawl-sample-export",
|
||||||
"Crawl Data Sample " + name + "/" + size + " " + LocalDateTime.now()
|
"Crawl Data Sample " + name + "/" + size + " " + LocalDateTime.now()
|
||||||
);
|
);
|
||||||
|
|
||||||
if (storage == null) yield new Error("Bad storage id");
|
if (storage == null) yield new Error("Bad storage id");
|
||||||
yield new Run(crawlId, storage.id(), size, name);
|
yield new Run(crawlId, storage.id(), size, ctFilter, name);
|
||||||
}
|
}
|
||||||
case Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) when msgId < 0 -> {
|
case Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name, long msgId) when msgId < 0 -> {
|
||||||
storageService.setFileStorageState(destId, FileStorageState.NEW);
|
storageService.setFileStorageState(destId, FileStorageState.NEW);
|
||||||
|
|
||||||
long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, size, name));
|
long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, ctFilter, size, name));
|
||||||
yield new Run(crawlId, destId, size, name, newMsgId);
|
yield new Run(crawlId, destId, size, ctFilter, name, newMsgId);
|
||||||
}
|
}
|
||||||
case Run(_, FileStorageId destId, _, _, long msgId) -> {
|
case Run(_, FileStorageId destId, _, _, _, long msgId) -> {
|
||||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK) {
|
if (rsp.state() != MqMessageState.OK) {
|
||||||
@@ -70,7 +70,7 @@ public class ExportSampleDataActor extends RecordActorPrototype {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String describe() {
|
public String describe() {
|
||||||
return "Export RSS/Atom feeds from crawl data";
|
return "Export sample crawl data";
|
||||||
}
|
}
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
|
@@ -49,6 +49,7 @@ public class ExecutorExportGrpcService
|
|||||||
new ExportSampleDataActor.Export(
|
new ExportSampleDataActor.Export(
|
||||||
FileStorageId.of(request.getFileStorageId()),
|
FileStorageId.of(request.getFileStorageId()),
|
||||||
request.getSize(),
|
request.getSize(),
|
||||||
|
request.getCtFilter(),
|
||||||
request.getName()
|
request.getName()
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
|
47
code/functions/favicon/api/build.gradle
Normal file
47
code/functions/favicon/api/build.gradle
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
// Build script for the favicon API module: the protobuf/gRPC contract and its client.
plugins {
    id 'java'

    id "com.google.protobuf" version "0.9.4"
    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
    }
}

jar.archiveBaseName = 'favicon-api'

apply from: "$rootProject.projectDir/protobuf.gradle"
apply from: "$rootProject.projectDir/srcsets.gradle"

dependencies {
    implementation project(':code:common:model')
    implementation project(':code:common:config')
    implementation project(':code:common:service')

    implementation libs.bundles.slf4j

    implementation libs.prometheus
    implementation libs.notnull
    // guava is declared once here; it is excluded from guice and grpc below
    implementation libs.guava
    implementation dependencies.create(libs.guice.get()) {
        exclude group: 'com.google.guava'
    }
    implementation libs.gson
    implementation libs.bundles.protobuf
    libs.bundles.grpc.get().each {
        implementation dependencies.create(it) {
            exclude group: 'com.google.guava'
        }
    }

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
}
|
@@ -0,0 +1,39 @@
|
|||||||
|
package nu.marginalia.api.favicon;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||||
|
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
|
||||||
|
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||||
|
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
public class FaviconClient {
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(FaviconClient.class);
|
||||||
|
|
||||||
|
private final GrpcMultiNodeChannelPool<FaviconAPIGrpc.FaviconAPIBlockingStub> channelPool;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public FaviconClient(GrpcChannelPoolFactory factory) {
|
||||||
|
this.channelPool = factory.createMulti(
|
||||||
|
ServiceKey.forGrpcApi(FaviconAPIGrpc.class, ServicePartition.multi()),
|
||||||
|
FaviconAPIGrpc::newBlockingStub);
|
||||||
|
}
|
||||||
|
|
||||||
|
public record FaviconData(byte[] bytes, String contentType) {}
|
||||||
|
|
||||||
|
|
||||||
|
public Optional<FaviconData> getFavicon(String domain, int node) {
|
||||||
|
RpcFaviconResponse rsp = channelPool.call(FaviconAPIGrpc.FaviconAPIBlockingStub::getFavicon)
|
||||||
|
.forNode(node)
|
||||||
|
.run(RpcFaviconRequest.newBuilder().setDomain(domain).build());
|
||||||
|
|
||||||
|
if (rsp.getData().isEmpty())
|
||||||
|
return Optional.empty();
|
||||||
|
|
||||||
|
return Optional.of(new FaviconData(rsp.getData().toByteArray(), rsp.getContentType()));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
20
code/functions/favicon/api/src/main/protobuf/favicon.proto
Normal file
20
code/functions/favicon/api/src/main/protobuf/favicon.proto
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
syntax="proto3";
|
||||||
|
package marginalia.api.favicon;
|
||||||
|
|
||||||
|
option java_package="nu.marginalia.api.favicon";
|
||||||
|
option java_multiple_files=true;
|
||||||
|
|
||||||
|
service FaviconAPI {
|
||||||
|
/** Fetches the favicon data and content type for a domain. */
|
||||||
|
rpc getFavicon(RpcFaviconRequest) returns (RpcFaviconResponse) {}
|
||||||
|
}
|
||||||
|
|
||||||
|
message RpcFaviconRequest {
|
||||||
|
string domain = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
message RpcFaviconResponse {
|
||||||
|
string domain = 1;
|
||||||
|
bytes data = 2;
|
||||||
|
string contentType = 3;
|
||||||
|
}
|
49
code/functions/favicon/build.gradle
Normal file
49
code/functions/favicon/build.gradle
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
// Build script for the favicon function module: the gRPC service implementation.
plugins {
    id 'java'

    id 'application'
    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
    }
}

apply from: "$rootProject.projectDir/srcsets.gradle"

dependencies {
    implementation project(':code:common:config')
    implementation project(':code:common:service')
    implementation project(':code:common:model')
    implementation project(':code:common:db')
    implementation project(':code:functions:favicon:api')
    implementation project(':code:processes:crawling-process')

    implementation libs.bundles.slf4j

    implementation libs.prometheus
    // guava is declared once here; it is excluded from grpc and guice below
    implementation libs.guava
    libs.bundles.grpc.get().each {
        implementation dependencies.create(it) {
            exclude group: 'com.google.guava'
        }
    }

    implementation libs.notnull
    implementation dependencies.create(libs.guice.get()) {
        exclude group: 'com.google.guava'
    }
    implementation dependencies.create(libs.spark.get()) {
        exclude group: 'org.eclipse.jetty'
    }

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
}
|
@@ -0,0 +1,48 @@
|
|||||||
|
package nu.marginalia.functions.favicon;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import com.google.protobuf.ByteString;
|
||||||
|
import io.grpc.stub.StreamObserver;
|
||||||
|
import nu.marginalia.api.favicon.FaviconAPIGrpc;
|
||||||
|
import nu.marginalia.api.favicon.RpcFaviconRequest;
|
||||||
|
import nu.marginalia.api.favicon.RpcFaviconResponse;
|
||||||
|
import nu.marginalia.crawl.DomainStateDb;
|
||||||
|
import nu.marginalia.service.server.DiscoverableService;
|
||||||
|
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
|
public class FaviconGrpcService extends FaviconAPIGrpc.FaviconAPIImplBase implements DiscoverableService {
|
||||||
|
private final DomainStateDb domainStateDb;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public FaviconGrpcService(DomainStateDb domainStateDb) {
|
||||||
|
this.domainStateDb = domainStateDb;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean shouldRegisterService() {
|
||||||
|
return domainStateDb.isAvailable();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void getFavicon(RpcFaviconRequest request, StreamObserver<RpcFaviconResponse> responseObserver) {
|
||||||
|
Optional<DomainStateDb.FaviconRecord> icon = domainStateDb.getIcon(request.getDomain());
|
||||||
|
|
||||||
|
RpcFaviconResponse response;
|
||||||
|
if (icon.isEmpty()) {
|
||||||
|
response = RpcFaviconResponse.newBuilder().build();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
var iconRecord = icon.get();
|
||||||
|
response = RpcFaviconResponse.newBuilder()
|
||||||
|
.setContentType(iconRecord.contentType())
|
||||||
|
.setDomain(request.getDomain())
|
||||||
|
.setData(ByteString.copyFrom(iconRecord.imageData()))
|
||||||
|
.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
responseObserver.onNext(response);
|
||||||
|
responseObserver.onCompleted();
|
||||||
|
}
|
||||||
|
}
|
@@ -25,9 +25,9 @@ dependencies {
|
|||||||
|
|
||||||
implementation project(':code:execution:api')
|
implementation project(':code:execution:api')
|
||||||
implementation project(':code:processes:crawling-process:ft-content-type')
|
implementation project(':code:processes:crawling-process:ft-content-type')
|
||||||
|
implementation project(':third-party:rssreader')
|
||||||
|
|
||||||
implementation libs.jsoup
|
implementation libs.jsoup
|
||||||
implementation project(':third-party:rssreader')
|
|
||||||
implementation libs.opencsv
|
implementation libs.opencsv
|
||||||
implementation libs.slop
|
implementation libs.slop
|
||||||
implementation libs.sqlite
|
implementation libs.sqlite
|
||||||
@@ -57,8 +57,6 @@ dependencies {
|
|||||||
implementation libs.bundles.gson
|
implementation libs.bundles.gson
|
||||||
implementation libs.bundles.mariadb
|
implementation libs.bundles.mariadb
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
testImplementation libs.bundles.slf4j.test
|
testImplementation libs.bundles.slf4j.test
|
||||||
testImplementation libs.bundles.junit
|
testImplementation libs.bundles.junit
|
||||||
testImplementation libs.mockito
|
testImplementation libs.mockito
|
||||||
|
@@ -0,0 +1,126 @@
|
|||||||
|
package nu.marginalia.domsample;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import jakarta.inject.Named;
|
||||||
|
import nu.marginalia.domsample.db.DomSampleDb;
|
||||||
|
import nu.marginalia.livecapture.BrowserlessClient;
|
||||||
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.net.URI;
|
||||||
|
import java.net.URISyntaxException;
|
||||||
|
import java.time.Duration;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
public class DomSampleService {
|
||||||
|
private final DomSampleDb db;
|
||||||
|
private final HikariDataSource mariadbDataSource;
|
||||||
|
private final URI browserlessURI;
|
||||||
|
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(DomSampleService.class);
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public DomSampleService(DomSampleDb db,
|
||||||
|
HikariDataSource mariadbDataSource,
|
||||||
|
@Named("browserless-uri") String browserlessAddress,
|
||||||
|
ServiceConfiguration serviceConfiguration)
|
||||||
|
throws URISyntaxException
|
||||||
|
{
|
||||||
|
this.db = db;
|
||||||
|
this.mariadbDataSource = mariadbDataSource;
|
||||||
|
|
||||||
|
if (StringUtils.isEmpty(browserlessAddress) || serviceConfiguration.node() > 1) {
|
||||||
|
logger.warn("Live capture service will not run");
|
||||||
|
browserlessURI = null;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
browserlessURI = new URI(browserlessAddress);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void start() {
|
||||||
|
if (browserlessURI == null) {
|
||||||
|
logger.warn("DomSampleService is not enabled due to missing browserless URI or multi-node configuration");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
Thread.ofPlatform().daemon().start(this::run);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void syncDomains() {
|
||||||
|
Set<String> dbDomains = new HashSet<>();
|
||||||
|
|
||||||
|
logger.info("Fetching domains from database...");
|
||||||
|
|
||||||
|
try (var conn = mariadbDataSource.getConnection();
|
||||||
|
var stmt = conn.prepareStatement("""
|
||||||
|
SELECT DOMAIN_NAME
|
||||||
|
FROM EC_DOMAIN
|
||||||
|
WHERE NODE_AFFINITY>0
|
||||||
|
""")
|
||||||
|
) {
|
||||||
|
var rs = stmt.executeQuery();
|
||||||
|
while (rs.next()) {
|
||||||
|
dbDomains.add(rs.getString("DOMAIN_NAME"));
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new RuntimeException("Failed to sync domains", e);
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info("Found {} domains in database", dbDomains.size());
|
||||||
|
|
||||||
|
db.syncDomains(dbDomains);
|
||||||
|
|
||||||
|
logger.info("Synced domains to sqlite");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void run() {
|
||||||
|
|
||||||
|
try (var client = new BrowserlessClient(browserlessURI)) {
|
||||||
|
|
||||||
|
while (!Thread.currentThread().isInterrupted()) {
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Grace sleep in case we're operating on an empty domain list
|
||||||
|
TimeUnit.SECONDS.sleep(15);
|
||||||
|
|
||||||
|
syncDomains();
|
||||||
|
var domains = db.getScheduledDomains();
|
||||||
|
|
||||||
|
for (var domain : domains) {
|
||||||
|
updateDomain(client, domain);
|
||||||
|
}
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
Thread.currentThread().interrupt();
|
||||||
|
logger.info("DomSampleService interrupted, stopping...");
|
||||||
|
return;
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("Error in DomSampleService run loop", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void updateDomain(BrowserlessClient client, String domain) {
|
||||||
|
var rootUrl = "https://" + domain + "/";
|
||||||
|
try {
|
||||||
|
var content = client.annotatedContent(rootUrl, new BrowserlessClient.GotoOptions("load", Duration.ofSeconds(10).toMillis()));
|
||||||
|
|
||||||
|
if (content.isPresent()) {
|
||||||
|
db.saveSample(domain, rootUrl, content.get());
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("Failed to process domain: " + domain, e);
|
||||||
|
}
|
||||||
|
finally {
|
||||||
|
db.flagDomainAsFetched(domain);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@@ -0,0 +1,174 @@
|
|||||||
|
package nu.marginalia.domsample.db;
|
||||||
|
|
||||||
|
import nu.marginalia.WmsaHome;
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.sql.Connection;
|
||||||
|
import java.sql.DriverManager;
|
||||||
|
import java.sql.SQLException;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
public class DomSampleDb implements AutoCloseable {
|
||||||
|
private static final String dbFileName = "dom-sample.db";
|
||||||
|
private final Connection connection;
|
||||||
|
|
||||||
|
public DomSampleDb() throws SQLException{
|
||||||
|
this(WmsaHome.getDataPath().resolve(dbFileName));
|
||||||
|
}
|
||||||
|
|
||||||
|
public DomSampleDb(Path dbPath) throws SQLException {
|
||||||
|
String dbUrl = "jdbc:sqlite:" + dbPath.toAbsolutePath();
|
||||||
|
|
||||||
|
connection = DriverManager.getConnection(dbUrl);
|
||||||
|
|
||||||
|
try (var stmt = connection.createStatement()) {
|
||||||
|
stmt.executeUpdate("CREATE TABLE IF NOT EXISTS samples (url TEXT PRIMARY KEY, domain TEXT, sample BLOB, requests BLOB, accepted_popover BOOLEAN DEFAULT FALSE)");
|
||||||
|
stmt.executeUpdate("CREATE INDEX IF NOT EXISTS domain_index ON samples (domain)");
|
||||||
|
stmt.executeUpdate("CREATE TABLE IF NOT EXISTS schedule (domain TEXT PRIMARY KEY, last_fetch TIMESTAMP DEFAULT NULL)");
|
||||||
|
stmt.execute("PRAGMA journal_mode=WAL");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void syncDomains(Set<String> domains) {
|
||||||
|
Set<String> currentDomains = new HashSet<>();
|
||||||
|
try (var stmt = connection.prepareStatement("SELECT domain FROM schedule")) {
|
||||||
|
var rs = stmt.executeQuery();
|
||||||
|
while (rs.next()) {
|
||||||
|
currentDomains.add(rs.getString("domain"));
|
||||||
|
}
|
||||||
|
} catch (SQLException e) {
|
||||||
|
throw new RuntimeException("Failed to sync domains", e);
|
||||||
|
}
|
||||||
|
|
||||||
|
Set<String> toRemove = new HashSet<>(currentDomains);
|
||||||
|
Set<String> toAdd = new HashSet<>(domains);
|
||||||
|
|
||||||
|
toRemove.removeAll(domains);
|
||||||
|
toAdd.removeAll(currentDomains);
|
||||||
|
|
||||||
|
try (var removeStmt = connection.prepareStatement("DELETE FROM schedule WHERE domain = ?");
|
||||||
|
var addStmt = connection.prepareStatement("INSERT OR IGNORE INTO schedule (domain) VALUES (?)")
|
||||||
|
) {
|
||||||
|
for (String domain : toRemove) {
|
||||||
|
removeStmt.setString(1, domain);
|
||||||
|
removeStmt.executeUpdate();
|
||||||
|
}
|
||||||
|
|
||||||
|
for (String domain : toAdd) {
|
||||||
|
addStmt.setString(1, domain);
|
||||||
|
addStmt.executeUpdate();
|
||||||
|
}
|
||||||
|
} catch (SQLException e) {
|
||||||
|
throw new RuntimeException("Failed to remove domains", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getScheduledDomains() {
|
||||||
|
List<String> domains = new ArrayList<>();
|
||||||
|
try (var stmt = connection.prepareStatement("SELECT domain FROM schedule ORDER BY last_fetch IS NULL DESC, last_fetch ASC")) {
|
||||||
|
var rs = stmt.executeQuery();
|
||||||
|
while (rs.next()) {
|
||||||
|
domains.add(rs.getString("domain"));
|
||||||
|
}
|
||||||
|
} catch (SQLException e) {
|
||||||
|
throw new RuntimeException("Failed to get scheduled domains", e);
|
||||||
|
}
|
||||||
|
return domains;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void flagDomainAsFetched(String domain) {
|
||||||
|
try (var stmt = connection.prepareStatement("INSERT OR REPLACE INTO schedule (domain, last_fetch) VALUES (?, CURRENT_TIMESTAMP)")) {
|
||||||
|
stmt.setString(1, domain);
|
||||||
|
stmt.executeUpdate();
|
||||||
|
} catch (SQLException e) {
|
||||||
|
throw new RuntimeException("Failed to flag domain as fetched", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public record Sample(String url, String domain, String sample, String requests, boolean acceptedPopover) {}
|
||||||
|
|
||||||
|
public List<Sample> getSamples(String domain) throws SQLException {
|
||||||
|
List<Sample> samples = new ArrayList<>();
|
||||||
|
|
||||||
|
try (var stmt = connection.prepareStatement("""
|
||||||
|
SELECT url, sample, requests, accepted_popover
|
||||||
|
FROM samples
|
||||||
|
WHERE domain = ?
|
||||||
|
"""))
|
||||||
|
{
|
||||||
|
stmt.setString(1, domain);
|
||||||
|
var rs = stmt.executeQuery();
|
||||||
|
while (rs.next()) {
|
||||||
|
samples.add(
|
||||||
|
new Sample(
|
||||||
|
rs.getString("url"),
|
||||||
|
domain,
|
||||||
|
rs.getString("sample"),
|
||||||
|
rs.getString("requests"),
|
||||||
|
rs.getBoolean("accepted_popover")
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return samples;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void saveSample(String domain, String url, String rawContent) throws SQLException {
|
||||||
|
var doc = Jsoup.parse(rawContent);
|
||||||
|
|
||||||
|
var networkRequests = doc.getElementById("marginalia-network-requests");
|
||||||
|
|
||||||
|
boolean acceptedPopover = false;
|
||||||
|
|
||||||
|
StringBuilder requestTsv = new StringBuilder();
|
||||||
|
if (networkRequests != null) {
|
||||||
|
|
||||||
|
acceptedPopover = !networkRequests.getElementsByClass("marginalia-agreed-cookies").isEmpty();
|
||||||
|
|
||||||
|
for (var request : networkRequests.getElementsByClass("network-request")) {
|
||||||
|
String method = request.attr("data-method");
|
||||||
|
String urlAttr = request.attr("data-url");
|
||||||
|
String timestamp = request.attr("data-timestamp");
|
||||||
|
|
||||||
|
requestTsv
|
||||||
|
.append(method)
|
||||||
|
.append('\t')
|
||||||
|
.append(timestamp)
|
||||||
|
.append('\t')
|
||||||
|
.append(urlAttr.replace('\n', ' '))
|
||||||
|
.append("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
networkRequests.remove();
|
||||||
|
}
|
||||||
|
|
||||||
|
doc.body().removeAttr("id");
|
||||||
|
|
||||||
|
String sample = doc.html();
|
||||||
|
|
||||||
|
saveSampleRaw(domain, url, sample, requestTsv.toString().trim(), acceptedPopover);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void saveSampleRaw(String domain, String url, String sample, String requests, boolean acceptedPopover) throws SQLException {
|
||||||
|
try (var stmt = connection.prepareStatement("""
|
||||||
|
INSERT OR REPLACE
|
||||||
|
INTO samples (domain, url, sample, requests, accepted_popover)
|
||||||
|
VALUES (?, ?, ?, ?, ?)
|
||||||
|
""")) {
|
||||||
|
stmt.setString(1, domain);
|
||||||
|
stmt.setString(2, url);
|
||||||
|
stmt.setString(3, sample);
|
||||||
|
stmt.setString(4, requests);
|
||||||
|
stmt.setBoolean(5, acceptedPopover);
|
||||||
|
stmt.executeUpdate();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void close() throws SQLException {
|
||||||
|
connection.close();
|
||||||
|
}
|
||||||
|
}
|
@@ -8,10 +8,13 @@ import org.slf4j.LoggerFactory;
|
|||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
|
import java.net.URLEncoder;
|
||||||
import java.net.http.HttpClient;
|
import java.net.http.HttpClient;
|
||||||
import java.net.http.HttpRequest;
|
import java.net.http.HttpRequest;
|
||||||
import java.net.http.HttpResponse;
|
import java.net.http.HttpResponse;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.time.Duration;
|
import java.time.Duration;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
@@ -60,6 +63,42 @@ public class BrowserlessClient implements AutoCloseable {
|
|||||||
return Optional.of(rsp.body());
|
return Optional.of(rsp.body());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Fetches content with a marginalia hack extension loaded that decorates the DOM with attributes for
|
||||||
|
* certain CSS attributes, to be able to easier identify popovers and other nuisance elements.
|
||||||
|
*/
|
||||||
|
public Optional<String> annotatedContent(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
|
||||||
|
Map<String, Object> requestData = Map.of(
|
||||||
|
"url", url,
|
||||||
|
"userAgent", userAgent,
|
||||||
|
"gotoOptions", gotoOptions,
|
||||||
|
"waitForSelector", Map.of("selector", "#marginaliahack", "timeout", 15000)
|
||||||
|
);
|
||||||
|
|
||||||
|
// Launch parameters for the browserless instance to load the extension
|
||||||
|
Map<String, Object> launchParameters = Map.of(
|
||||||
|
"args", List.of("--load-extension=/dom-export")
|
||||||
|
);
|
||||||
|
|
||||||
|
String launchParametersStr = URLEncoder.encode(gson.toJson(launchParameters), StandardCharsets.UTF_8);
|
||||||
|
|
||||||
|
var request = HttpRequest.newBuilder()
|
||||||
|
.uri(browserlessURI.resolve("/content?token="+BROWSERLESS_TOKEN+"&launch="+launchParametersStr))
|
||||||
|
.method("POST", HttpRequest.BodyPublishers.ofString(
|
||||||
|
gson.toJson(requestData)
|
||||||
|
))
|
||||||
|
.header("Content-type", "application/json")
|
||||||
|
.build();
|
||||||
|
|
||||||
|
var rsp = httpClient.send(request, HttpResponse.BodyHandlers.ofString());
|
||||||
|
|
||||||
|
if (rsp.statusCode() >= 300) {
|
||||||
|
logger.info("Failed to fetch annotated content for {}, status {}", url, rsp.statusCode());
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
return Optional.of(rsp.body());
|
||||||
|
}
|
||||||
|
|
||||||
public byte[] screenshot(String url, GotoOptions gotoOptions, ScreenshotOptions screenshotOptions)
|
public byte[] screenshot(String url, GotoOptions gotoOptions, ScreenshotOptions screenshotOptions)
|
||||||
throws IOException, InterruptedException {
|
throws IOException, InterruptedException {
|
||||||
|
|
||||||
|
@@ -126,7 +126,6 @@ public class LiveCaptureGrpcService
|
|||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
EdgeDomain domain = domainNameOpt.get();
|
EdgeDomain domain = domainNameOpt.get();
|
||||||
String domainNameStr = domain.toString();
|
|
||||||
|
|
||||||
if (!isValidDomainForCapture(domain)) {
|
if (!isValidDomainForCapture(domain)) {
|
||||||
ScreenshotDbOperations.flagDomainAsFetched(conn, domain);
|
ScreenshotDbOperations.flagDomainAsFetched(conn, domain);
|
||||||
|
@@ -33,6 +33,7 @@ import java.sql.SQLException;
|
|||||||
import java.time.*;
|
import java.time.*;
|
||||||
import java.time.format.DateTimeFormatter;
|
import java.time.format.DateTimeFormatter;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
import java.util.concurrent.Executors;
|
import java.util.concurrent.Executors;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
@@ -71,7 +72,7 @@ public class FeedFetcherService {
|
|||||||
public enum UpdateMode {
|
public enum UpdateMode {
|
||||||
CLEAN,
|
CLEAN,
|
||||||
REFRESH
|
REFRESH
|
||||||
};
|
}
|
||||||
|
|
||||||
public void updateFeeds(UpdateMode updateMode) throws IOException {
|
public void updateFeeds(UpdateMode updateMode) throws IOException {
|
||||||
if (updating) // Prevent concurrent updates
|
if (updating) // Prevent concurrent updates
|
||||||
@@ -87,6 +88,7 @@ public class FeedFetcherService {
|
|||||||
.followRedirects(HttpClient.Redirect.NORMAL)
|
.followRedirects(HttpClient.Redirect.NORMAL)
|
||||||
.version(HttpClient.Version.HTTP_2)
|
.version(HttpClient.Version.HTTP_2)
|
||||||
.build();
|
.build();
|
||||||
|
ExecutorService fetchExecutor = Executors.newCachedThreadPool();
|
||||||
FeedJournal feedJournal = FeedJournal.create();
|
FeedJournal feedJournal = FeedJournal.create();
|
||||||
var heartbeat = serviceHeartbeat.createServiceAdHocTaskHeartbeat("Update Rss Feeds")
|
var heartbeat = serviceHeartbeat.createServiceAdHocTaskHeartbeat("Update Rss Feeds")
|
||||||
) {
|
) {
|
||||||
@@ -131,7 +133,7 @@ public class FeedFetcherService {
|
|||||||
|
|
||||||
FetchResult feedData;
|
FetchResult feedData;
|
||||||
try (DomainLocks.DomainLock domainLock = domainLocks.lockDomain(new EdgeDomain(feed.domain()))) {
|
try (DomainLocks.DomainLock domainLock = domainLocks.lockDomain(new EdgeDomain(feed.domain()))) {
|
||||||
feedData = fetchFeedData(feed, client, ifModifiedSinceDate, ifNoneMatchTag);
|
feedData = fetchFeedData(feed, client, fetchExecutor, ifModifiedSinceDate, ifNoneMatchTag);
|
||||||
} catch (Exception ex) {
|
} catch (Exception ex) {
|
||||||
feedData = new FetchResult.TransientError();
|
feedData = new FetchResult.TransientError();
|
||||||
}
|
}
|
||||||
@@ -211,6 +213,7 @@ public class FeedFetcherService {
|
|||||||
|
|
||||||
private FetchResult fetchFeedData(FeedDefinition feed,
|
private FetchResult fetchFeedData(FeedDefinition feed,
|
||||||
HttpClient client,
|
HttpClient client,
|
||||||
|
ExecutorService executorService,
|
||||||
@Nullable String ifModifiedSinceDate,
|
@Nullable String ifModifiedSinceDate,
|
||||||
@Nullable String ifNoneMatchTag)
|
@Nullable String ifNoneMatchTag)
|
||||||
{
|
{
|
||||||
@@ -226,18 +229,27 @@ public class FeedFetcherService {
|
|||||||
.timeout(Duration.ofSeconds(15))
|
.timeout(Duration.ofSeconds(15))
|
||||||
;
|
;
|
||||||
|
|
||||||
if (ifModifiedSinceDate != null) {
|
// Set the If-Modified-Since or If-None-Match headers if we have them
|
||||||
|
// though since there are certain idiosyncrasies in server implementations,
|
||||||
|
// we avoid setting both at the same time as that may turn a 304 into a 200.
|
||||||
|
if (ifNoneMatchTag != null) {
|
||||||
|
requestBuilder.header("If-None-Match", ifNoneMatchTag);
|
||||||
|
} else if (ifModifiedSinceDate != null) {
|
||||||
requestBuilder.header("If-Modified-Since", ifModifiedSinceDate);
|
requestBuilder.header("If-Modified-Since", ifModifiedSinceDate);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ifNoneMatchTag != null) {
|
|
||||||
requestBuilder.header("If-None-Match", ifNoneMatchTag);
|
|
||||||
}
|
|
||||||
|
|
||||||
HttpRequest getRequest = requestBuilder.build();
|
HttpRequest getRequest = requestBuilder.build();
|
||||||
|
|
||||||
for (int i = 0; i < 3; i++) {
|
for (int i = 0; i < 3; i++) {
|
||||||
HttpResponse<byte[]> rs = client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray());
|
|
||||||
|
/* Note we need to use an executor to time-limit the send() method in HttpClient, as
|
||||||
|
* its support for timeouts only applies to the time until response starts to be received,
|
||||||
|
* and does not catch the case when the server starts to send data but then hangs.
|
||||||
|
*/
|
||||||
|
HttpResponse<byte[]> rs = executorService.submit(
|
||||||
|
() -> client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray()))
|
||||||
|
.get(15, TimeUnit.SECONDS);
|
||||||
|
|
||||||
if (rs.statusCode() == 429) { // Too Many Requests
|
if (rs.statusCode() == 429) { // Too Many Requests
|
||||||
int retryAfter = Integer.parseInt(rs.headers().firstValue("Retry-After").orElse("2"));
|
int retryAfter = Integer.parseInt(rs.headers().firstValue("Retry-After").orElse("2"));
|
||||||
|
@@ -79,9 +79,17 @@ public class SimpleFeedParser {
|
|||||||
if (!link.isBlank())
|
if (!link.isBlank())
|
||||||
break;
|
break;
|
||||||
var tag = element.getElementsByTag(attr).first();
|
var tag = element.getElementsByTag(attr).first();
|
||||||
|
|
||||||
if (tag != null) {
|
if (tag != null) {
|
||||||
link = tag.text();
|
String linkText = tag.text();
|
||||||
|
|
||||||
|
if (linkText.isBlank()) {
|
||||||
|
linkText = tag.attr("href");
|
||||||
|
}
|
||||||
|
|
||||||
|
link = linkText;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ret.add(new ItemData(title, description, link, pubDate));
|
ret.add(new ItemData(title, description, link, pubDate));
|
||||||
|
@@ -0,0 +1,113 @@
|
|||||||
|
package nu.marginalia.domsample.db;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.AfterEach;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.testcontainers.shaded.org.apache.commons.io.FileUtils;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
|
class DomSampleDbTest {
|
||||||
|
Path tempDir;
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
void setUp() throws Exception {
|
||||||
|
tempDir = Files.createTempDirectory("test");
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterEach
|
||||||
|
void tearDown() throws IOException {
|
||||||
|
FileUtils.deleteDirectory(tempDir.toFile());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSetUp() {
|
||||||
|
var dbPath = tempDir.resolve("test.db");
|
||||||
|
try (var db = new DomSampleDb(dbPath)) {
|
||||||
|
}
|
||||||
|
catch (Exception e) {
|
||||||
|
fail("Failed to set up database: " + e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSyncDomains() {
|
||||||
|
var dbPath = tempDir.resolve("test.db");
|
||||||
|
try (var db = new DomSampleDb(dbPath)) {
|
||||||
|
|
||||||
|
db.syncDomains(Set.of("example.com", "test.com", "foobar.com"));
|
||||||
|
assertEquals(Set.of("example.com", "test.com", "foobar.com"), new HashSet<>(db.getScheduledDomains()));
|
||||||
|
db.syncDomains(Set.of("example.com", "test.com"));
|
||||||
|
assertEquals(Set.of("example.com", "test.com"), new HashSet<>(db.getScheduledDomains()));
|
||||||
|
db.syncDomains(Set.of("foobar.com", "test.com"));
|
||||||
|
assertEquals(Set.of("foobar.com", "test.com"), new HashSet<>(db.getScheduledDomains()));
|
||||||
|
}
|
||||||
|
catch (Exception e) {
|
||||||
|
fail("Failed to sync domains: " + e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testFetchDomains() {
|
||||||
|
var dbPath = tempDir.resolve("test.db");
|
||||||
|
try (var db = new DomSampleDb(dbPath)) {
|
||||||
|
|
||||||
|
db.syncDomains(Set.of("example.com", "test.com", "foobar.com"));
|
||||||
|
db.flagDomainAsFetched("example.com");
|
||||||
|
db.flagDomainAsFetched("test.com");
|
||||||
|
db.flagDomainAsFetched("foobar.com");
|
||||||
|
assertEquals(List.of("example.com", "test.com", "foobar.com"), db.getScheduledDomains());
|
||||||
|
db.flagDomainAsFetched("test.com");
|
||||||
|
assertEquals(List.of("example.com", "foobar.com", "test.com"), db.getScheduledDomains());
|
||||||
|
}
|
||||||
|
catch (Exception e) {
|
||||||
|
fail("Failed to sync domains: " + e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void saveLoadSingle() {
|
||||||
|
var dbPath = tempDir.resolve("test.db");
|
||||||
|
try (var db = new DomSampleDb(dbPath)) {
|
||||||
|
db.saveSampleRaw("example.com", "http://example.com/sample", "sample data", "requests data", true);
|
||||||
|
var samples = db.getSamples("example.com");
|
||||||
|
assertEquals(1, samples.size());
|
||||||
|
var sample = samples.getFirst();
|
||||||
|
assertEquals("example.com", sample.domain());
|
||||||
|
assertEquals("http://example.com/sample", sample.url());
|
||||||
|
assertEquals("sample data", sample.sample());
|
||||||
|
assertEquals("requests data", sample.requests());
|
||||||
|
assertTrue(sample.acceptedPopover());
|
||||||
|
}
|
||||||
|
catch (Exception e) {
|
||||||
|
fail("Failed to save/load sample: " + e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void saveLoadTwo() {
|
||||||
|
var dbPath = tempDir.resolve("test.db");
|
||||||
|
try (var db = new DomSampleDb(dbPath)) {
|
||||||
|
db.saveSampleRaw("example.com", "http://example.com/sample", "sample data", "r1", true);
|
||||||
|
db.saveSampleRaw("example.com", "http://example.com/sample2", "sample data2", "r2", false);
|
||||||
|
var samples = db.getSamples("example.com");
|
||||||
|
assertEquals(2, samples.size());
|
||||||
|
|
||||||
|
Map<String, String> samplesByUrl = new HashMap<>();
|
||||||
|
for (var sample : samples) {
|
||||||
|
samplesByUrl.put(sample.url(), sample.sample());
|
||||||
|
}
|
||||||
|
|
||||||
|
assertEquals("sample data", samplesByUrl.get("http://example.com/sample"));
|
||||||
|
assertEquals("sample data2", samplesByUrl.get("http://example.com/sample2"));
|
||||||
|
}
|
||||||
|
catch (Exception e) {
|
||||||
|
fail("Failed to save/load sample: " + e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@@ -3,17 +3,21 @@ package nu.marginalia.livecapture;
|
|||||||
import com.github.tomakehurst.wiremock.WireMockServer;
|
import com.github.tomakehurst.wiremock.WireMockServer;
|
||||||
import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
|
import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
|
||||||
import nu.marginalia.WmsaHome;
|
import nu.marginalia.WmsaHome;
|
||||||
|
import nu.marginalia.domsample.db.DomSampleDb;
|
||||||
import nu.marginalia.service.module.ServiceConfigurationModule;
|
import nu.marginalia.service.module.ServiceConfigurationModule;
|
||||||
import org.junit.jupiter.api.Assertions;
|
import org.junit.jupiter.api.Assertions;
|
||||||
import org.junit.jupiter.api.BeforeAll;
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
import org.junit.jupiter.api.Tag;
|
import org.junit.jupiter.api.Tag;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import org.testcontainers.containers.GenericContainer;
|
import org.testcontainers.containers.GenericContainer;
|
||||||
|
import org.testcontainers.images.PullPolicy;
|
||||||
import org.testcontainers.junit.jupiter.Testcontainers;
|
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||||
import org.testcontainers.utility.DockerImageName;
|
import org.testcontainers.utility.DockerImageName;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import static com.github.tomakehurst.wiremock.client.WireMock.*;
|
import static com.github.tomakehurst.wiremock.client.WireMock.*;
|
||||||
@@ -22,9 +26,14 @@ import static com.github.tomakehurst.wiremock.client.WireMock.*;
|
|||||||
@Testcontainers
|
@Testcontainers
|
||||||
@Tag("slow")
|
@Tag("slow")
|
||||||
public class BrowserlessClientTest {
|
public class BrowserlessClientTest {
|
||||||
static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome"))
|
// Run gradle docker if this image is not available
|
||||||
|
static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("marginalia-browserless"))
|
||||||
.withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
|
.withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
|
||||||
|
.withImagePullPolicy(PullPolicy.defaultPolicy())
|
||||||
.withNetworkMode("bridge")
|
.withNetworkMode("bridge")
|
||||||
|
.withLogConsumer(frame -> {
|
||||||
|
System.out.print(frame.getUtf8String());
|
||||||
|
})
|
||||||
.withExposedPorts(3000);
|
.withExposedPorts(3000);
|
||||||
|
|
||||||
static WireMockServer wireMockServer =
|
static WireMockServer wireMockServer =
|
||||||
@@ -34,6 +43,7 @@ public class BrowserlessClientTest {
|
|||||||
static String localIp;
|
static String localIp;
|
||||||
|
|
||||||
static URI browserlessURI;
|
static URI browserlessURI;
|
||||||
|
static URI browserlessWssURI;
|
||||||
|
|
||||||
@BeforeAll
|
@BeforeAll
|
||||||
public static void setup() throws IOException {
|
public static void setup() throws IOException {
|
||||||
@@ -44,6 +54,12 @@ public class BrowserlessClientTest {
|
|||||||
container.getMappedPort(3000))
|
container.getMappedPort(3000))
|
||||||
);
|
);
|
||||||
|
|
||||||
|
browserlessWssURI = URI.create(String.format("ws://%s:%d/?token=BROWSERLESS_TOKEN",
|
||||||
|
container.getHost(),
|
||||||
|
container.getMappedPort(3000))
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
wireMockServer.start();
|
wireMockServer.start();
|
||||||
wireMockServer.stubFor(get("/").willReturn(aResponse().withStatus(200).withBody("Ok")));
|
wireMockServer.stubFor(get("/").willReturn(aResponse().withStatus(200).withBody("Ok")));
|
||||||
|
|
||||||
@@ -85,6 +101,30 @@ public class BrowserlessClientTest {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testAnnotatedContent() throws Exception {
|
||||||
|
|
||||||
|
try (var client = new BrowserlessClient(browserlessURI);
|
||||||
|
DomSampleDb dbop = new DomSampleDb(Path.of("/tmp/dom-sample.db"))
|
||||||
|
) {
|
||||||
|
var content = client.annotatedContent("https://marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues()).orElseThrow();
|
||||||
|
dbop.saveSample("marginalia.nu", "https://marginalia.nu/", content);
|
||||||
|
System.out.println(content);
|
||||||
|
Assertions.assertFalse(content.isBlank(), "Content should not be empty");
|
||||||
|
|
||||||
|
dbop.getSamples("marginalia.nu").forEach(sample -> {
|
||||||
|
System.out.println("Sample URL: " + sample.url());
|
||||||
|
System.out.println("Sample Content: " + sample.sample());
|
||||||
|
System.out.println("Sample Requests: " + sample.requests());
|
||||||
|
System.out.println("Accepted Popover: " + sample.acceptedPopover());
|
||||||
|
});
|
||||||
|
}
|
||||||
|
finally {
|
||||||
|
Files.deleteIfExists(Path.of("/tmp/dom-sample.db"));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testScreenshot() throws Exception {
|
public void testScreenshot() throws Exception {
|
||||||
try (var client = new BrowserlessClient(browserlessURI)) {
|
try (var client = new BrowserlessClient(browserlessURI)) {
|
||||||
|
@@ -1,6 +1,7 @@
|
|||||||
package nu.marginalia.api.searchquery.model.results;
|
package nu.marginalia.api.searchquery.model.results;
|
||||||
|
|
||||||
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
|
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import org.jetbrains.annotations.NotNull;
|
import org.jetbrains.annotations.NotNull;
|
||||||
|
|
||||||
@@ -161,4 +162,14 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
|
|||||||
public String toString() {
|
public String toString() {
|
||||||
return "DecoratedSearchResultItem(rawIndexResult=" + this.getRawIndexResult() + ", url=" + this.getUrl() + ", title=" + this.getTitle() + ", description=" + this.getDescription() + ", urlQuality=" + this.getUrlQuality() + ", format=" + this.getFormat() + ", features=" + this.getFeatures() + ", pubYear=" + this.getPubYear() + ", dataHash=" + this.getDataHash() + ", wordsTotal=" + this.getWordsTotal() + ", bestPositions=" + this.getBestPositions() + ", rankingScore=" + this.getRankingScore() + ", resultsFromDomain=" + this.getResultsFromDomain() + ", rankingDetails=" + this.getRankingDetails() + ")";
|
return "DecoratedSearchResultItem(rawIndexResult=" + this.getRawIndexResult() + ", url=" + this.getUrl() + ", title=" + this.getTitle() + ", description=" + this.getDescription() + ", urlQuality=" + this.getUrlQuality() + ", format=" + this.getFormat() + ", features=" + this.getFeatures() + ", pubYear=" + this.getPubYear() + ", dataHash=" + this.getDataHash() + ", wordsTotal=" + this.getWordsTotal() + ", bestPositions=" + this.getBestPositions() + ", rankingScore=" + this.getRankingScore() + ", resultsFromDomain=" + this.getResultsFromDomain() + ", rankingDetails=" + this.getRankingDetails() + ")";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getShortFormat() {
|
||||||
|
try {
|
||||||
|
var df = DocumentFormat.valueOf(format);
|
||||||
|
return df.shortFormat;
|
||||||
|
}
|
||||||
|
catch (IllegalArgumentException e) {
|
||||||
|
return DocumentFormat.UNKNOWN.shortFormat;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@@ -84,7 +84,7 @@ public class ForwardIndexConverter {
|
|||||||
|
|
||||||
LongArray docFileData = LongArrayFactory.mmapForWritingConfined(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size());
|
LongArray docFileData = LongArrayFactory.mmapForWritingConfined(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size());
|
||||||
|
|
||||||
ByteBuffer workArea = ByteBuffer.allocate(65536);
|
ByteBuffer workArea = ByteBuffer.allocate(1024*1024*100);
|
||||||
for (var instance : journal.pages()) {
|
for (var instance : journal.pages()) {
|
||||||
try (var slopTable = new SlopTable(instance.baseDir(), instance.page()))
|
try (var slopTable = new SlopTable(instance.baseDir(), instance.page()))
|
||||||
{
|
{
|
||||||
|
@@ -23,16 +23,33 @@ public class SimpleBlockingThreadPool {
|
|||||||
private final Logger logger = LoggerFactory.getLogger(SimpleBlockingThreadPool.class);
|
private final Logger logger = LoggerFactory.getLogger(SimpleBlockingThreadPool.class);
|
||||||
|
|
||||||
public SimpleBlockingThreadPool(String name, int poolSize, int queueSize) {
|
public SimpleBlockingThreadPool(String name, int poolSize, int queueSize) {
|
||||||
|
this(name, poolSize, queueSize, ThreadType.PLATFORM);
|
||||||
|
}
|
||||||
|
|
||||||
|
public SimpleBlockingThreadPool(String name, int poolSize, int queueSize, ThreadType threadType) {
|
||||||
tasks = new ArrayBlockingQueue<>(queueSize);
|
tasks = new ArrayBlockingQueue<>(queueSize);
|
||||||
|
|
||||||
for (int i = 0; i < poolSize; i++) {
|
for (int i = 0; i < poolSize; i++) {
|
||||||
Thread worker = new Thread(this::worker, name + "[" + i + "]");
|
|
||||||
worker.setDaemon(true);
|
Thread.Builder threadBuilder = switch (threadType) {
|
||||||
worker.start();
|
case VIRTUAL -> Thread.ofVirtual();
|
||||||
|
case PLATFORM -> Thread.ofPlatform().daemon(true);
|
||||||
|
};
|
||||||
|
|
||||||
|
Thread worker = threadBuilder
|
||||||
|
.name(name + "[" + i + "]")
|
||||||
|
.start(this::worker);
|
||||||
|
|
||||||
workers.add(worker);
|
workers.add(worker);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Selects which thread builder the pool constructor uses when starting its workers. */
public enum ThreadType {
|
||||||
|
/** Workers are started from Thread.ofVirtual(). */
VIRTUAL,
|
||||||
|
/** Workers are started from Thread.ofPlatform() with the daemon flag set. */
PLATFORM
|
||||||
|
}
|
||||||
|
|
||||||
public void submit(Task task) throws InterruptedException {
|
public void submit(Task task) throws InterruptedException {
|
||||||
tasks.put(task);
|
tasks.put(task);
|
||||||
}
|
}
|
||||||
|
@@ -62,6 +62,7 @@ dependencies {
|
|||||||
implementation libs.jwarc
|
implementation libs.jwarc
|
||||||
|
|
||||||
implementation libs.jsoup
|
implementation libs.jsoup
|
||||||
|
implementation libs.pdfbox
|
||||||
|
|
||||||
implementation libs.guava
|
implementation libs.guava
|
||||||
implementation dependencies.create(libs.guice.get()) {
|
implementation dependencies.create(libs.guice.get()) {
|
||||||
@@ -87,6 +88,8 @@ dependencies {
|
|||||||
implementation libs.commons.compress
|
implementation libs.commons.compress
|
||||||
implementation libs.sqlite
|
implementation libs.sqlite
|
||||||
|
|
||||||
|
implementation libs.bundles.httpcomponents
|
||||||
|
|
||||||
testImplementation libs.bundles.slf4j.test
|
testImplementation libs.bundles.slf4j.test
|
||||||
testImplementation libs.bundles.junit
|
testImplementation libs.bundles.junit
|
||||||
testImplementation libs.mockito
|
testImplementation libs.mockito
|
||||||
|
@@ -1,8 +1,8 @@
|
|||||||
package nu.marginalia.converting.model;
|
package nu.marginalia.converting.model;
|
||||||
|
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.HtmlFeature;
|
import nu.marginalia.model.crawl.HtmlFeature;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
|
||||||
import nu.marginalia.model.idx.DocumentMetadata;
|
import nu.marginalia.model.idx.DocumentMetadata;
|
||||||
|
|
||||||
import javax.annotation.Nullable;
|
import javax.annotation.Nullable;
|
||||||
@@ -21,7 +21,7 @@ public class ProcessedDocumentDetails {
|
|||||||
public long hashCode;
|
public long hashCode;
|
||||||
|
|
||||||
public Set<HtmlFeature> features;
|
public Set<HtmlFeature> features;
|
||||||
public HtmlStandard standard;
|
public DocumentFormat format;
|
||||||
|
|
||||||
public List<EdgeUrl> linksInternal;
|
public List<EdgeUrl> linksInternal;
|
||||||
public List<EdgeUrl> linksExternal;
|
public List<EdgeUrl> linksExternal;
|
||||||
@@ -30,6 +30,6 @@ public class ProcessedDocumentDetails {
|
|||||||
public GeneratorType generator;
|
public GeneratorType generator;
|
||||||
|
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "ProcessedDocumentDetails(title=" + this.title + ", description=" + this.description + ", pubYear=" + this.pubYear + ", length=" + this.length + ", quality=" + this.quality + ", hashCode=" + this.hashCode + ", features=" + this.features + ", standard=" + this.standard + ", linksInternal=" + this.linksInternal + ", linksExternal=" + this.linksExternal + ", metadata=" + this.metadata + ", generator=" + this.generator + ")";
|
return "ProcessedDocumentDetails(title=" + this.title + ", description=" + this.description + ", pubYear=" + this.pubYear + ", length=" + this.length + ", quality=" + this.quality + ", hashCode=" + this.hashCode + ", features=" + this.features + ", standard=" + this.format + ", linksInternal=" + this.linksInternal + ", linksExternal=" + this.linksExternal + ", metadata=" + this.metadata + ", generator=" + this.generator + ")";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -7,6 +7,7 @@ import nu.marginalia.converting.model.DisqualifiedException;
|
|||||||
import nu.marginalia.converting.model.ProcessedDocument;
|
import nu.marginalia.converting.model.ProcessedDocument;
|
||||||
import nu.marginalia.converting.processor.plugin.AbstractDocumentProcessorPlugin;
|
import nu.marginalia.converting.processor.plugin.AbstractDocumentProcessorPlugin;
|
||||||
import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
|
import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
|
||||||
|
import nu.marginalia.converting.processor.plugin.PdfDocumentProcessorPlugin;
|
||||||
import nu.marginalia.converting.processor.plugin.PlainTextDocumentProcessorPlugin;
|
import nu.marginalia.converting.processor.plugin.PlainTextDocumentProcessorPlugin;
|
||||||
import nu.marginalia.keyword.LinkTexts;
|
import nu.marginalia.keyword.LinkTexts;
|
||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
@@ -33,7 +34,8 @@ public class DocumentProcessor {
|
|||||||
private static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
|
private static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
|
||||||
"application/xhtml",
|
"application/xhtml",
|
||||||
"text/html",
|
"text/html",
|
||||||
"text/plain");
|
"text/plain",
|
||||||
|
"application/pdf");
|
||||||
|
|
||||||
|
|
||||||
private final List<AbstractDocumentProcessorPlugin> processorPlugins = new ArrayList<>();
|
private final List<AbstractDocumentProcessorPlugin> processorPlugins = new ArrayList<>();
|
||||||
@@ -42,12 +44,14 @@ public class DocumentProcessor {
|
|||||||
@Inject
|
@Inject
|
||||||
public DocumentProcessor(HtmlDocumentProcessorPlugin htmlDocumentProcessorPlugin,
|
public DocumentProcessor(HtmlDocumentProcessorPlugin htmlDocumentProcessorPlugin,
|
||||||
PlainTextDocumentProcessorPlugin plainTextDocumentProcessorPlugin,
|
PlainTextDocumentProcessorPlugin plainTextDocumentProcessorPlugin,
|
||||||
|
PdfDocumentProcessorPlugin pdfDocumentProcessorPlugin,
|
||||||
AnchorTextKeywords anchorTextKeywords)
|
AnchorTextKeywords anchorTextKeywords)
|
||||||
{
|
{
|
||||||
this.anchorTextKeywords = anchorTextKeywords;
|
this.anchorTextKeywords = anchorTextKeywords;
|
||||||
|
|
||||||
processorPlugins.add(htmlDocumentProcessorPlugin);
|
processorPlugins.add(htmlDocumentProcessorPlugin);
|
||||||
processorPlugins.add(plainTextDocumentProcessorPlugin);
|
processorPlugins.add(plainTextDocumentProcessorPlugin);
|
||||||
|
processorPlugins.add(pdfDocumentProcessorPlugin);
|
||||||
}
|
}
|
||||||
|
|
||||||
public ProcessedDocument process(CrawledDocument crawledDocument,
|
public ProcessedDocument process(CrawledDocument crawledDocument,
|
||||||
|
@@ -2,9 +2,9 @@ package nu.marginalia.converting.processor.logic;
|
|||||||
|
|
||||||
import crawlercommons.utils.Strings;
|
import crawlercommons.utils.Strings;
|
||||||
import nu.marginalia.converting.model.DisqualifiedException;
|
import nu.marginalia.converting.model.DisqualifiedException;
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.crawl.HtmlFeature;
|
import nu.marginalia.model.crawl.HtmlFeature;
|
||||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
|
||||||
import org.jetbrains.annotations.NotNull;
|
import org.jetbrains.annotations.NotNull;
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
import org.jsoup.nodes.Element;
|
import org.jsoup.nodes.Element;
|
||||||
@@ -17,7 +17,7 @@ import java.util.Set;
|
|||||||
public class DocumentValuator {
|
public class DocumentValuator {
|
||||||
|
|
||||||
public double getQuality(CrawledDocument crawledDocument,
|
public double getQuality(CrawledDocument crawledDocument,
|
||||||
HtmlStandard htmlStandard,
|
DocumentFormat htmlStandard,
|
||||||
Document parsedDocument,
|
Document parsedDocument,
|
||||||
int textLength) throws DisqualifiedException {
|
int textLength) throws DisqualifiedException {
|
||||||
|
|
||||||
|
@@ -1,7 +1,7 @@
|
|||||||
package nu.marginalia.converting.processor.logic;
|
package nu.marginalia.converting.processor.logic;
|
||||||
|
|
||||||
import com.google.common.base.Strings;
|
import com.google.common.base.Strings;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
import org.jsoup.nodes.DocumentType;
|
import org.jsoup.nodes.DocumentType;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
@@ -12,54 +12,54 @@ public class HtmlStandardExtractor {
|
|||||||
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(HtmlStandardExtractor.class);
|
private static final Logger logger = LoggerFactory.getLogger(HtmlStandardExtractor.class);
|
||||||
|
|
||||||
public static HtmlStandard parseDocType(DocumentType docType) {
|
public static DocumentFormat parseDocType(DocumentType docType) {
|
||||||
if (null == docType) {
|
if (null == docType) {
|
||||||
return HtmlStandard.UNKNOWN;
|
return DocumentFormat.UNKNOWN;
|
||||||
}
|
}
|
||||||
|
|
||||||
String publicId = docType.publicId();
|
String publicId = docType.publicId();
|
||||||
if (Strings.isNullOrEmpty(publicId))
|
if (Strings.isNullOrEmpty(publicId))
|
||||||
return HtmlStandard.HTML5;
|
return DocumentFormat.HTML5;
|
||||||
|
|
||||||
publicId = publicId.toUpperCase();
|
publicId = publicId.toUpperCase();
|
||||||
if (publicId.startsWith("-//SOFTQUAD SOFTWARE//DTD") && publicId.contains("HTML 4")) {
|
if (publicId.startsWith("-//SOFTQUAD SOFTWARE//DTD") && publicId.contains("HTML 4")) {
|
||||||
return HtmlStandard.HTML4;
|
return DocumentFormat.HTML4;
|
||||||
}
|
}
|
||||||
if (publicId.startsWith("-//SOFTQUAD SOFTWARE//DTD") && publicId.contains("HTML 3")) {
|
if (publicId.startsWith("-//SOFTQUAD SOFTWARE//DTD") && publicId.contains("HTML 3")) {
|
||||||
return HtmlStandard.HTML123;
|
return DocumentFormat.HTML123;
|
||||||
}
|
}
|
||||||
if (publicId.startsWith("-//INTERNET/RFC XXXX//EN"))
|
if (publicId.startsWith("-//INTERNET/RFC XXXX//EN"))
|
||||||
return HtmlStandard.HTML123;
|
return DocumentFormat.HTML123;
|
||||||
if (publicId.startsWith("-//NETSCAPE COMM. CORP"))
|
if (publicId.startsWith("-//NETSCAPE COMM. CORP"))
|
||||||
return HtmlStandard.HTML123;
|
return DocumentFormat.HTML123;
|
||||||
if (publicId.startsWith("-//SQ//DTD HTML 2"))
|
if (publicId.startsWith("-//SQ//DTD HTML 2"))
|
||||||
return HtmlStandard.HTML123;
|
return DocumentFormat.HTML123;
|
||||||
if (publicId.startsWith("-//SOFTQUAD//DTD HTML 2"))
|
if (publicId.startsWith("-//SOFTQUAD//DTD HTML 2"))
|
||||||
return HtmlStandard.HTML123;
|
return DocumentFormat.HTML123;
|
||||||
if (publicId.startsWith("-//W3O//DTD W3 HTML 2"))
|
if (publicId.startsWith("-//W3O//DTD W3 HTML 2"))
|
||||||
return HtmlStandard.HTML123;
|
return DocumentFormat.HTML123;
|
||||||
if (publicId.startsWith("-//IETF//DTD HTML 2"))
|
if (publicId.startsWith("-//IETF//DTD HTML 2"))
|
||||||
return HtmlStandard.HTML123;
|
return DocumentFormat.HTML123;
|
||||||
if (publicId.startsWith("-//IETF//DTD HTML//EN"))
|
if (publicId.startsWith("-//IETF//DTD HTML//EN"))
|
||||||
return HtmlStandard.HTML123;
|
return DocumentFormat.HTML123;
|
||||||
if (publicId.startsWith("-/W3C//DTD HTML 3"))
|
if (publicId.startsWith("-/W3C//DTD HTML 3"))
|
||||||
return HtmlStandard.HTML123;
|
return DocumentFormat.HTML123;
|
||||||
if (publicId.startsWith("-/W3C/DTD HTML 3"))
|
if (publicId.startsWith("-/W3C/DTD HTML 3"))
|
||||||
return HtmlStandard.HTML123;
|
return DocumentFormat.HTML123;
|
||||||
if (publicId.startsWith("-//IETF//DTD HTML 3"))
|
if (publicId.startsWith("-//IETF//DTD HTML 3"))
|
||||||
return HtmlStandard.HTML123;
|
return DocumentFormat.HTML123;
|
||||||
if (publicId.startsWith("-//W3C//DTD XHTML"))
|
if (publicId.startsWith("-//W3C//DTD XHTML"))
|
||||||
return HtmlStandard.XHTML;
|
return DocumentFormat.XHTML;
|
||||||
if (publicId.startsWith("ISO/IEC 15445:2000//DTD"))
|
if (publicId.startsWith("ISO/IEC 15445:2000//DTD"))
|
||||||
return HtmlStandard.XHTML;
|
return DocumentFormat.XHTML;
|
||||||
if (publicId.startsWith("-//W3C//DTD HTML"))
|
if (publicId.startsWith("-//W3C//DTD HTML"))
|
||||||
return HtmlStandard.HTML4;
|
return DocumentFormat.HTML4;
|
||||||
|
|
||||||
logger.debug("Unknown publicID standard {}", publicId);
|
logger.debug("Unknown publicID standard {}", publicId);
|
||||||
return HtmlStandard.UNKNOWN;
|
return DocumentFormat.UNKNOWN;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static HtmlStandard sniffHtmlStandard(Document parsed) {
|
public static DocumentFormat sniffHtmlStandard(Document parsed) {
|
||||||
int html4Attributes = 0;
|
int html4Attributes = 0;
|
||||||
int html5Attributes = 0;
|
int html5Attributes = 0;
|
||||||
|
|
||||||
@@ -73,11 +73,11 @@ public class HtmlStandardExtractor {
|
|||||||
html4Attributes++;
|
html4Attributes++;
|
||||||
}
|
}
|
||||||
if (html5Attributes > 0) {
|
if (html5Attributes > 0) {
|
||||||
return HtmlStandard.HTML5;
|
return DocumentFormat.HTML5;
|
||||||
}
|
}
|
||||||
if (html4Attributes > 0) {
|
if (html4Attributes > 0) {
|
||||||
return HtmlStandard.HTML4;
|
return DocumentFormat.HTML4;
|
||||||
}
|
}
|
||||||
return HtmlStandard.HTML123;
|
return DocumentFormat.HTML123;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -7,11 +7,11 @@ import nu.marginalia.keyword.LinkTexts;
|
|||||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||||
import nu.marginalia.language.filter.LanguageFilter;
|
import nu.marginalia.language.filter.LanguageFilter;
|
||||||
import nu.marginalia.language.model.DocumentLanguageData;
|
import nu.marginalia.language.model.DocumentLanguageData;
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.HtmlFeature;
|
import nu.marginalia.model.crawl.HtmlFeature;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
|
||||||
|
|
||||||
import javax.annotation.Nullable;
|
import javax.annotation.Nullable;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
@@ -73,7 +73,7 @@ public abstract class AbstractDocumentProcessorPlugin {
|
|||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
public MetaTagsBuilder addFormat(HtmlStandard standard) {
|
public MetaTagsBuilder addFormat(DocumentFormat standard) {
|
||||||
|
|
||||||
add("format", standard);
|
add("format", standard);
|
||||||
|
|
||||||
|
@@ -25,12 +25,12 @@ import nu.marginalia.language.model.DocumentLanguageData;
|
|||||||
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
||||||
import nu.marginalia.link_parser.FeedExtractor;
|
import nu.marginalia.link_parser.FeedExtractor;
|
||||||
import nu.marginalia.link_parser.LinkParser;
|
import nu.marginalia.link_parser.LinkParser;
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.HtmlFeature;
|
import nu.marginalia.model.crawl.HtmlFeature;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
|
||||||
import nu.marginalia.model.idx.DocumentFlags;
|
import nu.marginalia.model.idx.DocumentFlags;
|
||||||
import nu.marginalia.model.idx.DocumentMetadata;
|
import nu.marginalia.model.idx.DocumentMetadata;
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
@@ -137,8 +137,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
|
|
||||||
|
|
||||||
final int length = getLength(doc);
|
final int length = getLength(doc);
|
||||||
final HtmlStandard standard = getHtmlStandard(doc);
|
final DocumentFormat format = getDocumentFormat(doc);
|
||||||
final double quality = documentValuator.getQuality(crawledDocument, standard, doc, length);
|
final double quality = documentValuator.getQuality(crawledDocument, format, doc, length);
|
||||||
|
|
||||||
if (isDisqualified(documentClass, url, quality, doc.title())) {
|
if (isDisqualified(documentClass, url, quality, doc.title())) {
|
||||||
throw new DisqualifiedException(DisqualificationReason.QUALITY);
|
throw new DisqualifiedException(DisqualificationReason.QUALITY);
|
||||||
@@ -152,7 +152,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
var ret = new ProcessedDocumentDetails();
|
var ret = new ProcessedDocumentDetails();
|
||||||
|
|
||||||
ret.length = length;
|
ret.length = length;
|
||||||
ret.standard = standard;
|
ret.format = format;
|
||||||
ret.title = specialization.getTitle(doc, dld, crawledDocument.url);
|
ret.title = specialization.getTitle(doc, dld, crawledDocument.url);
|
||||||
|
|
||||||
final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, documentHeaders, dld);
|
final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, documentHeaders, dld);
|
||||||
@@ -161,7 +161,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
ret.quality = documentValuator.adjustQuality(quality, features);
|
ret.quality = documentValuator.adjustQuality(quality, features);
|
||||||
ret.hashCode = dld.localitySensitiveHashCode();
|
ret.hashCode = dld.localitySensitiveHashCode();
|
||||||
|
|
||||||
PubDate pubDate = pubDateSniffer.getPubDate(documentHeaders, url, doc, standard, true);
|
PubDate pubDate = pubDateSniffer.getPubDate(documentHeaders, url, doc, format, true);
|
||||||
|
|
||||||
EnumSet<DocumentFlags> documentFlags = documentFlags(features, generatorParts.type());
|
EnumSet<DocumentFlags> documentFlags = documentFlags(features, generatorParts.type());
|
||||||
|
|
||||||
@@ -180,7 +180,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
.addPubDate(pubDate)
|
.addPubDate(pubDate)
|
||||||
.addUrl(url)
|
.addUrl(url)
|
||||||
.addFeatures(features)
|
.addFeatures(features)
|
||||||
.addFormat(standard)
|
.addFormat(format)
|
||||||
.addGenerator(generatorParts.keywords())
|
.addGenerator(generatorParts.keywords())
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
@@ -316,12 +316,12 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
return linkTerms;
|
return linkTerms;
|
||||||
}
|
}
|
||||||
|
|
||||||
private HtmlStandard getHtmlStandard(Document doc) {
|
private DocumentFormat getDocumentFormat(Document doc) {
|
||||||
HtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(doc.documentType());
|
DocumentFormat format = HtmlStandardExtractor.parseDocType(doc.documentType());
|
||||||
if (HtmlStandard.UNKNOWN.equals(htmlStandard)) {
|
if (DocumentFormat.UNKNOWN.equals(format)) {
|
||||||
return HtmlStandardExtractor.sniffHtmlStandard(doc);
|
return HtmlStandardExtractor.sniffHtmlStandard(doc);
|
||||||
}
|
}
|
||||||
return htmlStandard;
|
return format;
|
||||||
}
|
}
|
||||||
|
|
||||||
private int getLength(Document doc) {
|
private int getLength(Document doc) {
|
||||||
|
@@ -0,0 +1,286 @@
|
|||||||
|
package nu.marginalia.converting.processor.plugin;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.name.Named;
|
||||||
|
import nu.marginalia.converting.model.DisqualifiedException;
|
||||||
|
import nu.marginalia.converting.model.ProcessedDocumentDetails;
|
||||||
|
import nu.marginalia.converting.processor.DocumentClass;
|
||||||
|
import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
|
||||||
|
import nu.marginalia.converting.processor.plugin.specialization.DefaultSpecialization;
|
||||||
|
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
||||||
|
import nu.marginalia.keyword.LinkTexts;
|
||||||
|
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||||
|
import nu.marginalia.language.filter.LanguageFilter;
|
||||||
|
import nu.marginalia.language.model.DocumentLanguageData;
|
||||||
|
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
|
import nu.marginalia.model.EdgeUrl;
|
||||||
|
import nu.marginalia.model.crawl.HtmlFeature;
|
||||||
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
|
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||||
|
import nu.marginalia.model.idx.DocumentFlags;
|
||||||
|
import nu.marginalia.model.idx.DocumentMetadata;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.pdfbox.Loader;
|
||||||
|
import org.apache.pdfbox.text.HeadingAwarePDFTextStripper;
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.net.URISyntaxException;
|
||||||
|
import java.time.LocalDate;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
|
||||||
|
public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {
|
||||||
|
|
||||||
|
private final int maxTitleLength;
|
||||||
|
private final DocumentKeywordExtractor keywordExtractor;
|
||||||
|
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
|
||||||
|
private final DocumentLengthLogic documentLengthLogic;
|
||||||
|
private final DefaultSpecialization defaultSpecialization;
|
||||||
|
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(PdfDocumentProcessorPlugin.class);
|
||||||
|
|
||||||
|
/**
 * Creates the PDF processor plugin from its injected collaborators.
 *
 * @param maxTitleLength            maximum length the extracted title is truncated to
 *                                  (bound to the name "max-title-length")
 * @param languageFilter            handed to the superclass constructor
 * @param sentenceExtractorProvider supplies the sentence extractor run on the converted document
 * @param keywordExtractor          extracts the keywords for the converted document
 * @param documentLengthLogic       validates the document length and encodes its average length
 * @param defaultSpecialization     provides the title and the fallback summary
 */
@Inject
public PdfDocumentProcessorPlugin(@Named("max-title-length") Integer maxTitleLength,
                                  LanguageFilter languageFilter,
                                  ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
                                  DocumentKeywordExtractor keywordExtractor,
                                  DocumentLengthLogic documentLengthLogic,
                                  DefaultSpecialization defaultSpecialization)
{
    super(languageFilter);

    // Scalar configuration
    this.maxTitleLength = maxTitleLength;

    // Collaborators
    this.keywordExtractor = keywordExtractor;
    this.sentenceExtractorProvider = sentenceExtractorProvider;
    this.documentLengthLogic = documentLengthLogic;
    this.defaultSpecialization = defaultSpecialization;
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean isApplicable(CrawledDocument doc) {
|
||||||
|
String contentType = doc.contentType.toLowerCase();
|
||||||
|
|
||||||
|
if (contentType.equals("application/pdf"))
|
||||||
|
return true;
|
||||||
|
if (contentType.startsWith("application/pdf;")) // charset=blabla
|
||||||
|
return true;
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DetailsWithWords createDetails(CrawledDocument crawledDocument,
|
||||||
|
LinkTexts linkTexts,
|
||||||
|
DocumentClass documentClass)
|
||||||
|
throws DisqualifiedException, URISyntaxException, IOException {
|
||||||
|
|
||||||
|
String documentBody = crawledDocument.documentBody();
|
||||||
|
|
||||||
|
if (languageFilter.isBlockedUnicodeRange(documentBody)) {
|
||||||
|
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE);
|
||||||
|
}
|
||||||
|
|
||||||
|
final EdgeUrl url = new EdgeUrl(crawledDocument.url);
|
||||||
|
|
||||||
|
|
||||||
|
Document doc;
|
||||||
|
try {
|
||||||
|
doc = convertPdfToHtml(crawledDocument.documentBodyBytes);
|
||||||
|
} catch (IOException e) {
|
||||||
|
logger.error("Failed to convert PDF file {} - {}", url, e.getMessage());
|
||||||
|
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.ERROR);
|
||||||
|
}
|
||||||
|
|
||||||
|
DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(doc);
|
||||||
|
|
||||||
|
checkDocumentLanguage(dld);
|
||||||
|
|
||||||
|
documentLengthLogic.validateLength(dld, 1.0);
|
||||||
|
|
||||||
|
var ret = new ProcessedDocumentDetails();
|
||||||
|
|
||||||
|
ret.length = documentBody.length();
|
||||||
|
|
||||||
|
ret.format = DocumentFormat.PDF;
|
||||||
|
ret.title = StringUtils.truncate(defaultSpecialization.getTitle(doc, dld, url.toString()), maxTitleLength);
|
||||||
|
|
||||||
|
ret.quality = -5;
|
||||||
|
|
||||||
|
ret.features = Set.of(HtmlFeature.PDF);
|
||||||
|
ret.description = getDescription(doc);
|
||||||
|
ret.hashCode = dld.localitySensitiveHashCode();
|
||||||
|
|
||||||
|
final PubDate pubDate = new PubDate(LocalDate.ofYearDay(1993, 1));
|
||||||
|
|
||||||
|
EnumSet<DocumentFlags> documentFlags = EnumSet.of(DocumentFlags.PdfFile);
|
||||||
|
|
||||||
|
ret.metadata = new DocumentMetadata(
|
||||||
|
documentLengthLogic.getEncodedAverageLength(dld),
|
||||||
|
pubDate.yearByte(),
|
||||||
|
(int) -ret.quality,
|
||||||
|
documentFlags);
|
||||||
|
|
||||||
|
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, linkTexts, url);
|
||||||
|
|
||||||
|
var tagWords = new MetaTagsBuilder()
|
||||||
|
.addPubDate(pubDate)
|
||||||
|
.addUrl(url)
|
||||||
|
.addFeatures(ret.features)
|
||||||
|
.addFormat(ret.format)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
words.addAllSyntheticTerms(tagWords);
|
||||||
|
|
||||||
|
if (pubDate.hasYear()) {
|
||||||
|
ret.pubYear = pubDate.year();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* These are assumed to be populated */
|
||||||
|
ret.linksInternal = new ArrayList<>();
|
||||||
|
ret.linksExternal = new ArrayList<>();
|
||||||
|
|
||||||
|
return new DetailsWithWords(ret, words);
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getDescription(Document doc) {
|
||||||
|
int cnt = 0;
|
||||||
|
boolean useNext = false;
|
||||||
|
for (var ptag : doc.getElementsByTag("p")) {
|
||||||
|
String text = ptag.text();
|
||||||
|
|
||||||
|
// Many academic documents have an abstract at the start of the document,
|
||||||
|
// which makes a nice summary. Though they tend to bleed into the text,
|
||||||
|
// so we check for the word "Abstract" at the start of the paragraph.
|
||||||
|
|
||||||
|
if (text.startsWith("Abstract ")) {
|
||||||
|
return StringUtils.abbreviate(text.substring("Abstract ".length()), "...", 255);
|
||||||
|
}
|
||||||
|
else if (text.equals("Abstract")) {
|
||||||
|
useNext = true;
|
||||||
|
}
|
||||||
|
else if (useNext) {
|
||||||
|
return StringUtils.abbreviate(text, "...", 255);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (++cnt > 15) { // Don't scan the entire document
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fall back to the default specialization
|
||||||
|
return defaultSpecialization.getSummary(doc, Set.of());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Convert the provided PDF bytes into a HTML rendering that can be fed
|
||||||
|
* to the HTML processor.
|
||||||
|
*/
|
||||||
|
Document convertPdfToHtml(byte[] pdfBytes) throws IOException {
|
||||||
|
try (var doc = Loader.loadPDF(pdfBytes)) {
|
||||||
|
String docMetaTitle = Objects.requireNonNullElse(doc.getDocumentInformation().getTitle(), "");
|
||||||
|
|
||||||
|
var stripper = new HeadingAwarePDFTextStripper();
|
||||||
|
stripper.setStartPage(1);
|
||||||
|
stripper.setSortByPosition(true);
|
||||||
|
stripper.setWordSeparator(" ");
|
||||||
|
|
||||||
|
// Increase the tolerance for line spacing to deal better with paragraphs.
|
||||||
|
stripper.setDropThreshold(5f);
|
||||||
|
|
||||||
|
stripper.setPageStart("<div>");
|
||||||
|
stripper.setParagraphStart("<p>");
|
||||||
|
stripper.setParagraphEnd("</p>\n");
|
||||||
|
stripper.setPageEnd("</div>\n");
|
||||||
|
stripper.setHeadingStart("<h1>");
|
||||||
|
stripper.setHeadingEnd("</h1>\n");
|
||||||
|
stripper.setLineSeparator("\n");
|
||||||
|
|
||||||
|
String text = stripper.getText(doc);
|
||||||
|
|
||||||
|
StringBuilder htmlBuilder = new StringBuilder(text.length() + 1024);
|
||||||
|
htmlBuilder.append("<html><body>")
|
||||||
|
.append(text)
|
||||||
|
.append("</body></html>");
|
||||||
|
|
||||||
|
var parsed = Jsoup.parse(htmlBuilder.toString());
|
||||||
|
|
||||||
|
repairDOM(parsed);
|
||||||
|
|
||||||
|
for (var heading : parsed.getElementsByTag("h1")) {
|
||||||
|
String headingText = heading.text();
|
||||||
|
if (headingText.length() > 2) {
|
||||||
|
parsed.title(headingText);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if (parsed.title().isEmpty()) {
|
||||||
|
// Prefer setting the title to the first paragraph in the
|
||||||
|
// document, as this is almost always correct. Otherwise,
|
||||||
|
// we fall back on the metadata title, which is almost always
|
||||||
|
// useless
|
||||||
|
|
||||||
|
var firstP = parsed.getElementsByTag("p").first();
|
||||||
|
if (firstP != null) parsed.title(firstP.text());
|
||||||
|
else parsed.title(docMetaTitle);
|
||||||
|
}
|
||||||
|
return parsed;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Repair the DOM to remove some common issues with PDF conversion,
|
||||||
|
* including empty paragraphs, and multiline headers that are split into multiple
|
||||||
|
* conescutive h1 tags.
|
||||||
|
*/
|
||||||
|
private void repairDOM(Document parsed) {
|
||||||
|
|
||||||
|
// <p><h1>...</h1></p> -> <h1>...</h1>
|
||||||
|
parsed.getElementsByTag("h1").forEach(h1 -> {
|
||||||
|
var parent = h1.parent();
|
||||||
|
if (parent == null || !"p".equals(parent.tagName())) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (parent.childrenSize() == 1) {
|
||||||
|
parent.replaceWith(h1);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Remove empty <p> tags
|
||||||
|
parsed.getElementsByTag("p").forEach(p -> {
|
||||||
|
if (p.childrenSize() == 0 && !p.hasText()) {
|
||||||
|
p.remove();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// <h1>...</h1><h1>...</h1> -> <h1>...</h1>
|
||||||
|
parsed.getElementsByTag("h1").forEach(h1 -> {
|
||||||
|
var nextSibling = h1.nextElementSibling();
|
||||||
|
if (nextSibling == null || !"h1".equals(nextSibling.tagName())) {
|
||||||
|
return; // Short-circuit to avoid unnecessary work
|
||||||
|
}
|
||||||
|
|
||||||
|
StringJoiner joiner = new StringJoiner(" ");
|
||||||
|
joiner.add(h1.text());
|
||||||
|
|
||||||
|
for (var sibling : h1.nextElementSiblings()) {
|
||||||
|
if (!"h1".equals(sibling.tagName()))
|
||||||
|
break;
|
||||||
|
joiner.add(sibling.text());
|
||||||
|
sibling.remove();
|
||||||
|
}
|
||||||
|
|
||||||
|
h1.text(joiner.toString());
|
||||||
|
});
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@@ -13,10 +13,10 @@ import nu.marginalia.keyword.LinkTexts;
|
|||||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||||
import nu.marginalia.language.filter.LanguageFilter;
|
import nu.marginalia.language.filter.LanguageFilter;
|
||||||
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
|
||||||
import nu.marginalia.model.idx.DocumentFlags;
|
import nu.marginalia.model.idx.DocumentFlags;
|
||||||
import nu.marginalia.model.idx.DocumentMetadata;
|
import nu.marginalia.model.idx.DocumentMetadata;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
@@ -91,7 +91,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
|
|||||||
|
|
||||||
ret.length = documentBody.length();
|
ret.length = documentBody.length();
|
||||||
|
|
||||||
ret.standard = HtmlStandard.PLAIN;
|
ret.format = DocumentFormat.PLAIN;
|
||||||
ret.title = StringUtils.truncate(plainTextLogic.getTitle(url, firstFewLines), maxTitleLength);
|
ret.title = StringUtils.truncate(plainTextLogic.getTitle(url, firstFewLines), maxTitleLength);
|
||||||
|
|
||||||
ret.quality = -1;
|
ret.quality = -1;
|
||||||
@@ -113,7 +113,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
|
|||||||
.addPubDate(pubDate)
|
.addPubDate(pubDate)
|
||||||
.addUrl(url)
|
.addUrl(url)
|
||||||
.addFeatures(ret.features)
|
.addFeatures(ret.features)
|
||||||
.addFormat(ret.standard)
|
.addFormat(ret.format)
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
words.addAllSyntheticTerms(tagWords);
|
words.addAllSyntheticTerms(tagWords);
|
||||||
|
@@ -1,12 +1,13 @@
|
|||||||
package nu.marginalia.converting.processor.pubdate;
|
package nu.marginalia.converting.processor.pubdate;
|
||||||
|
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
import nu.marginalia.model.DocumentFormat;
|
||||||
|
|
||||||
public class PubDateFromHtmlStandard {
|
public class PubDateFromHtmlStandard {
|
||||||
/** Used to bias pub date heuristics */
|
/** Used to bias pub date heuristics */
|
||||||
public static int blindGuess(HtmlStandard standard) {
|
public static int blindGuess(DocumentFormat format) {
|
||||||
return switch (standard) {
|
return switch (format) {
|
||||||
case PLAIN -> 1993;
|
case PLAIN -> 1993;
|
||||||
|
case PDF -> 2010;
|
||||||
case HTML123 -> 1997;
|
case HTML123 -> 1997;
|
||||||
case HTML4, XHTML -> 2006;
|
case HTML4, XHTML -> 2006;
|
||||||
case HTML5 -> 2018;
|
case HTML5 -> 2018;
|
||||||
@@ -21,8 +22,8 @@ public class PubDateFromHtmlStandard {
|
|||||||
* Discovering publication year involves a lot of guesswork, this helps
|
* Discovering publication year involves a lot of guesswork, this helps
|
||||||
* keep the guesses relatively sane.
|
* keep the guesses relatively sane.
|
||||||
*/
|
*/
|
||||||
public static boolean isGuessPlausible(HtmlStandard standard, int year) {
|
public static boolean isGuessPlausible(DocumentFormat format, int year) {
|
||||||
switch (standard) {
|
switch (format) {
|
||||||
case HTML123:
|
case HTML123:
|
||||||
return year <= 2000;
|
return year <= 2000;
|
||||||
case XHTML:
|
case XHTML:
|
||||||
|
@@ -1,14 +1,14 @@
|
|||||||
package nu.marginalia.converting.processor.pubdate;
|
package nu.marginalia.converting.processor.pubdate;
|
||||||
|
|
||||||
import nu.marginalia.converting.model.DocumentHeaders;
|
import nu.marginalia.converting.model.DocumentHeaders;
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
public interface PubDateHeuristic {
|
public interface PubDateHeuristic {
|
||||||
|
|
||||||
Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard);
|
Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard);
|
||||||
}
|
}
|
||||||
|
@@ -1,7 +1,7 @@
|
|||||||
package nu.marginalia.converting.processor.pubdate;
|
package nu.marginalia.converting.processor.pubdate;
|
||||||
|
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
|
||||||
|
|
||||||
import java.time.DateTimeException;
|
import java.time.DateTimeException;
|
||||||
import java.time.LocalDate;
|
import java.time.LocalDate;
|
||||||
@@ -26,7 +26,7 @@ public class PubDateParser {
|
|||||||
.filter(PubDateParser::validateDate);
|
.filter(PubDateParser::validateDate);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Optional<PubDate> attemptParseDate(String date, HtmlStandard standard) {
|
public static Optional<PubDate> attemptParseDate(String date, DocumentFormat standard) {
|
||||||
return Optional.ofNullable(date)
|
return Optional.ofNullable(date)
|
||||||
.filter(str -> str.length() >= 4 && str.length() < 32)
|
.filter(str -> str.length() >= 4 && str.length() < 32)
|
||||||
.flatMap(str ->
|
.flatMap(str ->
|
||||||
@@ -81,7 +81,7 @@ public class PubDateParser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static Optional<PubDate> dateFromHighestYearLookingSubstringWithGuess(String maybe, HtmlStandard standard) {
|
public static Optional<PubDate> dateFromHighestYearLookingSubstringWithGuess(String maybe, DocumentFormat standard) {
|
||||||
int guess = PubDateFromHtmlStandard.blindGuess(standard);
|
int guess = PubDateFromHtmlStandard.blindGuess(standard);
|
||||||
|
|
||||||
var matcher = yearPattern.matcher(maybe);
|
var matcher = yearPattern.matcher(maybe);
|
||||||
@@ -135,7 +135,7 @@ public class PubDateParser {
|
|||||||
return (max + min) / 2;
|
return (max + min) / 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static int guessYear(HtmlStandard standard) {
|
public static int guessYear(DocumentFormat standard) {
|
||||||
// Create some jitter to avoid having documents piling up in the same four years
|
// Create some jitter to avoid having documents piling up in the same four years
|
||||||
// as this would make searching in those years disproportionately useless
|
// as this would make searching in those years disproportionately useless
|
||||||
|
|
||||||
|
@@ -2,9 +2,9 @@ package nu.marginalia.converting.processor.pubdate;
|
|||||||
|
|
||||||
import nu.marginalia.converting.model.DocumentHeaders;
|
import nu.marginalia.converting.model.DocumentHeaders;
|
||||||
import nu.marginalia.converting.processor.pubdate.heuristic.*;
|
import nu.marginalia.converting.processor.pubdate.heuristic.*;
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
@@ -38,7 +38,7 @@ public class PubDateSniffer {
|
|||||||
heuristics.add(new PubDateHeuristicGuessFromHtmlStandard());
|
heuristics.add(new PubDateHeuristicGuessFromHtmlStandard());
|
||||||
}
|
}
|
||||||
|
|
||||||
public PubDate getPubDate(DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard, boolean runExpensive) {
|
public PubDate getPubDate(DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard, boolean runExpensive) {
|
||||||
final PubDateEffortLevel effortLevel = runExpensive ? PubDateEffortLevel.HIGH : PubDateEffortLevel.LOW;
|
final PubDateEffortLevel effortLevel = runExpensive ? PubDateEffortLevel.HIGH : PubDateEffortLevel.LOW;
|
||||||
|
|
||||||
for (var heuristic : heuristics) {
|
for (var heuristic : heuristics) {
|
||||||
|
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
|
|||||||
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
|
||||||
import org.jetbrains.annotations.NotNull;
|
import org.jetbrains.annotations.NotNull;
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
import org.jsoup.nodes.Element;
|
import org.jsoup.nodes.Element;
|
||||||
@@ -19,7 +19,7 @@ import java.util.Optional;
|
|||||||
public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
|
public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
|
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
|
||||||
if (effortLevel == PubDateEffortLevel.LOW)
|
if (effortLevel == PubDateEffortLevel.LOW)
|
||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
|
|
||||||
@@ -33,9 +33,9 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
|
|||||||
|
|
||||||
private static class DateExtractingNodeVisitorPass implements NodeFilter {
|
private static class DateExtractingNodeVisitorPass implements NodeFilter {
|
||||||
public PubDate pubDate;
|
public PubDate pubDate;
|
||||||
private final HtmlStandard htmlStandard;
|
private final DocumentFormat htmlStandard;
|
||||||
|
|
||||||
private DateExtractingNodeVisitorPass(HtmlStandard htmlStandard) {
|
private DateExtractingNodeVisitorPass(DocumentFormat htmlStandard) {
|
||||||
this.htmlStandard = htmlStandard;
|
this.htmlStandard = htmlStandard;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -135,7 +135,7 @@ public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private void parse(String text) {
|
private void parse(String text) {
|
||||||
if (htmlStandard == HtmlStandard.UNKNOWN) {
|
if (htmlStandard == DocumentFormat.UNKNOWN) {
|
||||||
PubDateParser
|
PubDateParser
|
||||||
.dateFromHighestYearLookingSubstring(text)
|
.dateFromHighestYearLookingSubstring(text)
|
||||||
.ifPresent(this::setPubDate);
|
.ifPresent(this::setPubDate);
|
||||||
|
@@ -5,9 +5,9 @@ import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
|||||||
import nu.marginalia.converting.processor.pubdate.PubDateFromHtmlStandard;
|
import nu.marginalia.converting.processor.pubdate.PubDateFromHtmlStandard;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
|
||||||
import org.jetbrains.annotations.NotNull;
|
import org.jetbrains.annotations.NotNull;
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
import org.jsoup.nodes.Node;
|
import org.jsoup.nodes.Node;
|
||||||
@@ -19,7 +19,7 @@ import java.util.Optional;
|
|||||||
public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
|
public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
|
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
|
||||||
if (effortLevel == PubDateEffortLevel.LOW)
|
if (effortLevel == PubDateEffortLevel.LOW)
|
||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
|
|
||||||
@@ -33,9 +33,9 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
|
|||||||
|
|
||||||
private static class DateExtractingNodeVisitor implements NodeFilter {
|
private static class DateExtractingNodeVisitor implements NodeFilter {
|
||||||
public PubDate pubDate;
|
public PubDate pubDate;
|
||||||
private final HtmlStandard htmlStandard;
|
private final DocumentFormat htmlStandard;
|
||||||
|
|
||||||
private DateExtractingNodeVisitor(HtmlStandard htmlStandard) {
|
private DateExtractingNodeVisitor(DocumentFormat htmlStandard) {
|
||||||
this.htmlStandard = htmlStandard;
|
this.htmlStandard = htmlStandard;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -73,7 +73,7 @@ public class PubDateHeuristicDOMParsingPass2 implements PubDateHeuristic {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private void parse(String text) {
|
private void parse(String text) {
|
||||||
if (htmlStandard == HtmlStandard.UNKNOWN) {
|
if (htmlStandard == DocumentFormat.UNKNOWN) {
|
||||||
PubDateParser
|
PubDateParser
|
||||||
.dateFromHighestYearLookingSubstring(text)
|
.dateFromHighestYearLookingSubstring(text)
|
||||||
.ifPresent(this::setPubDate);
|
.ifPresent(this::setPubDate);
|
||||||
|
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
|
|||||||
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
@@ -14,8 +14,8 @@ import java.util.Optional;
|
|||||||
public class PubDateHeuristicGuessFromHtmlStandard implements PubDateHeuristic {
|
public class PubDateHeuristicGuessFromHtmlStandard implements PubDateHeuristic {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
|
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
|
||||||
if (htmlStandard == HtmlStandard.UNKNOWN)
|
if (htmlStandard == DocumentFormat.UNKNOWN)
|
||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
|
|
||||||
return Optional.of(new PubDate(null, PubDateParser.guessYear(htmlStandard)));
|
return Optional.of(new PubDate(null, PubDateParser.guessYear(htmlStandard)));
|
||||||
|
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
|
|||||||
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
@@ -14,7 +14,7 @@ import java.util.Optional;
|
|||||||
public class PubDateHeuristicHtml5AnyTimeTag implements PubDateHeuristic {
|
public class PubDateHeuristicHtml5AnyTimeTag implements PubDateHeuristic {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
|
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
|
||||||
// HTML5, alternative approach
|
// HTML5, alternative approach
|
||||||
for (var tag : document.select("time")) {
|
for (var tag : document.select("time")) {
|
||||||
var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));
|
var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));
|
||||||
|
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
|
|||||||
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
@@ -14,7 +14,7 @@ import java.util.Optional;
|
|||||||
public class PubDateHeuristicHtml5ArticleDateTag implements PubDateHeuristic {
|
public class PubDateHeuristicHtml5ArticleDateTag implements PubDateHeuristic {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
|
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
|
||||||
// HTML5
|
// HTML5
|
||||||
for (var tag : document.select("time[pubdate=\"pubdate\"]")) {
|
for (var tag : document.select("time[pubdate=\"pubdate\"]")) {
|
||||||
var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));
|
var maybeDate = PubDateParser.attemptParseDate(tag.attr("datetime"));
|
||||||
|
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
|
|||||||
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
@@ -14,7 +14,7 @@ import java.util.Optional;
|
|||||||
public class PubDateHeuristicHtml5ItempropDateTag implements PubDateHeuristic {
|
public class PubDateHeuristicHtml5ItempropDateTag implements PubDateHeuristic {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
|
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
|
||||||
for (var tag : document.select("time[itemprop=\"datePublished\"]")) {
|
for (var tag : document.select("time[itemprop=\"datePublished\"]")) {
|
||||||
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
|
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
|
||||||
if (maybeDate.isPresent()) {
|
if (maybeDate.isPresent()) {
|
||||||
|
@@ -8,9 +8,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
|
|||||||
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
@@ -21,7 +21,7 @@ import java.util.Optional;
|
|||||||
public class PubDateHeuristicJSONLD implements PubDateHeuristic {
|
public class PubDateHeuristicJSONLD implements PubDateHeuristic {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
|
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
|
||||||
for (var tag : document.select("script[type=\"application/ld+json\"]")) {
|
for (var tag : document.select("script[type=\"application/ld+json\"]")) {
|
||||||
var maybeDate = parseLdJson(tag.data())
|
var maybeDate = parseLdJson(tag.data())
|
||||||
.flatMap(PubDateParser::attemptParseDate);
|
.flatMap(PubDateParser::attemptParseDate);
|
||||||
|
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
|
|||||||
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@@ -15,7 +15,7 @@ import java.util.Optional;
|
|||||||
public class PubDateHeuristicLastModified implements PubDateHeuristic {
|
public class PubDateHeuristicLastModified implements PubDateHeuristic {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
|
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
|
||||||
List<String> lastModified = headers.get("last-modified");
|
List<String> lastModified = headers.get("last-modified");
|
||||||
if (lastModified.isEmpty())
|
if (lastModified.isEmpty())
|
||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
|
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
|
|||||||
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
@@ -14,7 +14,7 @@ import java.util.Optional;
|
|||||||
public class PubDateHeuristicMicrodata implements PubDateHeuristic {
|
public class PubDateHeuristicMicrodata implements PubDateHeuristic {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
|
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
|
||||||
|
|
||||||
for (var tag : document.select("meta[itemprop=\"datePublished\"]")) {
|
for (var tag : document.select("meta[itemprop=\"datePublished\"]")) {
|
||||||
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
|
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
|
||||||
|
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
|
|||||||
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
@@ -14,7 +14,7 @@ import java.util.Optional;
|
|||||||
public class PubDateHeuristicOpenGraph implements PubDateHeuristic {
|
public class PubDateHeuristicOpenGraph implements PubDateHeuristic {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
|
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
|
||||||
// OG
|
// OG
|
||||||
for (var tag : document.select("meta[property=\"article:published_time\"]")) {
|
for (var tag : document.select("meta[property=\"article:published_time\"]")) {
|
||||||
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
|
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
|
||||||
|
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
|
|||||||
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
@@ -14,7 +14,7 @@ import java.util.Optional;
|
|||||||
public class PubDateHeuristicRDFaTag implements PubDateHeuristic {
|
public class PubDateHeuristicRDFaTag implements PubDateHeuristic {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
|
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
|
||||||
for (var tag : document.select("meta[property=\"datePublished\"]")) {
|
for (var tag : document.select("meta[property=\"datePublished\"]")) {
|
||||||
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
|
var maybeDate = PubDateParser.attemptParseDate(tag.attr("content"));
|
||||||
if (maybeDate.isPresent()) {
|
if (maybeDate.isPresent()) {
|
||||||
|
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
|
|||||||
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
@@ -21,7 +21,7 @@ public class PubDateHeuristicUrlPatternPass1 implements PubDateHeuristic {
|
|||||||
private static final int MIN_URL_PATTERN_YEAR = 2000;
|
private static final int MIN_URL_PATTERN_YEAR = 2000;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
|
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url, Document document, DocumentFormat htmlStandard) {
|
||||||
final String urlString = url.path;
|
final String urlString = url.path;
|
||||||
|
|
||||||
var matcher = yearUrlPattern.matcher(urlString);
|
var matcher = yearUrlPattern.matcher(urlString);
|
||||||
|
@@ -4,9 +4,9 @@ import nu.marginalia.converting.model.DocumentHeaders;
|
|||||||
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
||||||
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
import nu.marginalia.converting.processor.pubdate.PubDateParser;
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
@@ -19,7 +19,7 @@ public class PubDateHeuristicUrlPatternPass2 implements PubDateHeuristic {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url,
|
public Optional<PubDate> apply(PubDateEffortLevel effortLevel, DocumentHeaders headers, EdgeUrl url,
|
||||||
Document document, HtmlStandard htmlStandard) {
|
Document document, DocumentFormat htmlStandard) {
|
||||||
final String urlString = url.path;
|
final String urlString = url.path;
|
||||||
|
|
||||||
var matcher = yearUrlPattern.matcher(urlString);
|
var matcher = yearUrlPattern.matcher(urlString);
|
||||||
|
@@ -8,12 +8,12 @@ import nu.marginalia.converting.model.ProcessedDocument;
|
|||||||
import nu.marginalia.converting.processor.DocumentClass;
|
import nu.marginalia.converting.processor.DocumentClass;
|
||||||
import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
|
import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
|
||||||
import nu.marginalia.keyword.LinkTexts;
|
import nu.marginalia.keyword.LinkTexts;
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.HtmlFeature;
|
import nu.marginalia.model.crawl.HtmlFeature;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.model.crawl.UrlIndexingState;
|
import nu.marginalia.model.crawl.UrlIndexingState;
|
||||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
|
||||||
import nu.marginalia.model.idx.DocumentFlags;
|
import nu.marginalia.model.idx.DocumentFlags;
|
||||||
import nu.marginalia.model.idx.DocumentMetadata;
|
import nu.marginalia.model.idx.DocumentMetadata;
|
||||||
import nu.marginalia.model.idx.WordFlags;
|
import nu.marginalia.model.idx.WordFlags;
|
||||||
@@ -53,6 +53,7 @@ public class SideloaderProcessing {
|
|||||||
"",
|
"",
|
||||||
body.getBytes(StandardCharsets.UTF_8),
|
body.getBytes(StandardCharsets.UTF_8),
|
||||||
false,
|
false,
|
||||||
|
-1,
|
||||||
null,
|
null,
|
||||||
null
|
null
|
||||||
);
|
);
|
||||||
@@ -83,7 +84,7 @@ public class SideloaderProcessing {
|
|||||||
// that we can't get from the sideloaded data since it's
|
// that we can't get from the sideloaded data since it's
|
||||||
// so stripped down
|
// so stripped down
|
||||||
|
|
||||||
ret.details.standard = HtmlStandard.HTML5;
|
ret.details.format = DocumentFormat.HTML5;
|
||||||
ret.details.pubYear = pubYear;
|
ret.details.pubYear = pubYear;
|
||||||
ret.details.features.add(HtmlFeature.JS);
|
ret.details.features.add(HtmlFeature.JS);
|
||||||
ret.details.features.add(HtmlFeature.TRACKING);
|
ret.details.features.add(HtmlFeature.TRACKING);
|
||||||
|
@@ -9,13 +9,13 @@ import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb;
|
|||||||
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
||||||
import nu.marginalia.keyword.LinkTexts;
|
import nu.marginalia.keyword.LinkTexts;
|
||||||
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||||
import nu.marginalia.model.crawl.HtmlFeature;
|
import nu.marginalia.model.crawl.HtmlFeature;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.model.crawl.UrlIndexingState;
|
import nu.marginalia.model.crawl.UrlIndexingState;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
|
||||||
import nu.marginalia.model.idx.DocumentFlags;
|
import nu.marginalia.model.idx.DocumentFlags;
|
||||||
import nu.marginalia.model.idx.DocumentMetadata;
|
import nu.marginalia.model.idx.DocumentMetadata;
|
||||||
import nu.marginalia.model.idx.WordFlags;
|
import nu.marginalia.model.idx.WordFlags;
|
||||||
@@ -165,7 +165,7 @@ public class StackexchangeSideloader implements SideloadSource {
|
|||||||
ret.details.description = StringUtils.truncate(doc.body().text(), 255);
|
ret.details.description = StringUtils.truncate(doc.body().text(), 255);
|
||||||
ret.details.length = 128;
|
ret.details.length = 128;
|
||||||
|
|
||||||
ret.details.standard = HtmlStandard.HTML5;
|
ret.details.format = DocumentFormat.HTML5;
|
||||||
ret.details.linksExternal = List.of();
|
ret.details.linksExternal = List.of();
|
||||||
ret.details.linksInternal = List.of();
|
ret.details.linksInternal = List.of();
|
||||||
ret.state = UrlIndexingState.OK;
|
ret.state = UrlIndexingState.OK;
|
||||||
|
@@ -124,7 +124,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
|
|||||||
document.details.title,
|
document.details.title,
|
||||||
document.details.description,
|
document.details.description,
|
||||||
HtmlFeature.encode(document.details.features),
|
HtmlFeature.encode(document.details.features),
|
||||||
document.details.standard.name(),
|
document.details.format.name(),
|
||||||
document.details.length,
|
document.details.length,
|
||||||
document.details.hashCode,
|
document.details.hashCode,
|
||||||
(float) document.details.quality,
|
(float) document.details.quality,
|
||||||
|
File diff suppressed because it is too large
Load Diff
@@ -6,6 +6,7 @@ import com.google.inject.Injector;
|
|||||||
import nu.marginalia.converting.model.ProcessedDocument;
|
import nu.marginalia.converting.model.ProcessedDocument;
|
||||||
import nu.marginalia.converting.processor.DomainProcessor;
|
import nu.marginalia.converting.processor.DomainProcessor;
|
||||||
import nu.marginalia.io.SerializableCrawlDataStream;
|
import nu.marginalia.io.SerializableCrawlDataStream;
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
@@ -13,7 +14,6 @@ import nu.marginalia.model.crawl.UrlIndexingState;
|
|||||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||||
import nu.marginalia.model.crawldata.CrawledDomain;
|
import nu.marginalia.model.crawldata.CrawledDomain;
|
||||||
import nu.marginalia.model.crawldata.SerializableCrawlData;
|
import nu.marginalia.model.crawldata.SerializableCrawlData;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
import org.junit.jupiter.api.Tag;
|
import org.junit.jupiter.api.Tag;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
@@ -91,7 +91,7 @@ public class ConvertingIntegrationTest {
|
|||||||
|
|
||||||
assertTrue(details.title.length() > 4);
|
assertTrue(details.title.length() > 4);
|
||||||
assertTrue(details.description.length() > 4);
|
assertTrue(details.description.length() > 4);
|
||||||
assertEquals(HtmlStandard.HTML5, details.standard);
|
assertEquals(DocumentFormat.HTML5, details.format);
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -125,7 +125,7 @@ public class ConvertingIntegrationTest {
|
|||||||
assertTrue(details.metadata.size() > 0);
|
assertTrue(details.metadata.size() > 0);
|
||||||
assertTrue(details.title.length() > 4);
|
assertTrue(details.title.length() > 4);
|
||||||
assertTrue(details.description.length() > 4);
|
assertTrue(details.description.length() > 4);
|
||||||
assertEquals(HtmlStandard.HTML5, details.standard);
|
assertEquals(DocumentFormat.HTML5, details.format);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -148,6 +148,7 @@ public class ConvertingIntegrationTest {
|
|||||||
"",
|
"",
|
||||||
readClassPathFile(p.toString()).getBytes(),
|
readClassPathFile(p.toString()).getBytes(),
|
||||||
false,
|
false,
|
||||||
|
-1,
|
||||||
null,
|
null,
|
||||||
null
|
null
|
||||||
);
|
);
|
||||||
|
@@ -8,7 +8,6 @@ import nu.marginalia.converting.model.ProcessedDomain;
|
|||||||
import nu.marginalia.converting.processor.DomainProcessor;
|
import nu.marginalia.converting.processor.DomainProcessor;
|
||||||
import nu.marginalia.crawl.CrawlerMain;
|
import nu.marginalia.crawl.CrawlerMain;
|
||||||
import nu.marginalia.crawl.DomainStateDb;
|
import nu.marginalia.crawl.DomainStateDb;
|
||||||
import nu.marginalia.crawl.fetcher.Cookies;
|
|
||||||
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
||||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||||
@@ -247,7 +246,7 @@ public class CrawlingThenConvertingIntegrationTest {
|
|||||||
private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws Exception {
|
private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws Exception {
|
||||||
List<SerializableCrawlData> data = new ArrayList<>();
|
List<SerializableCrawlData> data = new ArrayList<>();
|
||||||
|
|
||||||
try (var recorder = new WarcRecorder(fileName, new Cookies());
|
try (var recorder = new WarcRecorder(fileName);
|
||||||
var db = new DomainStateDb(dbTempFile))
|
var db = new DomainStateDb(dbTempFile))
|
||||||
{
|
{
|
||||||
new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, db, recorder).crawlDomain();
|
new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, db, recorder).crawlDomain();
|
||||||
|
@@ -0,0 +1,95 @@
|
|||||||
|
package nu.marginalia.converting.processor.plugin;
|
||||||
|
|
||||||
|
import nu.marginalia.WmsaHome;
|
||||||
|
import nu.marginalia.converting.processor.DocumentClass;
|
||||||
|
import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
|
||||||
|
import nu.marginalia.converting.processor.logic.TitleExtractor;
|
||||||
|
import nu.marginalia.converting.processor.plugin.specialization.DefaultSpecialization;
|
||||||
|
import nu.marginalia.converting.processor.summary.SummaryExtractor;
|
||||||
|
import nu.marginalia.converting.processor.summary.heuristic.*;
|
||||||
|
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
||||||
|
import nu.marginalia.keyword.LinkTexts;
|
||||||
|
import nu.marginalia.language.filter.LanguageFilter;
|
||||||
|
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
||||||
|
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||||
|
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||||
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
|
import org.junit.jupiter.api.Disabled;
|
||||||
|
import org.junit.jupiter.api.Tag;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.net.HttpURLConnection;
|
||||||
|
import java.net.URI;
|
||||||
|
import java.net.URISyntaxException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.time.Instant;
|
||||||
|
|
||||||
|
@Tag("flaky")
|
||||||
|
class PdfDocumentProcessorPluginTest {
|
||||||
|
static PdfDocumentProcessorPlugin plugin;
|
||||||
|
|
||||||
|
@BeforeAll
|
||||||
|
static void setUpBeforeClass() throws Exception {
|
||||||
|
var lm = WmsaHome.getLanguageModels();
|
||||||
|
plugin = new PdfDocumentProcessorPlugin(255,
|
||||||
|
new LanguageFilter(lm),
|
||||||
|
new ThreadLocalSentenceExtractorProvider(lm),
|
||||||
|
new DocumentKeywordExtractor(new TermFrequencyDict(lm)),
|
||||||
|
new DocumentLengthLogic(100),
|
||||||
|
new DefaultSpecialization(new SummaryExtractor(
|
||||||
|
255,
|
||||||
|
new DomFilterHeuristic(255),
|
||||||
|
new TagDensityHeuristic(255),
|
||||||
|
new OpenGraphDescriptionHeuristic(),
|
||||||
|
new MetaDescriptionHeuristic(),
|
||||||
|
new FallbackHeuristic()
|
||||||
|
),
|
||||||
|
new TitleExtractor(255)
|
||||||
|
));
|
||||||
|
}
|
||||||
|
public AbstractDocumentProcessorPlugin.DetailsWithWords testPdfFile(byte[] pdfBytes) throws Exception {
|
||||||
|
var doc = new CrawledDocument("test", "https://www.example.com/sample.pdf", "application/pdf", Instant.now().toString(), 200, "OK", "OK", "", pdfBytes, false, -1, null, null);
|
||||||
|
return plugin.createDetails(doc, new LinkTexts(), DocumentClass.NORMAL);
|
||||||
|
}
|
||||||
|
|
||||||
|
public AbstractDocumentProcessorPlugin.DetailsWithWords testPdfFile(Path file) throws Exception {
|
||||||
|
return testPdfFile(Files.readAllBytes(file));
|
||||||
|
}
|
||||||
|
|
||||||
|
private byte[] downloadPDF(String url) throws IOException, URISyntaxException {
|
||||||
|
HttpURLConnection conn = (HttpURLConnection) new URI(url).toURL().openConnection();
|
||||||
|
try {
|
||||||
|
return conn.getInputStream().readAllBytes();
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
} finally {
|
||||||
|
conn.disconnect();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Disabled
|
||||||
|
@Test
|
||||||
|
void testingTool() throws Exception {
|
||||||
|
System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample.pdf")).details().title);
|
||||||
|
System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample2.pdf")).details().title);
|
||||||
|
System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample3.pdf")).details().title);
|
||||||
|
System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample4.pdf")).details().title);
|
||||||
|
System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample5.pdf")).details().title);
|
||||||
|
System.out.println(testPdfFile(Path.of("/home/st_work/Work/sample6.pdf")).details().title);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Disabled
|
||||||
|
@Test
|
||||||
|
void testingTool2() throws Exception {
|
||||||
|
System.out.println(plugin.convertPdfToHtml(Files.readAllBytes(Path.of("/home/st_work/Work/sample6.pdf"))));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testMarginaliaSample() throws Exception {
|
||||||
|
var doc = plugin.convertPdfToHtml(downloadPDF("https://www.marginalia.nu/junk/test.pdf"));
|
||||||
|
System.out.println(doc.html());
|
||||||
|
}
|
||||||
|
}
|
@@ -3,8 +3,8 @@ package nu.marginalia.converting.processor.pubdate;
|
|||||||
import nu.marginalia.WmsaHome;
|
import nu.marginalia.WmsaHome;
|
||||||
import nu.marginalia.converting.model.DocumentHeaders;
|
import nu.marginalia.converting.model.DocumentHeaders;
|
||||||
import nu.marginalia.converting.processor.pubdate.heuristic.PubDateHeuristicDOMParsingPass2;
|
import nu.marginalia.converting.processor.pubdate.heuristic.PubDateHeuristicDOMParsingPass2;
|
||||||
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.html.HtmlStandard;
|
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
@@ -74,7 +74,7 @@ class PubDateSnifferTest {
|
|||||||
<time pubdate="pubdate" datetime="2022-08-24">time</time>
|
<time pubdate="pubdate" datetime="2022-08-24">time</time>
|
||||||
Wow, sure lor 'em boss
|
Wow, sure lor 'em boss
|
||||||
</article>
|
</article>
|
||||||
"""), HtmlStandard.UNKNOWN, true);
|
"""), DocumentFormat.UNKNOWN, true);
|
||||||
|
|
||||||
assertFalse(ret.isEmpty());
|
assertFalse(ret.isEmpty());
|
||||||
assertEquals("2022-08-24", ret.dateIso8601());
|
assertEquals("2022-08-24", ret.dateIso8601());
|
||||||
@@ -90,7 +90,7 @@ class PubDateSnifferTest {
|
|||||||
<time>2022-08-24</time>
|
<time>2022-08-24</time>
|
||||||
Wow, sure lor 'em boss
|
Wow, sure lor 'em boss
|
||||||
</article>
|
</article>
|
||||||
"""), HtmlStandard.UNKNOWN, true);
|
"""), DocumentFormat.UNKNOWN, true);
|
||||||
|
|
||||||
assertFalse(ret.isEmpty());
|
assertFalse(ret.isEmpty());
|
||||||
assertEquals("2022-08-24", ret.dateIso8601());
|
assertEquals("2022-08-24", ret.dateIso8601());
|
||||||
@@ -106,7 +106,7 @@ class PubDateSnifferTest {
|
|||||||
<time class="published" datetime="July 13, 2006">July 13, 2006</time>
|
<time class="published" datetime="July 13, 2006">July 13, 2006</time>
|
||||||
Wow, sure lor 'em boss
|
Wow, sure lor 'em boss
|
||||||
</article>
|
</article>
|
||||||
"""), HtmlStandard.UNKNOWN, true);
|
"""), DocumentFormat.UNKNOWN, true);
|
||||||
|
|
||||||
assertFalse(ret.isEmpty());
|
assertFalse(ret.isEmpty());
|
||||||
assertEquals(2006, ret.year());
|
assertEquals(2006, ret.year());
|
||||||
@@ -116,14 +116,14 @@ class PubDateSnifferTest {
|
|||||||
public void testProblemCases() throws IOException, URISyntaxException {
|
public void testProblemCases() throws IOException, URISyntaxException {
|
||||||
var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
|
var ret = dateSniffer.getPubDate(new DocumentHeaders(""),
|
||||||
new EdgeUrl("https://www.example.com/"),
|
new EdgeUrl("https://www.example.com/"),
|
||||||
Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/The Switch to Linux Begins .html"))), HtmlStandard.HTML5, true);
|
Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/The Switch to Linux Begins .html"))), DocumentFormat.HTML5, true);
|
||||||
|
|
||||||
assertFalse(ret.isEmpty());
|
assertFalse(ret.isEmpty());
|
||||||
assertEquals(2006, ret.year());
|
assertEquals(2006, ret.year());
|
||||||
|
|
||||||
ret = dateSniffer.getPubDate(new DocumentHeaders(""),
|
ret = dateSniffer.getPubDate(new DocumentHeaders(""),
|
||||||
new EdgeUrl("https://www.example.com/"),
|
new EdgeUrl("https://www.example.com/"),
|
||||||
Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), HtmlStandard.XHTML, true);
|
Jsoup.parse(Files.readString(WmsaHome.getHomePath().resolve("test-data/Black Hat USA 2010 Understanding and Deploying DNSSEC by Paul Wouters and Patrick Nauber.html"))), DocumentFormat.XHTML, true);
|
||||||
|
|
||||||
assertFalse(ret.isEmpty());
|
assertFalse(ret.isEmpty());
|
||||||
assertEquals(2010, ret.year());
|
assertEquals(2010, ret.year());
|
||||||
@@ -146,7 +146,7 @@ class PubDateSnifferTest {
|
|||||||
<!doctype html>
|
<!doctype html>
|
||||||
<html>
|
<html>
|
||||||
<meta itemprop="datePublished" content="2022-08-24" />
|
<meta itemprop="datePublished" content="2022-08-24" />
|
||||||
"""), HtmlStandard.UNKNOWN, true);
|
"""), DocumentFormat.UNKNOWN, true);
|
||||||
|
|
||||||
assertFalse(ret.isEmpty());
|
assertFalse(ret.isEmpty());
|
||||||
assertEquals("2022-08-24", ret.dateIso8601());
|
assertEquals("2022-08-24", ret.dateIso8601());
|
||||||
@@ -160,7 +160,7 @@ class PubDateSnifferTest {
|
|||||||
<!doctype html>
|
<!doctype html>
|
||||||
<html>
|
<html>
|
||||||
<meta property="datePublished" content="2022-08-24" />
|
<meta property="datePublished" content="2022-08-24" />
|
||||||
"""), HtmlStandard.UNKNOWN, true);
|
"""), DocumentFormat.UNKNOWN, true);
|
||||||
|
|
||||||
assertFalse(ret.isEmpty());
|
assertFalse(ret.isEmpty());
|
||||||
assertEquals("2022-08-24", ret.dateIso8601());
|
assertEquals("2022-08-24", ret.dateIso8601());
|
||||||
@@ -174,7 +174,7 @@ class PubDateSnifferTest {
|
|||||||
<!doctype html>
|
<!doctype html>
|
||||||
<html>
|
<html>
|
||||||
<script type="application/ld+json">{"@context":"https:\\/\\/schema.org","@type":"Article","name":"In the Year 2525","url":"https:\\/\\/en.wikipedia.org\\/wiki\\/In_the_Year_2525","sameAs":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","mainEntity":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2004-08-24T14:39:14Z","dateModified":"2022-10-20T11:54:37Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/4\\/4a\\/In_the_Year_2525_by_Zager_and_Evans_US_vinyl_Side-A_RCA_release.png","headline":"song written and compsoed by Rick Evans, originally recorded by Zager and Evans and released in 1969"}</script><script type="application/ld+json">{"@context":"https:\\/\\/schema.org","@type":"Article","name":"In the Year 2525","url":"https:\\/\\/en.wikipedia.org\\/wiki\\/In_the_Year_2525","sameAs":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","mainEntity":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2004-08-24T14:39:14Z","dateModified":"2022-10-20T11:54:37Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/4\\/4a\\/In_the_Year_2525_by_Zager_and_Evans_US_vinyl_Side-A_RCA_release.png","headline":"song written and compsoed by Rick Evans, originally recorded by Zager and Evans and released in 1969"}</script>
|
<script type="application/ld+json">{"@context":"https:\\/\\/schema.org","@type":"Article","name":"In the Year 2525","url":"https:\\/\\/en.wikipedia.org\\/wiki\\/In_the_Year_2525","sameAs":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","mainEntity":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2004-08-24T14:39:14Z","dateModified":"2022-10-20T11:54:37Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/4\\/4a\\/In_the_Year_2525_by_Zager_and_Evans_US_vinyl_Side-A_RCA_release.png","headline":"song written and compsoed by Rick Evans, originally recorded by Zager and Evans and released in 1969"}</script><script type="application/ld+json">{"@context":"https:\\/\\/schema.org","@type":"Article","name":"In the Year 2525","url":"https:\\/\\/en.wikipedia.org\\/wiki\\/In_the_Year_2525","sameAs":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","mainEntity":"http:\\/\\/www.wikidata.org\\/entity\\/Q145269","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2004-08-24T14:39:14Z","dateModified":"2022-10-20T11:54:37Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/4\\/4a\\/In_the_Year_2525_by_Zager_and_Evans_US_vinyl_Side-A_RCA_release.png","headline":"song written and compsoed by Rick Evans, originally recorded by Zager and Evans and released in 1969"}</script>
|
||||||
"""), HtmlStandard.UNKNOWN, true);
|
"""), DocumentFormat.UNKNOWN, true);
|
||||||
|
|
||||||
assertFalse(ret.isEmpty());
|
assertFalse(ret.isEmpty());
|
||||||
assertEquals("2004-08-24", ret.dateIso8601());
|
assertEquals("2004-08-24", ret.dateIso8601());
|
||||||
@@ -188,7 +188,7 @@ class PubDateSnifferTest {
|
|||||||
<!doctype html>
|
<!doctype html>
|
||||||
<html>
|
<html>
|
||||||
<script type="application/ld+json" class="aioseop-schema">{"@context":"https://schema.org","@graph":[{"@type":"Organization","@id":"https://socialnomics.net/#organization","url":"https://socialnomics.net/","name":"Socialnomics","sameAs":[]},{"@type":"WebSite","@id":"https://socialnomics.net/#website","url":"https://socialnomics.net/","name":"Socialnomics","publisher":{"@id":"https://socialnomics.net/#organization"}},{"@type":"WebPage","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage","url":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","inLanguage":"en-US","name":"3 Reasons Why You Should Adopt Java-based Technology For Your Business","isPartOf":{"@id":"https://socialnomics.net/#website"},"breadcrumb":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#breadcrumblist"},"datePublished":"2016-12-27T21:01:36-06:00","dateModified":"2016-12-22T21:02:32-06:00"},{"@type":"Article","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#article","isPartOf":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage"},"author":{"@id":"https://socialnomics.net/author/rahis-saifi/#author"},"headline":"3 Reasons Why You Should Adopt Java-based Technology For Your Business","datePublished":"2016-12-27T21:01:36-06:00","dateModified":"2016-12-22T21:02:32-06:00","commentCount":0,"mainEntityOfPage":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage"},"publisher":{"@id":"https://socialnomics.net/#organization"},"articleSection":"Business, business, java, Java Developers, programming languages"},{"@type":"Person","@id":"https://socialnomics.net/author/rahis-saifi/#author","name":"Rahis 
Saifi","sameAs":["https://www.facebook.com/RahisSaifiOfficial","https://www.twitter.com/57rahis"],"image":{"@type":"ImageObject","@id":"https://socialnomics.net/#personlogo","url":"https://secure.gravatar.com/avatar/e67f630f0b8bc87e59e111d5e955961d?s=96&d=mm&r=g","width":96,"height":96,"caption":"Rahis Saifi"}},{"@type":"BreadcrumbList","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#breadcrumblist","itemListElement":[{"@type":"ListItem","position":1,"item":{"@type":"WebPage","@id":"https://socialnomics.net/","url":"https://socialnomics.net/","name":"Socialnomics Blog"}},{"@type":"ListItem","position":2,"item":{"@type":"WebPage","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","url":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","name":"3 Reasons Why You Should Adopt Java-based Technology For Your Business"}}]}]}</script>
|
<script type="application/ld+json" class="aioseop-schema">{"@context":"https://schema.org","@graph":[{"@type":"Organization","@id":"https://socialnomics.net/#organization","url":"https://socialnomics.net/","name":"Socialnomics","sameAs":[]},{"@type":"WebSite","@id":"https://socialnomics.net/#website","url":"https://socialnomics.net/","name":"Socialnomics","publisher":{"@id":"https://socialnomics.net/#organization"}},{"@type":"WebPage","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage","url":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","inLanguage":"en-US","name":"3 Reasons Why You Should Adopt Java-based Technology For Your Business","isPartOf":{"@id":"https://socialnomics.net/#website"},"breadcrumb":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#breadcrumblist"},"datePublished":"2016-12-27T21:01:36-06:00","dateModified":"2016-12-22T21:02:32-06:00"},{"@type":"Article","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#article","isPartOf":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage"},"author":{"@id":"https://socialnomics.net/author/rahis-saifi/#author"},"headline":"3 Reasons Why You Should Adopt Java-based Technology For Your Business","datePublished":"2016-12-27T21:01:36-06:00","dateModified":"2016-12-22T21:02:32-06:00","commentCount":0,"mainEntityOfPage":{"@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#webpage"},"publisher":{"@id":"https://socialnomics.net/#organization"},"articleSection":"Business, business, java, Java Developers, programming languages"},{"@type":"Person","@id":"https://socialnomics.net/author/rahis-saifi/#author","name":"Rahis 
Saifi","sameAs":["https://www.facebook.com/RahisSaifiOfficial","https://www.twitter.com/57rahis"],"image":{"@type":"ImageObject","@id":"https://socialnomics.net/#personlogo","url":"https://secure.gravatar.com/avatar/e67f630f0b8bc87e59e111d5e955961d?s=96&d=mm&r=g","width":96,"height":96,"caption":"Rahis Saifi"}},{"@type":"BreadcrumbList","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/#breadcrumblist","itemListElement":[{"@type":"ListItem","position":1,"item":{"@type":"WebPage","@id":"https://socialnomics.net/","url":"https://socialnomics.net/","name":"Socialnomics Blog"}},{"@type":"ListItem","position":2,"item":{"@type":"WebPage","@id":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","url":"https://socialnomics.net/2016/12/27/3-reasons-why-you-should-adopt-java-based-technology-for-your-business/","name":"3 Reasons Why You Should Adopt Java-based Technology For Your Business"}}]}]}</script>
|
||||||
"""), HtmlStandard.UNKNOWN, true);
|
"""), DocumentFormat.UNKNOWN, true);
|
||||||
|
|
||||||
assertFalse(ret.isEmpty());
|
assertFalse(ret.isEmpty());
|
||||||
assertEquals("2016-12-27", ret.dateIso8601());
|
assertEquals("2016-12-27", ret.dateIso8601());
|
||||||
@@ -202,7 +202,7 @@ class PubDateSnifferTest {
|
|||||||
<!doctype html>
|
<!doctype html>
|
||||||
<html>
|
<html>
|
||||||
<title>No date in the HTML</title>
|
<title>No date in the HTML</title>
|
||||||
"""), HtmlStandard.UNKNOWN, true);
|
"""), DocumentFormat.UNKNOWN, true);
|
||||||
|
|
||||||
assertFalse(ret.isEmpty());
|
assertFalse(ret.isEmpty());
|
||||||
assertNull(ret.dateIso8601());
|
assertNull(ret.dateIso8601());
|
||||||
@@ -217,7 +217,7 @@ class PubDateSnifferTest {
|
|||||||
<!doctype html>
|
<!doctype html>
|
||||||
<html>
|
<html>
|
||||||
<title>No date in the HTML</title>
|
<title>No date in the HTML</title>
|
||||||
"""), HtmlStandard.UNKNOWN, true);
|
"""), DocumentFormat.UNKNOWN, true);
|
||||||
|
|
||||||
assertFalse(ret.isEmpty());
|
assertFalse(ret.isEmpty());
|
||||||
assertEquals("2022-02-03", ret.dateIso8601());
|
assertEquals("2022-02-03", ret.dateIso8601());
|
||||||
@@ -232,7 +232,7 @@ class PubDateSnifferTest {
|
|||||||
<!doctype html>
|
<!doctype html>
|
||||||
<html>
|
<html>
|
||||||
<p>Published 2003, updated 2022</p>
|
<p>Published 2003, updated 2022</p>
|
||||||
"""), HtmlStandard.HTML5, true);
|
"""), DocumentFormat.HTML5, true);
|
||||||
|
|
||||||
assertFalse(ret.isEmpty());
|
assertFalse(ret.isEmpty());
|
||||||
assertNull(ret.dateIso8601());
|
assertNull(ret.dateIso8601());
|
||||||
@@ -258,7 +258,7 @@ class PubDateSnifferTest {
|
|||||||
<!doctype html>
|
<!doctype html>
|
||||||
<html>
|
<html>
|
||||||
<div style="float: left;"> <b>Post subject:</b> Keyboards.</div><div style="float: right;"><span class="postdetails"><b><img src="./styles/subsilver2/imageset/icon_post_target.gif" width="12" height="9" alt="Post" title="Post" /> <a href="./viewtopic.php?p=34580&sid=cf0c13dedebb4fea1f03fa73e510cd9f#p34580">#1</a></b></span> <b>Posted:</b> Sun Oct 03, 2010 5:37 pm </div>
|
<div style="float: left;"> <b>Post subject:</b> Keyboards.</div><div style="float: right;"><span class="postdetails"><b><img src="./styles/subsilver2/imageset/icon_post_target.gif" width="12" height="9" alt="Post" title="Post" /> <a href="./viewtopic.php?p=34580&sid=cf0c13dedebb4fea1f03fa73e510cd9f#p34580">#1</a></b></span> <b>Posted:</b> Sun Oct 03, 2010 5:37 pm </div>
|
||||||
"""), HtmlStandard.UNKNOWN, true);
|
"""), DocumentFormat.UNKNOWN, true);
|
||||||
|
|
||||||
assertFalse(ret.isEmpty());
|
assertFalse(ret.isEmpty());
|
||||||
assertNull(ret.dateIso8601());
|
assertNull(ret.dateIso8601());
|
||||||
|
@@ -60,10 +60,12 @@ dependencies {
|
|||||||
implementation libs.fastutil
|
implementation libs.fastutil
|
||||||
|
|
||||||
implementation libs.bundles.mariadb
|
implementation libs.bundles.mariadb
|
||||||
|
implementation libs.bundles.httpcomponents
|
||||||
|
|
||||||
testImplementation libs.bundles.slf4j.test
|
testImplementation libs.bundles.slf4j.test
|
||||||
testImplementation libs.bundles.junit
|
testImplementation libs.bundles.junit
|
||||||
testImplementation libs.mockito
|
testImplementation libs.mockito
|
||||||
|
testImplementation libs.wiremock
|
||||||
|
|
||||||
testImplementation project(':code:processes:test-data')
|
testImplementation project(':code:processes:test-data')
|
||||||
}
|
}
|
||||||
|
@@ -43,6 +43,7 @@ import java.nio.file.StandardCopyOption;
|
|||||||
import java.security.Security;
|
import java.security.Security;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
import java.util.concurrent.LinkedBlockingQueue;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
@@ -66,6 +67,8 @@ public class CrawlerMain extends ProcessMainClass {
|
|||||||
|
|
||||||
private final Map<String, CrawlTask> pendingCrawlTasks = new ConcurrentHashMap<>();
|
private final Map<String, CrawlTask> pendingCrawlTasks = new ConcurrentHashMap<>();
|
||||||
|
|
||||||
|
private final LinkedBlockingQueue<CrawlTask> retryQueue = new LinkedBlockingQueue<>();
|
||||||
|
|
||||||
private final AtomicInteger tasksDone = new AtomicInteger(0);
|
private final AtomicInteger tasksDone = new AtomicInteger(0);
|
||||||
private final HttpFetcherImpl fetcher;
|
private final HttpFetcherImpl fetcher;
|
||||||
|
|
||||||
@@ -103,9 +106,18 @@ public class CrawlerMain extends ProcessMainClass {
|
|||||||
this.blacklist = blacklist;
|
this.blacklist = blacklist;
|
||||||
this.node = processConfiguration.node();
|
this.node = processConfiguration.node();
|
||||||
|
|
||||||
|
SimpleBlockingThreadPool.ThreadType threadType;
|
||||||
|
if (Boolean.getBoolean("crawler.useVirtualThreads")) {
|
||||||
|
threadType = SimpleBlockingThreadPool.ThreadType.VIRTUAL;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
threadType = SimpleBlockingThreadPool.ThreadType.PLATFORM;
|
||||||
|
}
|
||||||
|
|
||||||
pool = new SimpleBlockingThreadPool("CrawlerPool",
|
pool = new SimpleBlockingThreadPool("CrawlerPool",
|
||||||
Integer.getInteger("crawler.poolSize", 256),
|
Integer.getInteger("crawler.poolSize", 256),
|
||||||
1);
|
1,
|
||||||
|
threadType);
|
||||||
|
|
||||||
|
|
||||||
// Wait for the blacklist to be loaded before starting the crawl
|
// Wait for the blacklist to be loaded before starting the crawl
|
||||||
@@ -221,10 +233,7 @@ public class CrawlerMain extends ProcessMainClass {
|
|||||||
|
|
||||||
logger.info("Loaded {} domains", crawlSpecRecords.size());
|
logger.info("Loaded {} domains", crawlSpecRecords.size());
|
||||||
|
|
||||||
// Shuffle the domains to ensure we get a good mix of domains in each crawl,
|
crawlSpecRecords.sort(crawlSpecArrangement(crawlSpecRecords));
|
||||||
// so that e.g. the big domains don't get all crawled at once, or we end up
|
|
||||||
// crawling the same server in parallel from different subdomains...
|
|
||||||
Collections.shuffle(crawlSpecRecords);
|
|
||||||
|
|
||||||
// First a validation run to ensure the file is all good to parse
|
// First a validation run to ensure the file is all good to parse
|
||||||
if (crawlSpecRecords.isEmpty()) {
|
if (crawlSpecRecords.isEmpty()) {
|
||||||
@@ -248,44 +257,51 @@ public class CrawlerMain extends ProcessMainClass {
|
|||||||
// List of deferred tasks used to ensure beneficial scheduling of domains with regard to DomainLocks,
|
// List of deferred tasks used to ensure beneficial scheduling of domains with regard to DomainLocks,
|
||||||
// merely shuffling the domains tends to lead to a lot of threads being blocked waiting for a semaphore,
|
// merely shuffling the domains tends to lead to a lot of threads being blocked waiting for a semaphore,
|
||||||
// this will more aggressively attempt to schedule the jobs to avoid blocking
|
// this will more aggressively attempt to schedule the jobs to avoid blocking
|
||||||
List<CrawlTask> deferredTasks = new LinkedList<>();
|
List<CrawlTask> taskList = new ArrayList<>();
|
||||||
|
|
||||||
// Create crawl tasks and submit them to the pool for execution
|
// Create crawl tasks
|
||||||
for (CrawlSpecRecord crawlSpec : crawlSpecRecords) {
|
for (CrawlSpecRecord crawlSpec : crawlSpecRecords) {
|
||||||
if (workLog.isJobFinished(crawlSpec.domain()))
|
if (workLog.isJobFinished(crawlSpec.domain))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
// Add to the end of the deferral list
|
var task = new CrawlTask(crawlSpec, anchorTagsSource, outputDir, warcArchiver, domainStateDb, workLog);
|
||||||
deferredTasks.addLast(new CrawlTask(
|
|
||||||
crawlSpec,
|
|
||||||
anchorTagsSource,
|
|
||||||
outputDir,
|
|
||||||
warcArchiver,
|
|
||||||
domainStateDb,
|
|
||||||
workLog));
|
|
||||||
|
|
||||||
// Start every task we currently can from the deferral list
|
// Try to run immediately, to avoid unnecessarily keeping the entire work set in RAM
|
||||||
deferredTasks.removeIf(task -> {
|
if (!trySubmitDeferredTask(task)) {
|
||||||
if (task.canRun()) {
|
|
||||||
if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) != null) {
|
|
||||||
return true; // task has already run, duplicate in crawl specs
|
|
||||||
}
|
|
||||||
|
|
||||||
// This blocks the caller when the pool is full
|
// Drain the retry queue to the taskList, and try to submit any tasks that are in the retry queue
|
||||||
pool.submitQuietly(task);
|
retryQueue.drainTo(taskList);
|
||||||
return true;
|
taskList.removeIf(this::trySubmitDeferredTask);
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
// Then add this new task to the retry queue
|
||||||
});
|
taskList.add(task);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Schedule any lingering tasks for immediate execution
|
// Schedule viable tasks for execution until list is empty
|
||||||
for (var task : deferredTasks) {
|
for (int emptyRuns = 0;emptyRuns < 300;) {
|
||||||
if (pendingCrawlTasks.putIfAbsent(task.domain, task) != null)
|
boolean hasTasks = !taskList.isEmpty();
|
||||||
continue;
|
|
||||||
|
|
||||||
pool.submitQuietly(task);
|
// The order of these checks is very important to avoid a race condition
|
||||||
|
// where we miss a task that is put into the retry queue
|
||||||
|
boolean hasRunningTasks = pool.getActiveCount() > 0;
|
||||||
|
boolean hasRetryTasks = !retryQueue.isEmpty();
|
||||||
|
|
||||||
|
if (hasTasks || hasRetryTasks || hasRunningTasks) {
|
||||||
|
retryQueue.drainTo(taskList);
|
||||||
|
|
||||||
|
// Try to submit any tasks that are in the retry queue (this will block if the pool is full)
|
||||||
|
taskList.removeIf(this::trySubmitDeferredTask);
|
||||||
|
|
||||||
|
// Add a small pause here to avoid busy looping toward the end of the execution cycle when
|
||||||
|
// we might have no new viable tasks to run for hours on end
|
||||||
|
TimeUnit.MILLISECONDS.sleep(5);
|
||||||
|
} else {
|
||||||
|
// We have no tasks to run, and no tasks in the retry queue
|
||||||
|
// but we wait a bit to see if any new tasks come in via the retry queue
|
||||||
|
emptyRuns++;
|
||||||
|
TimeUnit.SECONDS.sleep(1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info("Shutting down the pool, waiting for tasks to complete...");
|
logger.info("Shutting down the pool, waiting for tasks to complete...");
|
||||||
@@ -312,6 +328,52 @@ public class CrawlerMain extends ProcessMainClass {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Create a comparator that sorts the crawl specs in a way that is beneficial for the crawl,
|
||||||
|
* we want to enqueue domains that have common top domains first, but otherwise have a random
|
||||||
|
* order.
|
||||||
|
* <p></p>
|
||||||
|
* Note, we can't use hash codes for randomization as it is not desirable to have the same order
|
||||||
|
* every time the process is restarted (and CrawlSpecRecord is a record, which defines equals and
|
||||||
|
* hashcode based on the fields).
|
||||||
|
* */
|
||||||
|
private Comparator<CrawlSpecRecord> crawlSpecArrangement(List<CrawlSpecRecord> records) {
|
||||||
|
Random r = new Random();
|
||||||
|
Map<String, Integer> topDomainCounts = new HashMap<>(4 + (int) Math.sqrt(records.size()));
|
||||||
|
Map<String, Integer> randomOrder = new HashMap<>(records.size());
|
||||||
|
|
||||||
|
for (var spec : records) {
|
||||||
|
topDomainCounts.merge(EdgeDomain.getTopDomain(spec.domain), 1, Integer::sum);
|
||||||
|
randomOrder.put(spec.domain, r.nextInt());
|
||||||
|
}
|
||||||
|
|
||||||
|
return Comparator.comparing((CrawlSpecRecord spec) -> topDomainCounts.getOrDefault(EdgeDomain.getTopDomain(spec.domain), 0) >= 8)
|
||||||
|
.reversed()
|
||||||
|
.thenComparing(spec -> randomOrder.get(spec.domain))
|
||||||
|
.thenComparing(Record::hashCode); // tie-breaker for the unlikely case that two domains drew the same random value
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Submit a task for execution if it can be run, returns true if it was submitted
|
||||||
|
* or if it can be discarded */
|
||||||
|
private boolean trySubmitDeferredTask(CrawlTask task) {
|
||||||
|
if (!task.canRun()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (pendingCrawlTasks.putIfAbsent(task.domain, task) != null) {
|
||||||
|
return true; // task has already run, duplicate in crawl specs
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
// This blocks the caller when the pool is full
|
||||||
|
pool.submitQuietly(task);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
catch (RuntimeException ex) {
|
||||||
|
logger.error("Failed to submit task " + task.domain, ex);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public void runForSingleDomain(String targetDomainName, FileStorageId fileStorageId) throws Exception {
|
public void runForSingleDomain(String targetDomainName, FileStorageId fileStorageId) throws Exception {
|
||||||
runForSingleDomain(targetDomainName, fileStorageService.getStorage(fileStorageId).asPath());
|
runForSingleDomain(targetDomainName, fileStorageService.getStorage(fileStorageId).asPath());
|
||||||
}
|
}
|
||||||
@@ -371,72 +433,87 @@ public class CrawlerMain extends ProcessMainClass {
|
|||||||
/** Best effort indicator whether we could start this now without getting stuck in
|
/** Best effort indicator whether we could start this now without getting stuck in
|
||||||
* DomainLocks purgatory */
|
* DomainLocks purgatory */
|
||||||
public boolean canRun() {
|
public boolean canRun() {
|
||||||
return domainLocks.canLock(new EdgeDomain(domain));
|
return domainLocks.isLockableHint(new EdgeDomain(domain));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void run() throws Exception {
|
public void run() throws Exception {
|
||||||
|
|
||||||
Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
|
if (workLog.isJobFinished(domain)) { // No-Op
|
||||||
Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
|
logger.info("Omitting task {}, as it is already run", domain);
|
||||||
Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
|
return;
|
||||||
|
|
||||||
// Move the WARC file to a temp file if it exists, so we can resume the crawl using the old data
|
|
||||||
// while writing to the same file name as before
|
|
||||||
if (Files.exists(newWarcFile)) {
|
|
||||||
Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
Files.deleteIfExists(tempFile);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
try (var warcRecorder = new WarcRecorder(newWarcFile, fetcher); // write to a temp file for now
|
Optional<DomainLocks.DomainLock> lock = domainLocks.tryLockDomain(new EdgeDomain(domain));
|
||||||
var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
|
// We don't have a lock, so we can't run this task
|
||||||
CrawlDataReference reference = getReference()
|
// we return to avoid blocking the pool for too long
|
||||||
)
|
if (lock.isEmpty()) {
|
||||||
{
|
|
||||||
// Resume the crawl if it was aborted
|
|
||||||
if (Files.exists(tempFile)) {
|
|
||||||
retriever.syncAbortedRun(tempFile);
|
|
||||||
Files.delete(tempFile);
|
|
||||||
}
|
|
||||||
|
|
||||||
DomainLinks domainLinks = anchorTagsSource.getAnchorTags(domain);
|
|
||||||
|
|
||||||
int size;
|
|
||||||
try (var lock = domainLocks.lockDomain(new EdgeDomain(domain))) {
|
|
||||||
size = retriever.crawlDomain(domainLinks, reference);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Delete the reference crawl data if it's not the same as the new one
|
|
||||||
// (mostly a case when migrating from legacy->warc)
|
|
||||||
reference.delete();
|
|
||||||
|
|
||||||
// Convert the WARC file to Parquet
|
|
||||||
SlopCrawlDataRecord
|
|
||||||
.convertWarc(domain, userAgent, newWarcFile, slopFile);
|
|
||||||
|
|
||||||
// Optionally archive the WARC file if full retention is enabled,
|
|
||||||
// otherwise delete it:
|
|
||||||
warcArchiver.consumeWarc(newWarcFile, domain);
|
|
||||||
|
|
||||||
// Mark the domain as finished in the work log
|
|
||||||
workLog.setJobToFinished(domain, slopFile.toString(), size);
|
|
||||||
|
|
||||||
// Update the progress bar
|
|
||||||
heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks);
|
|
||||||
|
|
||||||
logger.info("Fetched {}", domain);
|
|
||||||
} catch (Exception e) {
|
|
||||||
logger.error("Error fetching domain " + domain, e);
|
|
||||||
}
|
|
||||||
finally {
|
|
||||||
// We don't need to double-count these; it's also kept int he workLog
|
|
||||||
pendingCrawlTasks.remove(domain);
|
pendingCrawlTasks.remove(domain);
|
||||||
Thread.currentThread().setName("[idle]");
|
retryQueue.put(this);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
DomainLocks.DomainLock domainLock = lock.get();
|
||||||
|
|
||||||
Files.deleteIfExists(newWarcFile);
|
try (domainLock) {
|
||||||
Files.deleteIfExists(tempFile);
|
Thread.currentThread().setName("crawling:" + domain);
|
||||||
|
|
||||||
|
Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
|
||||||
|
Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
|
||||||
|
Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
|
||||||
|
|
||||||
|
// Move the WARC file to a temp file if it exists, so we can resume the crawl using the old data
|
||||||
|
// while writing to the same file name as before
|
||||||
|
if (Files.exists(newWarcFile)) {
|
||||||
|
Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
Files.deleteIfExists(tempFile);
|
||||||
|
}
|
||||||
|
|
||||||
|
try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
|
||||||
|
var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
|
||||||
|
CrawlDataReference reference = getReference())
|
||||||
|
{
|
||||||
|
// Resume the crawl if it was aborted
|
||||||
|
if (Files.exists(tempFile)) {
|
||||||
|
retriever.syncAbortedRun(tempFile);
|
||||||
|
Files.delete(tempFile);
|
||||||
|
}
|
||||||
|
|
||||||
|
DomainLinks domainLinks = anchorTagsSource.getAnchorTags(domain);
|
||||||
|
|
||||||
|
int size = retriever.crawlDomain(domainLinks, reference);
|
||||||
|
|
||||||
|
// Delete the reference crawl data if it's not the same as the new one
|
||||||
|
// (mostly a case when migrating from legacy->warc)
|
||||||
|
reference.delete();
|
||||||
|
|
||||||
|
// Convert the WARC file to Slop
|
||||||
|
SlopCrawlDataRecord
|
||||||
|
.convertWarc(domain, userAgent, newWarcFile, slopFile);
|
||||||
|
|
||||||
|
// Optionally archive the WARC file if full retention is enabled,
|
||||||
|
// otherwise delete it:
|
||||||
|
warcArchiver.consumeWarc(newWarcFile, domain);
|
||||||
|
|
||||||
|
// Mark the domain as finished in the work log
|
||||||
|
workLog.setJobToFinished(domain, slopFile.toString(), size);
|
||||||
|
|
||||||
|
// Update the progress bar
|
||||||
|
heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks);
|
||||||
|
|
||||||
|
logger.info("Fetched {}", domain);
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("Error fetching domain " + domain, e);
|
||||||
|
}
|
||||||
|
finally {
|
||||||
|
// We don't need to double-count these; it's also kept in the workLog
|
||||||
|
pendingCrawlTasks.remove(domain);
|
||||||
|
Thread.currentThread().setName("[idle]");
|
||||||
|
|
||||||
|
Files.deleteIfExists(newWarcFile);
|
||||||
|
Files.deleteIfExists(tempFile);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -453,7 +530,7 @@ public class CrawlerMain extends ProcessMainClass {
|
|||||||
return new CrawlDataReference(slopPath);
|
return new CrawlDataReference(slopPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
} catch (IOException e) {
|
} catch (Exception e) {
|
||||||
logger.debug("Failed to read previous crawl data for {}", specification.domain());
|
logger.debug("Failed to read previous crawl data for {}", specification.domain());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -522,7 +599,7 @@ public class CrawlerMain extends ProcessMainClass {
|
|||||||
//
|
//
|
||||||
// This must be synchronized as chewing through parquet files in parallel leads to enormous memory overhead
|
// This must be synchronized as chewing through parquet files in parallel leads to enormous memory overhead
|
||||||
private synchronized Path migrateParquetData(Path inputPath, String domain, Path crawlDataRoot) throws IOException {
|
private synchronized Path migrateParquetData(Path inputPath, String domain, Path crawlDataRoot) throws IOException {
|
||||||
if (!inputPath.endsWith(".parquet")) {
|
if (!inputPath.toString().endsWith(".parquet")) {
|
||||||
return inputPath;
|
return inputPath;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -1,5 +1,8 @@
|
|||||||
package nu.marginalia.crawl;
|
package nu.marginalia.crawl;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import nu.marginalia.storage.FileStorageService;
|
||||||
|
import nu.marginalia.storage.model.FileStorageType;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@@ -8,6 +11,7 @@ import java.nio.file.Path;
|
|||||||
import java.sql.Connection;
|
import java.sql.Connection;
|
||||||
import java.sql.DriverManager;
|
import java.sql.DriverManager;
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
|
import java.time.Duration;
|
||||||
import java.time.Instant;
|
import java.time.Instant;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
@@ -21,6 +25,17 @@ public class DomainStateDb implements AutoCloseable {
|
|||||||
|
|
||||||
private final Connection connection;
|
private final Connection connection;
|
||||||
|
|
||||||
|
|
||||||
|
public record CrawlMeta(
|
||||||
|
String domainName,
|
||||||
|
Instant lastFullCrawl,
|
||||||
|
Duration recrawlTime,
|
||||||
|
Duration crawlTime,
|
||||||
|
int recrawlErrors,
|
||||||
|
int crawlChanges,
|
||||||
|
int totalCrawlSize
|
||||||
|
) {}
|
||||||
|
|
||||||
public record SummaryRecord(
|
public record SummaryRecord(
|
||||||
String domainName,
|
String domainName,
|
||||||
Instant lastUpdated,
|
Instant lastUpdated,
|
||||||
@@ -63,7 +78,29 @@ public class DomainStateDb implements AutoCloseable {
|
|||||||
|
|
||||||
public record FaviconRecord(String contentType, byte[] imageData) {}
|
public record FaviconRecord(String contentType, byte[] imageData) {}
|
||||||
|
|
||||||
public DomainStateDb(Path filename) throws SQLException {
|
@Inject
|
||||||
|
public DomainStateDb(FileStorageService fileStorageService) throws SQLException {
|
||||||
|
this(findFilename(fileStorageService));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Path findFilename(FileStorageService fileStorageService) throws SQLException {
|
||||||
|
var fsId = fileStorageService.getOnlyActiveFileStorage(FileStorageType.CRAWL_DATA);
|
||||||
|
|
||||||
|
if (fsId.isPresent()) {
|
||||||
|
var fs = fileStorageService.getStorage(fsId.get());
|
||||||
|
return fs.asPath().resolve("domainstate.db");
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public DomainStateDb(@Nullable Path filename) throws SQLException {
|
||||||
|
if (null == filename) {
|
||||||
|
connection = null;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
String sqliteDbString = "jdbc:sqlite:" + filename.toString();
|
String sqliteDbString = "jdbc:sqlite:" + filename.toString();
|
||||||
connection = DriverManager.getConnection(sqliteDbString);
|
connection = DriverManager.getConnection(sqliteDbString);
|
||||||
|
|
||||||
@@ -77,6 +114,17 @@ public class DomainStateDb implements AutoCloseable {
|
|||||||
feedUrl TEXT
|
feedUrl TEXT
|
||||||
)
|
)
|
||||||
""");
|
""");
|
||||||
|
stmt.executeUpdate("""
|
||||||
|
CREATE TABLE IF NOT EXISTS crawl_meta (
|
||||||
|
domain TEXT PRIMARY KEY,
|
||||||
|
lastFullCrawlEpochMs LONG NOT NULL,
|
||||||
|
recrawlTimeMs LONG NOT NULL,
|
||||||
|
recrawlErrors INTEGER NOT NULL,
|
||||||
|
crawlTimeMs LONG NOT NULL,
|
||||||
|
crawlChanges INTEGER NOT NULL,
|
||||||
|
totalCrawlSize INTEGER NOT NULL
|
||||||
|
)
|
||||||
|
""");
|
||||||
stmt.executeUpdate("""
|
stmt.executeUpdate("""
|
||||||
CREATE TABLE IF NOT EXISTS favicon (
|
CREATE TABLE IF NOT EXISTS favicon (
|
||||||
domain TEXT PRIMARY KEY,
|
domain TEXT PRIMARY KEY,
|
||||||
@@ -90,11 +138,18 @@ public class DomainStateDb implements AutoCloseable {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void close() throws SQLException {
|
public void close() throws SQLException {
|
||||||
connection.close();
|
if (connection != null) {
|
||||||
|
connection.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean isAvailable() {
|
||||||
|
return connection != null;
|
||||||
|
}
|
||||||
|
|
||||||
public void saveIcon(String domain, FaviconRecord faviconRecord) {
|
public void saveIcon(String domain, FaviconRecord faviconRecord) {
|
||||||
|
if (connection == null) throw new IllegalStateException("No connection to domainstate db");
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("""
|
try (var stmt = connection.prepareStatement("""
|
||||||
INSERT OR REPLACE INTO favicon (domain, contentType, icon)
|
INSERT OR REPLACE INTO favicon (domain, contentType, icon)
|
||||||
VALUES(?, ?, ?)
|
VALUES(?, ?, ?)
|
||||||
@@ -110,6 +165,9 @@ public class DomainStateDb implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public Optional<FaviconRecord> getIcon(String domain) {
|
public Optional<FaviconRecord> getIcon(String domain) {
|
||||||
|
if (connection == null)
|
||||||
|
return Optional.empty();
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT contentType, icon FROM favicon WHERE DOMAIN = ?")) {
|
try (var stmt = connection.prepareStatement("SELECT contentType, icon FROM favicon WHERE DOMAIN = ?")) {
|
||||||
stmt.setString(1, domain);
|
stmt.setString(1, domain);
|
||||||
var rs = stmt.executeQuery();
|
var rs = stmt.executeQuery();
|
||||||
@@ -129,7 +187,29 @@ public class DomainStateDb implements AutoCloseable {
|
|||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void save(CrawlMeta crawlMeta) {
|
||||||
|
if (connection == null) throw new IllegalStateException("No connection to domainstate db");
|
||||||
|
|
||||||
|
try (var stmt = connection.prepareStatement("""
|
||||||
|
INSERT OR REPLACE INTO crawl_meta (domain, lastFullCrawlEpochMs, recrawlTimeMs, recrawlErrors, crawlTimeMs, crawlChanges, totalCrawlSize)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||||
|
""")) {
|
||||||
|
stmt.setString(1, crawlMeta.domainName());
|
||||||
|
stmt.setLong(2, crawlMeta.lastFullCrawl.toEpochMilli());
|
||||||
|
stmt.setLong(3, crawlMeta.recrawlTime.toMillis());
|
||||||
|
stmt.setInt(4, crawlMeta.recrawlErrors);
|
||||||
|
stmt.setLong(5, crawlMeta.crawlTime.toMillis());
|
||||||
|
stmt.setInt(6, crawlMeta.crawlChanges);
|
||||||
|
stmt.setInt(7, crawlMeta.totalCrawlSize);
|
||||||
|
stmt.executeUpdate();
|
||||||
|
} catch (SQLException e) {
|
||||||
|
logger.error("Failed to insert crawl meta record", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public void save(SummaryRecord record) {
|
public void save(SummaryRecord record) {
|
||||||
|
if (connection == null) throw new IllegalStateException("No connection to domainstate db");
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("""
|
try (var stmt = connection.prepareStatement("""
|
||||||
INSERT OR REPLACE INTO summary (domain, lastUpdatedEpochMs, state, stateDesc, feedUrl)
|
INSERT OR REPLACE INTO summary (domain, lastUpdatedEpochMs, state, stateDesc, feedUrl)
|
||||||
VALUES (?, ?, ?, ?, ?)
|
VALUES (?, ?, ?, ?, ?)
|
||||||
@@ -145,7 +225,38 @@ public class DomainStateDb implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public Optional<SummaryRecord> get(String domainName) {
|
public Optional<CrawlMeta> getMeta(String domainName) {
|
||||||
|
if (connection == null)
|
||||||
|
return Optional.empty();
|
||||||
|
|
||||||
|
try (var stmt = connection.prepareStatement("""
|
||||||
|
SELECT domain, lastFullCrawlEpochMs, recrawlTimeMs, recrawlErrors, crawlTimeMs, crawlChanges, totalCrawlSize
|
||||||
|
FROM crawl_meta
|
||||||
|
WHERE domain = ?
|
||||||
|
""")) {
|
||||||
|
stmt.setString(1, domainName);
|
||||||
|
var rs = stmt.executeQuery();
|
||||||
|
if (rs.next()) {
|
||||||
|
return Optional.of(new CrawlMeta(
|
||||||
|
rs.getString("domain"),
|
||||||
|
Instant.ofEpochMilli(rs.getLong("lastFullCrawlEpochMs")),
|
||||||
|
Duration.ofMillis(rs.getLong("recrawlTimeMs")),
|
||||||
|
Duration.ofMillis(rs.getLong("crawlTimeMs")),
|
||||||
|
rs.getInt("recrawlErrors"),
|
||||||
|
rs.getInt("crawlChanges"),
|
||||||
|
rs.getInt("totalCrawlSize")
|
||||||
|
));
|
||||||
|
}
|
||||||
|
} catch (SQLException ex) {
|
||||||
|
logger.error("Failed to get crawl meta record", ex);
|
||||||
|
}
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
public Optional<SummaryRecord> getSummary(String domainName) {
|
||||||
|
if (connection == null)
|
||||||
|
return Optional.empty();
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("""
|
try (var stmt = connection.prepareStatement("""
|
||||||
SELECT domain, lastUpdatedEpochMs, state, stateDesc, feedUrl
|
SELECT domain, lastUpdatedEpochMs, state, stateDesc, feedUrl
|
||||||
FROM summary
|
FROM summary
|
||||||
|
@@ -1,6 +1,6 @@
|
|||||||
package nu.marginalia.crawl.fetcher;
|
package nu.marginalia.crawl.fetcher;
|
||||||
|
|
||||||
import java.net.http.HttpRequest;
|
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||||
|
|
||||||
/** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */
|
/** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */
|
||||||
public record ContentTags(String etag, String lastMod) {
|
public record ContentTags(String etag, String lastMod) {
|
||||||
@@ -17,14 +17,16 @@ public record ContentTags(String etag, String lastMod) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/** Paints the tags onto the request builder. */
|
/** Paints the tags onto the request builder. */
|
||||||
public void paint(HttpRequest.Builder getBuilder) {
|
public void paint(HttpGet request) {
|
||||||
|
|
||||||
|
// Paint the ETag header if present,
|
||||||
|
// otherwise paint the Last-Modified header
|
||||||
|
// (but not both at the same time due to some servers not liking it)
|
||||||
|
|
||||||
if (etag != null) {
|
if (etag != null) {
|
||||||
getBuilder.header("If-None-Match", etag);
|
request.addHeader("If-None-Match", etag);
|
||||||
}
|
} else if (lastMod != null) {
|
||||||
|
request.addHeader("If-Modified-Since", lastMod);
|
||||||
if (lastMod != null) {
|
|
||||||
getBuilder.header("If-Modified-Since", lastMod);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -1,34 +0,0 @@
|
|||||||
package nu.marginalia.crawl.fetcher;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.net.CookieHandler;
|
|
||||||
import java.net.URI;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
|
||||||
|
|
||||||
public class Cookies extends CookieHandler {
|
|
||||||
final ThreadLocal<ConcurrentHashMap<String, List<String>>> cookieJar = ThreadLocal.withInitial(ConcurrentHashMap::new);
|
|
||||||
|
|
||||||
public void clear() {
|
|
||||||
cookieJar.get().clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean hasCookies() {
|
|
||||||
return !cookieJar.get().isEmpty();
|
|
||||||
}
|
|
||||||
|
|
||||||
public List<String> getCookies() {
|
|
||||||
return cookieJar.get().values().stream().flatMap(List::stream).toList();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Map<String, List<String>> get(URI uri, Map<String, List<String>> requestHeaders) throws IOException {
|
|
||||||
return cookieJar.get();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void put(URI uri, Map<String, List<String>> responseHeaders) throws IOException {
|
|
||||||
cookieJar.get().putAll(responseHeaders);
|
|
||||||
}
|
|
||||||
}
|
|
@@ -0,0 +1,56 @@
|
|||||||
|
package nu.marginalia.crawl.fetcher;
|
||||||
|
|
||||||
|
import org.apache.hc.client5.http.classic.methods.HttpUriRequestBase;
|
||||||
|
import org.apache.hc.core5.http.ClassicHttpRequest;
|
||||||
|
import org.apache.hc.core5.http.HttpResponse;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.StringJoiner;
|
||||||
|
|
||||||
|
public class DomainCookies {
|
||||||
|
private final Map<String, String> cookies = new HashMap<>();
|
||||||
|
|
||||||
|
public boolean hasCookies() {
|
||||||
|
return !cookies.isEmpty();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void updateCookieStore(HttpResponse response) {
|
||||||
|
for (var header : response.getHeaders()) {
|
||||||
|
if (header.getName().equalsIgnoreCase("Set-Cookie")) {
|
||||||
|
parseCookieHeader(header.getValue());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void parseCookieHeader(String value) {
|
||||||
|
// Parse the Set-Cookie header value and extract the cookies
|
||||||
|
|
||||||
|
String[] parts = value.split(";");
|
||||||
|
String cookie = parts[0].trim();
|
||||||
|
|
||||||
|
if (cookie.contains("=")) {
|
||||||
|
String[] cookieParts = cookie.split("=");
|
||||||
|
String name = cookieParts[0].trim();
|
||||||
|
String val = cookieParts[1].trim();
|
||||||
|
cookies.put(name, val);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void paintRequest(HttpUriRequestBase request) {
|
||||||
|
request.addHeader("Cookie", createCookieHeader());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void paintRequest(ClassicHttpRequest request) {
|
||||||
|
request.addHeader("Cookie", createCookieHeader());
|
||||||
|
}
|
||||||
|
|
||||||
|
private String createCookieHeader() {
|
||||||
|
StringJoiner sj = new StringJoiner("; ");
|
||||||
|
for (var cookie : cookies.entrySet()) {
|
||||||
|
sj.add(cookie.getKey() + "=" + cookie.getValue());
|
||||||
|
}
|
||||||
|
return sj.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@@ -8,6 +8,7 @@ import nu.marginalia.model.EdgeDomain;
|
|||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.body.HttpFetchResult;
|
import nu.marginalia.model.body.HttpFetchResult;
|
||||||
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
|
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
|
||||||
|
import org.apache.hc.client5.http.cookie.CookieStore;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
@@ -15,20 +16,17 @@ import java.util.List;
|
|||||||
public interface HttpFetcher extends AutoCloseable {
|
public interface HttpFetcher extends AutoCloseable {
|
||||||
void setAllowAllContentTypes(boolean allowAllContentTypes);
|
void setAllowAllContentTypes(boolean allowAllContentTypes);
|
||||||
|
|
||||||
Cookies getCookies();
|
CookieStore getCookies();
|
||||||
void clearCookies();
|
void clearCookies();
|
||||||
|
|
||||||
DomainProbeResult probeDomain(EdgeUrl url);
|
DomainProbeResult probeDomain(EdgeUrl url);
|
||||||
|
|
||||||
ContentTypeProbeResult probeContentType(
|
|
||||||
EdgeUrl url,
|
|
||||||
WarcRecorder recorder,
|
|
||||||
ContentTags tags) throws HttpFetcherImpl.RateLimitException;
|
|
||||||
|
|
||||||
HttpFetchResult fetchContent(EdgeUrl url,
|
HttpFetchResult fetchContent(EdgeUrl url,
|
||||||
WarcRecorder recorder,
|
WarcRecorder recorder,
|
||||||
|
DomainCookies cookies,
|
||||||
|
CrawlDelayTimer timer,
|
||||||
ContentTags tags,
|
ContentTags tags,
|
||||||
ProbeType probeType) throws Exception;
|
ProbeType probeType);
|
||||||
|
|
||||||
List<EdgeUrl> fetchSitemapUrls(String rootSitemapUrl, CrawlDelayTimer delayTimer);
|
List<EdgeUrl> fetchSitemapUrls(String rootSitemapUrl, CrawlDelayTimer delayTimer);
|
||||||
|
|
||||||
@@ -46,6 +44,7 @@ public interface HttpFetcher extends AutoCloseable {
|
|||||||
|
|
||||||
/** This domain redirects to another domain */
|
/** This domain redirects to another domain */
|
||||||
record Redirect(EdgeDomain domain) implements DomainProbeResult {}
|
record Redirect(EdgeDomain domain) implements DomainProbeResult {}
|
||||||
|
record RedirectSameDomain_Internal(EdgeUrl domain) implements DomainProbeResult {}
|
||||||
|
|
||||||
/** If the retrieval of the probed url was successful, return the url as it was fetched
|
/** If the retrieval of the probed url was successful, return the url as it was fetched
|
||||||
* (which may be different from the url we probed, if we attempted another URL schema).
|
* (which may be different from the url we probed, if we attempted another URL schema).
|
||||||
@@ -56,7 +55,10 @@ public interface HttpFetcher extends AutoCloseable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
sealed interface ContentTypeProbeResult {
|
sealed interface ContentTypeProbeResult {
|
||||||
|
record NoOp() implements ContentTypeProbeResult {}
|
||||||
record Ok(EdgeUrl resolvedUrl) implements ContentTypeProbeResult { }
|
record Ok(EdgeUrl resolvedUrl) implements ContentTypeProbeResult { }
|
||||||
|
record HttpError(int statusCode, String message) implements ContentTypeProbeResult { }
|
||||||
|
record Redirect(EdgeUrl location) implements ContentTypeProbeResult { }
|
||||||
record BadContentType(String contentType, int statusCode) implements ContentTypeProbeResult { }
|
record BadContentType(String contentType, int statusCode) implements ContentTypeProbeResult { }
|
||||||
record Timeout(java.lang.Exception ex) implements ContentTypeProbeResult { }
|
record Timeout(java.lang.Exception ex) implements ContentTypeProbeResult { }
|
||||||
record Exception(java.lang.Exception ex) implements ContentTypeProbeResult { }
|
record Exception(java.lang.Exception ex) implements ContentTypeProbeResult { }
|
||||||
|
@@ -5,67 +5,169 @@ import com.google.inject.Singleton;
|
|||||||
import crawlercommons.robots.SimpleRobotRules;
|
import crawlercommons.robots.SimpleRobotRules;
|
||||||
import crawlercommons.robots.SimpleRobotRulesParser;
|
import crawlercommons.robots.SimpleRobotRulesParser;
|
||||||
import nu.marginalia.UserAgent;
|
import nu.marginalia.UserAgent;
|
||||||
import nu.marginalia.crawl.fetcher.socket.NoSecuritySSL;
|
|
||||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||||
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
|
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
|
||||||
|
import nu.marginalia.link_parser.LinkParser;
|
||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.body.ContentTypeLogic;
|
import nu.marginalia.model.body.ContentTypeLogic;
|
||||||
import nu.marginalia.model.body.DocumentBodyExtractor;
|
import nu.marginalia.model.body.DocumentBodyExtractor;
|
||||||
import nu.marginalia.model.body.HttpFetchResult;
|
import nu.marginalia.model.body.HttpFetchResult;
|
||||||
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
|
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
|
||||||
|
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
|
||||||
|
import org.apache.hc.client5.http.HttpRequestRetryStrategy;
|
||||||
|
import org.apache.hc.client5.http.classic.HttpClient;
|
||||||
|
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||||
|
import org.apache.hc.client5.http.config.ConnectionConfig;
|
||||||
|
import org.apache.hc.client5.http.config.RequestConfig;
|
||||||
|
import org.apache.hc.client5.http.cookie.BasicCookieStore;
|
||||||
|
import org.apache.hc.client5.http.cookie.CookieStore;
|
||||||
|
import org.apache.hc.client5.http.cookie.StandardCookieSpec;
|
||||||
|
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
|
||||||
|
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
||||||
|
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager;
|
||||||
|
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManagerBuilder;
|
||||||
|
import org.apache.hc.client5.http.ssl.DefaultClientTlsStrategy;
|
||||||
|
import org.apache.hc.core5.http.*;
|
||||||
|
import org.apache.hc.core5.http.io.HttpClientResponseHandler;
|
||||||
|
import org.apache.hc.core5.http.io.SocketConfig;
|
||||||
|
import org.apache.hc.core5.http.io.entity.EntityUtils;
|
||||||
|
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
|
||||||
|
import org.apache.hc.core5.http.message.MessageSupport;
|
||||||
|
import org.apache.hc.core5.http.protocol.HttpContext;
|
||||||
|
import org.apache.hc.core5.pool.PoolStats;
|
||||||
|
import org.apache.hc.core5.util.TimeValue;
|
||||||
|
import org.apache.hc.core5.util.Timeout;
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
import org.jsoup.parser.Parser;
|
import org.jsoup.parser.Parser;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
import org.slf4j.Marker;
|
||||||
|
import org.slf4j.MarkerFactory;
|
||||||
|
|
||||||
|
import javax.net.ssl.SSLContext;
|
||||||
|
import javax.net.ssl.SSLException;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.net.SocketTimeoutException;
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
import java.net.http.HttpClient;
|
import java.net.UnknownHostException;
|
||||||
import java.net.http.HttpRequest;
|
import java.security.NoSuchAlgorithmException;
|
||||||
import java.net.http.HttpResponse;
|
|
||||||
import java.net.http.HttpTimeoutException;
|
|
||||||
import java.time.Duration;
|
import java.time.Duration;
|
||||||
|
import java.time.Instant;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.concurrent.Executors;
|
import java.util.concurrent.Semaphore;
|
||||||
import java.util.zip.GZIPInputStream;
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
|
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
public class HttpFetcherImpl implements HttpFetcher {
|
public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
|
||||||
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
private final String userAgentString;
|
private final String userAgentString;
|
||||||
private final String userAgentIdentifier;
|
private final String userAgentIdentifier;
|
||||||
private final Cookies cookies = new Cookies();
|
|
||||||
|
private final CookieStore cookies = new BasicCookieStore();
|
||||||
|
|
||||||
private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
|
private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
|
||||||
private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
|
private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
|
||||||
|
private final Marker crawlerAuditMarker = MarkerFactory.getMarker("CRAWLER");
|
||||||
|
|
||||||
private final Duration requestTimeout = Duration.ofSeconds(10);
|
private final LinkParser linkParser = new LinkParser();
|
||||||
private final Duration probeTimeout = Duration.ofSeconds(30);
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setAllowAllContentTypes(boolean allowAllContentTypes) {
|
public void setAllowAllContentTypes(boolean allowAllContentTypes) {
|
||||||
contentTypeLogic.setAllowAllContentTypes(allowAllContentTypes);
|
contentTypeLogic.setAllowAllContentTypes(allowAllContentTypes);
|
||||||
}
|
}
|
||||||
|
|
||||||
private final HttpClient client;
|
private final CloseableHttpClient client;
|
||||||
|
private PoolingHttpClientConnectionManager connectionManager;
|
||||||
|
|
||||||
private HttpClient createClient() {
|
public PoolStats getPoolStats() {
|
||||||
return HttpClient.newBuilder()
|
return connectionManager.getTotalStats();
|
||||||
.sslContext(NoSecuritySSL.buildSslContext())
|
}
|
||||||
.cookieHandler(cookies)
|
|
||||||
.followRedirects(HttpClient.Redirect.NORMAL)
|
private CloseableHttpClient createClient() throws NoSuchAlgorithmException {
|
||||||
.connectTimeout(Duration.ofSeconds(8))
|
final ConnectionConfig connectionConfig = ConnectionConfig.custom()
|
||||||
.executor(Executors.newCachedThreadPool())
|
.setSocketTimeout(10, TimeUnit.SECONDS)
|
||||||
|
.setConnectTimeout(30, TimeUnit.SECONDS)
|
||||||
|
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
|
||||||
|
.build();
|
||||||
|
|
||||||
|
connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
|
||||||
|
.setMaxConnPerRoute(2)
|
||||||
|
.setMaxConnTotal(5000)
|
||||||
|
.setDefaultConnectionConfig(connectionConfig)
|
||||||
|
.setTlsSocketStrategy(new DefaultClientTlsStrategy(SSLContext.getDefault()))
|
||||||
|
.build();
|
||||||
|
|
||||||
|
connectionManager.setDefaultSocketConfig(SocketConfig.custom()
|
||||||
|
.setSoLinger(TimeValue.ofSeconds(-1))
|
||||||
|
.setSoTimeout(Timeout.ofSeconds(10))
|
||||||
|
.build()
|
||||||
|
);
|
||||||
|
|
||||||
|
Thread.ofPlatform().daemon(true).start(() -> {
|
||||||
|
try {
|
||||||
|
for (;;) {
|
||||||
|
TimeUnit.SECONDS.sleep(15);
|
||||||
|
logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (InterruptedException e) {
|
||||||
|
Thread.currentThread().interrupt();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
final RequestConfig defaultRequestConfig = RequestConfig.custom()
|
||||||
|
.setCookieSpec(StandardCookieSpec.RELAXED)
|
||||||
|
.setResponseTimeout(10, TimeUnit.SECONDS)
|
||||||
|
.setConnectionRequestTimeout(5, TimeUnit.MINUTES)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
return HttpClients.custom()
|
||||||
|
.setDefaultCookieStore(cookies)
|
||||||
|
.setConnectionManager(connectionManager)
|
||||||
|
.setRetryStrategy(this)
|
||||||
|
.setKeepAliveStrategy(new ConnectionKeepAliveStrategy() {
|
||||||
|
// Default keep-alive duration is 3 minutes, but this is too long for us,
|
||||||
|
// as we are either going to re-use it fairly quickly or close it for a long time.
|
||||||
|
//
|
||||||
|
// So we set it to 30 seconds or clamp the server-provided value to a minimum of 10 seconds.
|
||||||
|
private static final TimeValue defaultValue = TimeValue.ofSeconds(30);
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TimeValue getKeepAliveDuration(HttpResponse response, HttpContext context) {
|
||||||
|
final Iterator<HeaderElement> it = MessageSupport.iterate(response, HeaderElements.KEEP_ALIVE);
|
||||||
|
|
||||||
|
while (it.hasNext()) {
|
||||||
|
final HeaderElement he = it.next();
|
||||||
|
final String param = he.getName();
|
||||||
|
final String value = he.getValue();
|
||||||
|
|
||||||
|
if (value == null)
|
||||||
|
continue;
|
||||||
|
if (!"timeout".equalsIgnoreCase(param))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
try {
|
||||||
|
long timeout = Long.parseLong(value);
|
||||||
|
timeout = Math.clamp(timeout, 30, defaultValue.toSeconds());
|
||||||
|
return TimeValue.ofSeconds(timeout);
|
||||||
|
} catch (final NumberFormatException ignore) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return defaultValue;
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.disableRedirectHandling()
|
||||||
|
.setDefaultRequestConfig(defaultRequestConfig)
|
||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Cookies getCookies() {
|
public CookieStore getCookies() {
|
||||||
return cookies;
|
return cookies;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -77,19 +179,27 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
@Inject
|
@Inject
|
||||||
public HttpFetcherImpl(UserAgent userAgent)
|
public HttpFetcherImpl(UserAgent userAgent)
|
||||||
{
|
{
|
||||||
this.client = createClient();
|
try {
|
||||||
|
this.client = createClient();
|
||||||
|
} catch (NoSuchAlgorithmException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
this.userAgentString = userAgent.uaString();
|
this.userAgentString = userAgent.uaString();
|
||||||
this.userAgentIdentifier = userAgent.uaIdentifier();
|
this.userAgentIdentifier = userAgent.uaIdentifier();
|
||||||
}
|
}
|
||||||
|
|
||||||
public HttpFetcherImpl(String userAgent) {
|
public HttpFetcherImpl(String userAgent) {
|
||||||
this.client = createClient();
|
try {
|
||||||
|
this.client = createClient();
|
||||||
|
} catch (NoSuchAlgorithmException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
this.userAgentString = userAgent;
|
this.userAgentString = userAgent;
|
||||||
this.userAgentIdentifier = userAgent;
|
this.userAgentIdentifier = userAgent;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Not necessary in prod, but useful in test
|
// Not necessary in prod, but useful in test
|
||||||
public void close() {
|
public void close() throws IOException {
|
||||||
client.close();
|
client.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -102,34 +212,94 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public DomainProbeResult probeDomain(EdgeUrl url) {
|
public DomainProbeResult probeDomain(EdgeUrl url) {
|
||||||
HttpRequest head;
|
List<EdgeUrl> urls = new ArrayList<>();
|
||||||
try {
|
urls.add(url);
|
||||||
head = HttpRequest.newBuilder()
|
|
||||||
.HEAD()
|
|
||||||
.uri(url.asURI())
|
|
||||||
.header("User-agent", userAgentString)
|
|
||||||
.timeout(probeTimeout)
|
|
||||||
.build();
|
|
||||||
} catch (URISyntaxException e) {
|
|
||||||
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int tries = 0;; tries++) {
|
int redirects = 0;
|
||||||
|
AtomicBoolean tryGet = new AtomicBoolean(false);
|
||||||
|
|
||||||
|
while (!urls.isEmpty() && ++redirects < 5) {
|
||||||
|
ClassicHttpRequest request;
|
||||||
|
|
||||||
|
EdgeUrl topUrl = urls.removeFirst();
|
||||||
try {
|
try {
|
||||||
var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
|
if (tryGet.get()) {
|
||||||
EdgeUrl rspUri = new EdgeUrl(rsp.uri());
|
request = ClassicRequestBuilder.get(topUrl.asURI())
|
||||||
|
.addHeader("User-Agent", userAgentString)
|
||||||
if (!Objects.equals(rspUri.domain, url.domain)) {
|
.addHeader("Accept-Encoding", "gzip")
|
||||||
return new DomainProbeResult.Redirect(rspUri.domain);
|
.addHeader("Range", "bytes=0-255")
|
||||||
|
.build();
|
||||||
|
} else {
|
||||||
|
request = ClassicRequestBuilder.head(topUrl.asURI())
|
||||||
|
.addHeader("User-Agent", userAgentString)
|
||||||
|
.addHeader("Accept-Encoding", "gzip")
|
||||||
|
.build();
|
||||||
}
|
}
|
||||||
return new DomainProbeResult.Ok(rspUri);
|
} catch (URISyntaxException e) {
|
||||||
} catch (Exception ex) {
|
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
|
||||||
if (tries > 3) {
|
|
||||||
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
|
|
||||||
}
|
|
||||||
// else try again ...
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
var result = SendLock.wrapSend(client, request, response -> {
|
||||||
|
EntityUtils.consume(response.getEntity());
|
||||||
|
|
||||||
|
return switch (response.getCode()) {
|
||||||
|
case 200 -> new DomainProbeResult.Ok(url);
|
||||||
|
case 405 -> {
|
||||||
|
if (!tryGet.get()) {
|
||||||
|
tryGet.set(true);
|
||||||
|
yield new DomainProbeResult.RedirectSameDomain_Internal(url);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
yield new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "HTTP status 405, tried HEAD and GET?!");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case 301, 302, 307 -> {
|
||||||
|
var location = response.getFirstHeader("Location");
|
||||||
|
|
||||||
|
if (location != null) {
|
||||||
|
Optional<EdgeUrl> newUrl = linkParser.parseLink(topUrl, location.getValue());
|
||||||
|
if (newUrl.isEmpty()) {
|
||||||
|
yield new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid location header on redirect");
|
||||||
|
}
|
||||||
|
EdgeUrl newEdgeUrl = newUrl.get();
|
||||||
|
if (newEdgeUrl.domain.equals(topUrl.domain)) {
|
||||||
|
yield new DomainProbeResult.RedirectSameDomain_Internal(newEdgeUrl);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
yield new DomainProbeResult.Redirect(newEdgeUrl.domain);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
yield new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "No location header on redirect");
|
||||||
|
|
||||||
|
}
|
||||||
|
default ->
|
||||||
|
new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "HTTP status " + response.getCode());
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
|
if (result instanceof DomainProbeResult.RedirectSameDomain_Internal(EdgeUrl redirUrl)) {
|
||||||
|
urls.add(redirUrl);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// We don't have robots.txt yet, so we'll assume a request delay of 1 second
|
||||||
|
TimeUnit.SECONDS.sleep(1);
|
||||||
|
}
|
||||||
|
catch (SocketTimeoutException ex) {
|
||||||
|
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Timeout during domain probe");
|
||||||
|
}
|
||||||
|
catch (Exception ex) {
|
||||||
|
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Error during domain probe");
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Failed to resolve domain root");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Perform a HEAD request to fetch the content type of a URL.
|
/** Perform a HEAD request to fetch the content type of a URL.
|
||||||
@@ -140,70 +310,73 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
* recorded in the WARC file on failure.
|
* recorded in the WARC file on failure.
|
||||||
*/
|
*/
|
||||||
public ContentTypeProbeResult probeContentType(EdgeUrl url,
|
public ContentTypeProbeResult probeContentType(EdgeUrl url,
|
||||||
WarcRecorder warcRecorder,
|
DomainCookies cookies,
|
||||||
ContentTags tags) throws RateLimitException {
|
CrawlDelayTimer timer,
|
||||||
if (tags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) {
|
ContentTags tags) {
|
||||||
|
if (!tags.isEmpty() || !contentTypeLogic.isUrlLikeBinary(url)) {
|
||||||
try {
|
return new ContentTypeProbeResult.NoOp();
|
||||||
var headBuilder = HttpRequest.newBuilder()
|
}
|
||||||
.HEAD()
|
|
||||||
.uri(url.asURI())
|
try {
|
||||||
.header("User-Agent", userAgentString)
|
ClassicHttpRequest head = ClassicRequestBuilder.head(url.asURI())
|
||||||
.header("Accept-Encoding", "gzip")
|
.addHeader("User-Agent", userAgentString)
|
||||||
.timeout(requestTimeout)
|
.addHeader("Accept-Encoding", "gzip")
|
||||||
;
|
.build();
|
||||||
|
|
||||||
var rsp = client.send(headBuilder.build(), HttpResponse.BodyHandlers.discarding());
|
cookies.paintRequest(head);
|
||||||
var headers = rsp.headers();
|
|
||||||
|
return SendLock.wrapSend(client, head, (rsp) -> {
|
||||||
var contentTypeHeader = headers.firstValue("Content-Type").orElse(null);
|
cookies.updateCookieStore(rsp);
|
||||||
|
EntityUtils.consume(rsp.getEntity());
|
||||||
if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
|
int statusCode = rsp.getCode();
|
||||||
warcRecorder.flagAsFailedContentTypeProbe(url, contentTypeHeader, rsp.statusCode());
|
|
||||||
|
// Handle redirects
|
||||||
return new ContentTypeProbeResult.BadContentType(contentTypeHeader, rsp.statusCode());
|
if (statusCode == 301 || statusCode == 302 || statusCode == 307) {
|
||||||
}
|
var location = rsp.getFirstHeader("Location");
|
||||||
|
if (location != null) {
|
||||||
// Update the URL to the final URL of the HEAD request, otherwise we might end up doing
|
Optional<EdgeUrl> newUrl = linkParser.parseLink(url, location.getValue());
|
||||||
|
if (newUrl.isEmpty())
|
||||||
// HEAD 301 url1 -> url2
|
return new ContentTypeProbeResult.HttpError(statusCode, "Invalid location header on redirect");
|
||||||
// HEAD 200 url2
|
return new ContentTypeProbeResult.Redirect(newUrl.get());
|
||||||
// GET 301 url1 -> url2
|
}
|
||||||
// GET 200 url2
|
}
|
||||||
|
|
||||||
// which is not what we want. Overall we want to do as few requests as possible to not raise
|
if (statusCode == 405) {
|
||||||
// too many eyebrows when looking at the logs on the target server. Overall it's probably desirable
|
// If we get a 405, we can't probe the content type with HEAD, so we'll just say it's ok
|
||||||
// that it looks like the traffic makes sense, as opposed to looking like a broken bot.
|
return new ContentTypeProbeResult.Ok(url);
|
||||||
|
}
|
||||||
var redirectUrl = new EdgeUrl(rsp.uri());
|
|
||||||
EdgeUrl ret;
|
// Handle errors
|
||||||
|
if (statusCode < 200 || statusCode > 300) {
|
||||||
if (Objects.equals(redirectUrl.domain, url.domain)) ret = redirectUrl;
|
return new ContentTypeProbeResult.HttpError(statusCode, "Bad status code");
|
||||||
else ret = url;
|
}
|
||||||
|
|
||||||
// Intercept rate limiting
|
// Handle missing content type
|
||||||
if (rsp.statusCode() == 429) {
|
var ctHeader = rsp.getFirstHeader("Content-Type");
|
||||||
throw new HttpFetcherImpl.RateLimitException(headers.firstValue("Retry-After").orElse("1"));
|
if (ctHeader == null) {
|
||||||
}
|
return new ContentTypeProbeResult.HttpError(statusCode, "Missing Content-Type header");
|
||||||
|
}
|
||||||
return new ContentTypeProbeResult.Ok(ret);
|
var contentType = ctHeader.getValue();
|
||||||
}
|
|
||||||
catch (HttpTimeoutException ex) {
|
// Check if the content type is allowed
|
||||||
warcRecorder.flagAsTimeout(url);
|
if (contentTypeLogic.isAllowableContentType(contentType)) {
|
||||||
return new ContentTypeProbeResult.Timeout(ex);
|
return new ContentTypeProbeResult.Ok(url);
|
||||||
}
|
} else {
|
||||||
catch (RateLimitException ex) {
|
return new ContentTypeProbeResult.BadContentType(contentType, statusCode);
|
||||||
throw ex;
|
}
|
||||||
}
|
});
|
||||||
catch (Exception ex) {
|
}
|
||||||
logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
|
catch (SocketTimeoutException ex) {
|
||||||
|
|
||||||
warcRecorder.flagAsError(url, ex);
|
return new ContentTypeProbeResult.Timeout(ex);
|
||||||
|
}
|
||||||
return new ContentTypeProbeResult.Exception(ex);
|
catch (Exception ex) {
|
||||||
}
|
logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
|
||||||
|
return new ContentTypeProbeResult.Exception(ex);
|
||||||
|
}
|
||||||
|
finally {
|
||||||
|
timer.waitFetchDelay();
|
||||||
}
|
}
|
||||||
return new ContentTypeProbeResult.Ok(url);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Fetch the content of a URL, and record it in a WARC file,
|
/** Fetch the content of a URL, and record it in a WARC file,
|
||||||
@@ -213,37 +386,87 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
@Override
|
@Override
|
||||||
public HttpFetchResult fetchContent(EdgeUrl url,
|
public HttpFetchResult fetchContent(EdgeUrl url,
|
||||||
WarcRecorder warcRecorder,
|
WarcRecorder warcRecorder,
|
||||||
|
DomainCookies cookies,
|
||||||
|
CrawlDelayTimer timer,
|
||||||
ContentTags contentTags,
|
ContentTags contentTags,
|
||||||
ProbeType probeType)
|
ProbeType probeType)
|
||||||
throws Exception
|
|
||||||
{
|
{
|
||||||
var getBuilder = HttpRequest.newBuilder()
|
try {
|
||||||
.GET()
|
if (probeType == HttpFetcher.ProbeType.FULL) {
|
||||||
.uri(url.asURI())
|
try {
|
||||||
.header("User-Agent", userAgentString)
|
var probeResult = probeContentType(url, cookies, timer, contentTags);
|
||||||
.header("Accept-Encoding", "gzip")
|
|
||||||
.header("Accept-Language", "en,*;q=0.5")
|
|
||||||
.header("Accept", "text/html, application/xhtml+xml, text/*;q=0.8")
|
|
||||||
.timeout(requestTimeout)
|
|
||||||
;
|
|
||||||
|
|
||||||
contentTags.paint(getBuilder);
|
switch (probeResult) {
|
||||||
|
case HttpFetcher.ContentTypeProbeResult.NoOp():
|
||||||
|
break; //
|
||||||
|
case HttpFetcher.ContentTypeProbeResult.Ok(EdgeUrl resolvedUrl):
|
||||||
|
logger.info(crawlerAuditMarker, "Probe result OK for {}", url);
|
||||||
|
url = resolvedUrl; // If we were redirected while probing, use the final URL for fetching
|
||||||
|
break;
|
||||||
|
case ContentTypeProbeResult.BadContentType badContentType:
|
||||||
|
warcRecorder.flagAsFailedContentTypeProbe(url, badContentType.contentType(), badContentType.statusCode());
|
||||||
|
logger.info(crawlerAuditMarker, "Probe result Bad ContenType ({}) for {}", badContentType.contentType(), url);
|
||||||
|
return new HttpFetchResult.ResultNone();
|
||||||
|
case ContentTypeProbeResult.BadContentType.Timeout(Exception ex):
|
||||||
|
logger.info(crawlerAuditMarker, "Probe result Timeout for {}", url);
|
||||||
|
warcRecorder.flagAsTimeout(url);
|
||||||
|
return new HttpFetchResult.ResultException(ex);
|
||||||
|
case ContentTypeProbeResult.Exception(Exception ex):
|
||||||
|
logger.info(crawlerAuditMarker, "Probe result Exception({}) for {}", ex.getClass().getSimpleName(), url);
|
||||||
|
warcRecorder.flagAsError(url, ex);
|
||||||
|
return new HttpFetchResult.ResultException(ex);
|
||||||
|
case ContentTypeProbeResult.HttpError httpError:
|
||||||
|
logger.info(crawlerAuditMarker, "Probe result HTTP Error ({}) for {}", httpError.statusCode(), url);
|
||||||
|
return new HttpFetchResult.ResultException(new HttpException("HTTP status code " + httpError.statusCode() + ": " + httpError.message()));
|
||||||
|
case ContentTypeProbeResult.Redirect redirect:
|
||||||
|
logger.info(crawlerAuditMarker, "Probe result redirect for {} -> {}", url, redirect.location());
|
||||||
|
return new HttpFetchResult.ResultRedirect(redirect.location());
|
||||||
|
}
|
||||||
|
} catch (Exception ex) {
|
||||||
|
logger.warn("Failed to fetch {}", url, ex);
|
||||||
|
return new HttpFetchResult.ResultException(ex);
|
||||||
|
}
|
||||||
|
|
||||||
HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build());
|
|
||||||
|
|
||||||
if (result instanceof HttpFetchResult.ResultOk ok) {
|
|
||||||
if (ok.statusCode() == 429) {
|
|
||||||
throw new RateLimitException(Objects.requireNonNullElse(ok.header("Retry-After"), "1"));
|
|
||||||
}
|
}
|
||||||
if (ok.statusCode() == 304) {
|
|
||||||
return new HttpFetchResult.Result304Raw();
|
HttpGet request = new HttpGet(url.asURI());
|
||||||
}
|
request.addHeader("User-Agent", userAgentString);
|
||||||
if (ok.statusCode() == 200) {
|
request.addHeader("Accept-Encoding", "gzip");
|
||||||
return ok;
|
request.addHeader("Accept-Language", "en,*;q=0.5");
|
||||||
|
request.addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8");
|
||||||
|
|
||||||
|
contentTags.paint(request);
|
||||||
|
|
||||||
|
try (var sl = new SendLock()) {
|
||||||
|
Instant start = Instant.now();
|
||||||
|
HttpFetchResult result = warcRecorder.fetch(client, cookies, request);
|
||||||
|
|
||||||
|
Duration fetchDuration = Duration.between(start, Instant.now());
|
||||||
|
|
||||||
|
if (result instanceof HttpFetchResult.ResultOk ok) {
|
||||||
|
if (ok.statusCode() == 304) {
|
||||||
|
result = new HttpFetchResult.Result304Raw();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (result) {
|
||||||
|
case HttpFetchResult.ResultOk ok -> logger.info(crawlerAuditMarker, "Fetch result OK {} for {} ({} ms)", ok.statusCode(), url, fetchDuration.toMillis());
|
||||||
|
case HttpFetchResult.ResultRedirect redirect -> logger.info(crawlerAuditMarker, "Fetch result redirect: {} for {}", redirect.url(), url);
|
||||||
|
case HttpFetchResult.ResultNone none -> logger.info(crawlerAuditMarker, "Fetch result none for {}", url);
|
||||||
|
case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception for {}", url, ex.ex());
|
||||||
|
case HttpFetchResult.Result304Raw raw -> logger.info(crawlerAuditMarker, "Fetch result: 304 Raw for {}", url);
|
||||||
|
case HttpFetchResult.Result304ReplacedWithReference ref -> logger.info(crawlerAuditMarker, "Fetch result: 304 With reference for {}", url);
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
catch (Exception ex) {
|
||||||
|
logger.error(crawlerAuditMarker, "Fetch result exception for {}", url, ex);
|
||||||
|
|
||||||
|
return new HttpFetchResult.ResultException(ex);
|
||||||
|
}
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@@ -307,62 +530,66 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
|
private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException {
|
||||||
HttpRequest getRequest = HttpRequest.newBuilder()
|
HttpGet getRequest = new HttpGet(sitemapUrl.asURI());
|
||||||
.GET()
|
|
||||||
.uri(sitemapUrl.asURI())
|
|
||||||
.header("Accept-Encoding", "gzip")
|
|
||||||
.header("Accept", "text/*, */*;q=0.9")
|
|
||||||
.header("User-Agent", userAgentString)
|
|
||||||
.timeout(requestTimeout)
|
|
||||||
.build();
|
|
||||||
|
|
||||||
var response = client.send(getRequest, HttpResponse.BodyHandlers.ofInputStream());
|
getRequest.addHeader("User-Agent", userAgentString);
|
||||||
if (response.statusCode() != 200) {
|
getRequest.addHeader("Accept-Encoding", "gzip");
|
||||||
return new SitemapResult.SitemapError();
|
getRequest.addHeader("Accept", "text/*, */*;q=0.9");
|
||||||
|
getRequest.addHeader("User-Agent", userAgentString);
|
||||||
|
|
||||||
|
try (var sl = new SendLock()) {
|
||||||
|
return client.execute(getRequest, response -> {
|
||||||
|
try {
|
||||||
|
if (response.getCode() != 200) {
|
||||||
|
return new SitemapResult.SitemapError();
|
||||||
|
}
|
||||||
|
|
||||||
|
Document parsedSitemap = Jsoup.parse(
|
||||||
|
EntityUtils.toString(response.getEntity()),
|
||||||
|
sitemapUrl.toString(),
|
||||||
|
Parser.xmlParser()
|
||||||
|
);
|
||||||
|
|
||||||
|
if (parsedSitemap.childrenSize() == 0) {
|
||||||
|
return new SitemapResult.SitemapError();
|
||||||
|
}
|
||||||
|
|
||||||
|
String rootTagName = parsedSitemap.child(0).tagName();
|
||||||
|
|
||||||
|
return switch (rootTagName.toLowerCase()) {
|
||||||
|
case "sitemapindex" -> {
|
||||||
|
List<String> references = new ArrayList<>();
|
||||||
|
for (var locTag : parsedSitemap.getElementsByTag("loc")) {
|
||||||
|
references.add(locTag.text().trim());
|
||||||
|
}
|
||||||
|
yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
|
||||||
|
}
|
||||||
|
case "urlset" -> {
|
||||||
|
List<String> urls = new ArrayList<>();
|
||||||
|
for (var locTag : parsedSitemap.select("url > loc")) {
|
||||||
|
urls.add(locTag.text().trim());
|
||||||
|
}
|
||||||
|
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
|
||||||
|
}
|
||||||
|
case "rss", "atom" -> {
|
||||||
|
List<String> urls = new ArrayList<>();
|
||||||
|
for (var locTag : parsedSitemap.select("link, url")) {
|
||||||
|
urls.add(locTag.text().trim());
|
||||||
|
}
|
||||||
|
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
|
||||||
|
}
|
||||||
|
default -> new SitemapResult.SitemapError();
|
||||||
|
};
|
||||||
|
}
|
||||||
|
finally {
|
||||||
|
EntityUtils.consume(response.getEntity());
|
||||||
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
catch (Exception ex) {
|
||||||
try (InputStream inputStream = response.body()) {
|
logger.warn("Error while fetching sitemap {}: {} ({})", sitemapUrl, ex.getClass().getSimpleName(), ex.getMessage());
|
||||||
|
return new SitemapResult.SitemapError();
|
||||||
InputStream parserStream;
|
|
||||||
if (sitemapUrl.path.endsWith(".gz")) {
|
|
||||||
parserStream = new GZIPInputStream(inputStream);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
parserStream = inputStream;
|
|
||||||
}
|
|
||||||
|
|
||||||
Document parsedSitemap = Jsoup.parse(parserStream, "UTF-8", sitemapUrl.toString(), Parser.xmlParser());
|
|
||||||
if (parsedSitemap.childrenSize() == 0) {
|
|
||||||
return new SitemapResult.SitemapError();
|
|
||||||
}
|
|
||||||
|
|
||||||
String rootTagName = parsedSitemap.child(0).tagName();
|
|
||||||
|
|
||||||
return switch (rootTagName.toLowerCase()) {
|
|
||||||
case "sitemapindex" -> {
|
|
||||||
List<String> references = new ArrayList<>();
|
|
||||||
for (var locTag : parsedSitemap.getElementsByTag("loc")) {
|
|
||||||
references.add(locTag.text().trim());
|
|
||||||
}
|
|
||||||
yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
|
|
||||||
}
|
|
||||||
case "urlset" -> {
|
|
||||||
List<String> urls = new ArrayList<>();
|
|
||||||
for (var locTag : parsedSitemap.select("url > loc")) {
|
|
||||||
urls.add(locTag.text().trim());
|
|
||||||
}
|
|
||||||
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
|
|
||||||
}
|
|
||||||
case "rss", "atom" -> {
|
|
||||||
List<String> urls = new ArrayList<>();
|
|
||||||
for (var locTag : parsedSitemap.select("link, url")) {
|
|
||||||
urls.add(locTag.text().trim());
|
|
||||||
}
|
|
||||||
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
|
|
||||||
}
|
|
||||||
default -> new SitemapResult.SitemapError();
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -386,16 +613,14 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private Optional<SimpleRobotRules> fetchAndParseRobotsTxt(EdgeUrl url, WarcRecorder recorder) {
|
private Optional<SimpleRobotRules> fetchAndParseRobotsTxt(EdgeUrl url, WarcRecorder recorder) {
|
||||||
try {
|
try (var sl = new SendLock()) {
|
||||||
var getRequest = HttpRequest.newBuilder()
|
|
||||||
.GET()
|
|
||||||
.uri(url.asURI())
|
|
||||||
.header("Accept-Encoding", "gzip")
|
|
||||||
.header("Accept", "text/*, */*;q=0.9")
|
|
||||||
.header("User-Agent", userAgentString)
|
|
||||||
.timeout(requestTimeout);
|
|
||||||
|
|
||||||
HttpFetchResult result = recorder.fetch(client, getRequest.build());
|
HttpGet request = new HttpGet(url.asURI());
|
||||||
|
request.addHeader("User-Agent", userAgentString);
|
||||||
|
request.addHeader("Accept-Encoding", "gzip");
|
||||||
|
request.addHeader("Accept", "text/*, */*;q=0.9");
|
||||||
|
|
||||||
|
HttpFetchResult result = recorder.fetch(client, new DomainCookies(), request);
|
||||||
|
|
||||||
return DocumentBodyExtractor.asBytes(result).mapOpt((contentType, body) ->
|
return DocumentBodyExtractor.asBytes(result).mapOpt((contentType, body) ->
|
||||||
robotsParser.parseContent(url.toString(),
|
robotsParser.parseContent(url.toString(),
|
||||||
@@ -409,6 +634,57 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
|
||||||
|
return switch (exception) {
|
||||||
|
case SocketTimeoutException ste -> false;
|
||||||
|
case SSLException ssle -> false;
|
||||||
|
case UnknownHostException uhe -> false;
|
||||||
|
default -> executionCount <= 3;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean retryRequest(HttpResponse response, int executionCount, HttpContext context) {
|
||||||
|
return switch (response.getCode()) {
|
||||||
|
case 500, 503 -> executionCount <= 2;
|
||||||
|
case 429 -> executionCount <= 3;
|
||||||
|
default -> false;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TimeValue getRetryInterval(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
|
||||||
|
return TimeValue.ofSeconds(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TimeValue getRetryInterval(HttpResponse response, int executionCount, HttpContext context) {
|
||||||
|
|
||||||
|
int statusCode = response.getCode();
|
||||||
|
|
||||||
|
// Give 503 a bit more time
|
||||||
|
if (statusCode == 503) return TimeValue.ofSeconds(5);
|
||||||
|
|
||||||
|
if (statusCode == 429) {
|
||||||
|
// get the Retry-After header
|
||||||
|
String retryAfter = response.getFirstHeader("Retry-After").getValue();
|
||||||
|
if (retryAfter == null) {
|
||||||
|
return TimeValue.ofSeconds(2);
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
int retryAfterTime = Integer.parseInt(retryAfter);
|
||||||
|
retryAfterTime = Math.clamp(retryAfterTime, 1, 5);
|
||||||
|
|
||||||
|
return TimeValue.ofSeconds(retryAfterTime);
|
||||||
|
} catch (NumberFormatException e) {
|
||||||
|
logger.warn("Invalid Retry-After header: {}", retryAfter);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return TimeValue.ofSeconds(2);
|
||||||
|
}
|
||||||
|
|
||||||
public static class RateLimitException extends Exception {
|
public static class RateLimitException extends Exception {
|
||||||
private final String retryAfter;
|
private final String retryAfter;
|
||||||
@@ -429,5 +705,31 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
class SendLock implements AutoCloseable {
|
||||||
|
|
||||||
|
private static final Semaphore maxConcurrentRequests = new Semaphore(Integer.getInteger("crawler.maxConcurrentRequests", 512));
|
||||||
|
boolean closed = false;
|
||||||
|
|
||||||
|
public SendLock() {
|
||||||
|
maxConcurrentRequests.acquireUninterruptibly();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static <T> T wrapSend(HttpClient client, final ClassicHttpRequest request,
|
||||||
|
final HttpClientResponseHandler<? extends T> responseHandler) throws IOException {
|
||||||
|
try (var lock = new SendLock()) {
|
||||||
|
return client.execute(request, responseHandler);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() {
|
||||||
|
if (!closed) {
|
||||||
|
maxConcurrentRequests.release();
|
||||||
|
closed = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -1,15 +1,20 @@
|
|||||||
package nu.marginalia.crawl.fetcher.warc;
|
package nu.marginalia.crawl.fetcher.warc;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.io.input.BOMInputStream;
|
import org.apache.commons.io.input.BOMInputStream;
|
||||||
|
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||||
|
import org.apache.hc.core5.http.ClassicHttpResponse;
|
||||||
|
import org.apache.hc.core5.http.Header;
|
||||||
import org.netpreserve.jwarc.WarcTruncationReason;
|
import org.netpreserve.jwarc.WarcTruncationReason;
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import java.net.http.HttpHeaders;
|
|
||||||
import java.net.http.HttpResponse;
|
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.Map;
|
import java.time.Duration;
|
||||||
import java.util.zip.GZIPInputStream;
|
import java.time.Instant;
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
import static nu.marginalia.crawl.fetcher.warc.ErrorBuffer.suppressContentEncoding;
|
||||||
|
|
||||||
/** Input buffer for temporary storage of a HTTP response
|
/** Input buffer for temporary storage of a HTTP response
|
||||||
* This may be in-memory or on-disk, at the discretion of
|
* This may be in-memory or on-disk, at the discretion of
|
||||||
@@ -17,9 +22,9 @@ import java.util.zip.GZIPInputStream;
|
|||||||
* */
|
* */
|
||||||
public abstract class WarcInputBuffer implements AutoCloseable {
|
public abstract class WarcInputBuffer implements AutoCloseable {
|
||||||
protected WarcTruncationReason truncationReason = WarcTruncationReason.NOT_TRUNCATED;
|
protected WarcTruncationReason truncationReason = WarcTruncationReason.NOT_TRUNCATED;
|
||||||
protected HttpHeaders headers;
|
protected Header[] headers;
|
||||||
|
|
||||||
WarcInputBuffer(HttpHeaders headers) {
|
WarcInputBuffer(Header[] headers) {
|
||||||
this.headers = headers;
|
this.headers = headers;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -31,7 +36,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
|||||||
|
|
||||||
public final WarcTruncationReason truncationReason() { return truncationReason; }
|
public final WarcTruncationReason truncationReason() { return truncationReason; }
|
||||||
|
|
||||||
public final HttpHeaders headers() { return headers; }
|
public final Header[] headers() { return headers; }
|
||||||
|
|
||||||
/** Create a buffer for a response.
|
/** Create a buffer for a response.
|
||||||
* If the response is small and not compressed, it will be stored in memory.
|
* If the response is small and not compressed, it will be stored in memory.
|
||||||
@@ -39,34 +44,70 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
|||||||
* and suppressed from the headers.
|
* and suppressed from the headers.
|
||||||
* If an error occurs, a buffer will be created with no content and an error status.
|
* If an error occurs, a buffer will be created with no content and an error status.
|
||||||
*/
|
*/
|
||||||
static WarcInputBuffer forResponse(HttpResponse<InputStream> rsp) {
|
static WarcInputBuffer forResponse(ClassicHttpResponse response,
|
||||||
if (rsp == null)
|
HttpGet request,
|
||||||
|
Duration timeLimit) throws IOException {
|
||||||
|
if (response == null)
|
||||||
return new ErrorBuffer();
|
return new ErrorBuffer();
|
||||||
|
|
||||||
var headers = rsp.headers();
|
|
||||||
|
|
||||||
try (var is = rsp.body()) {
|
var entity = response.getEntity();
|
||||||
int contentLength = (int) headers.firstValueAsLong("Content-Length").orElse(-1L);
|
|
||||||
String contentEncoding = headers.firstValue("Content-Encoding").orElse(null);
|
|
||||||
|
|
||||||
if (contentEncoding == null && contentLength > 0 && contentLength < 8192) {
|
if (null == entity) {
|
||||||
|
return new ErrorBuffer();
|
||||||
|
}
|
||||||
|
|
||||||
|
Instant start = Instant.now();
|
||||||
|
InputStream is = null;
|
||||||
|
try {
|
||||||
|
is = entity.getContent();
|
||||||
|
long length = entity.getContentLength();
|
||||||
|
|
||||||
|
if (length > 0 && length < 8192) {
|
||||||
// If the content is small and not compressed, we can just read it into memory
|
// If the content is small and not compressed, we can just read it into memory
|
||||||
return new MemoryBuffer(headers, is, contentLength);
|
return new MemoryBuffer(response.getHeaders(), request, timeLimit, is, (int) length);
|
||||||
}
|
} else {
|
||||||
else {
|
|
||||||
// Otherwise, we unpack it into a file and read it from there
|
// Otherwise, we unpack it into a file and read it from there
|
||||||
return new FileBuffer(headers, is);
|
return new FileBuffer(response.getHeaders(), request, timeLimit, is);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (Exception ex) {
|
finally {
|
||||||
return new ErrorBuffer();
|
// We're required to consume the stream to avoid leaking connections,
|
||||||
|
// but we also don't want to get stuck on slow or malicious connections
|
||||||
|
// forever, so we set a time limit on this phase and call abort() if it's exceeded.
|
||||||
|
try {
|
||||||
|
while (is != null) {
|
||||||
|
// Consume some data
|
||||||
|
if (is.skip(65536) == 0) {
|
||||||
|
// Note that skip may return 0 if the stream is empty
|
||||||
|
// or for other unspecified reasons, so we need to check
|
||||||
|
// with read() as well to determine if the stream is done
|
||||||
|
if (is.read() == -1)
|
||||||
|
is = null;
|
||||||
|
}
|
||||||
|
// Check if the time limit has been exceeded
|
||||||
|
else if (Duration.between(start, Instant.now()).compareTo(timeLimit) > 0) {
|
||||||
|
request.abort();
|
||||||
|
is = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (IOException e) {
|
||||||
|
// Ignore the exception
|
||||||
|
}
|
||||||
|
finally {
|
||||||
|
// Close the input stream
|
||||||
|
IOUtils.closeQuietly(is);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Copy an input stream to an output stream, with a maximum size and time limit */
|
/** Copy an input stream to an output stream, with a maximum size and time limit */
|
||||||
protected void copy(InputStream is, OutputStream os) {
|
protected void copy(InputStream is, HttpGet request, OutputStream os, Duration timeLimit) {
|
||||||
long startTime = System.currentTimeMillis();
|
Instant start = Instant.now();
|
||||||
|
Instant timeout = start.plus(timeLimit);
|
||||||
long size = 0;
|
long size = 0;
|
||||||
|
|
||||||
byte[] buffer = new byte[8192];
|
byte[] buffer = new byte[8192];
|
||||||
@@ -76,24 +117,105 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
|||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
try {
|
try {
|
||||||
|
Duration remaining = Duration.between(Instant.now(), timeout);
|
||||||
|
if (remaining.isNegative()) {
|
||||||
|
truncationReason = WarcTruncationReason.TIME;
|
||||||
|
// Abort the request if the time limit is exceeded
|
||||||
|
// so we don't keep the connection open forever or are forced to consume
|
||||||
|
// the stream to the end
|
||||||
|
|
||||||
|
request.abort();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
int n = is.read(buffer);
|
int n = is.read(buffer);
|
||||||
|
|
||||||
if (n < 0) break;
|
if (n < 0) break;
|
||||||
size += n;
|
size += n;
|
||||||
os.write(buffer, 0, n);
|
|
||||||
|
|
||||||
if (size > WarcRecorder.MAX_SIZE) {
|
// Even if we've exceeded the max length,
|
||||||
|
// we keep consuming the stream up until the end or a timeout,
|
||||||
|
// as closing the stream means resetting the connection, and
|
||||||
|
// that's generally not desirable.
|
||||||
|
|
||||||
|
if (size < WarcRecorder.MAX_SIZE) {
|
||||||
|
os.write(buffer, 0, n);
|
||||||
|
}
|
||||||
|
else if (truncationReason != WarcTruncationReason.LENGTH) {
|
||||||
truncationReason = WarcTruncationReason.LENGTH;
|
truncationReason = WarcTruncationReason.LENGTH;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (System.currentTimeMillis() - startTime > WarcRecorder.MAX_TIME) {
|
|
||||||
truncationReason = WarcTruncationReason.TIME;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new RuntimeException(e);
|
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Takes a Content-Range header and checks if it is complete.
|
||||||
|
* A complete range is one that covers the entire resource.
|
||||||
|
* For example, "bytes 0-1023/1024" is a complete range.
|
||||||
|
* "bytes 0-1023/2048" and "bytes 0-1023/*" are not complete ranges.
|
||||||
|
*/
|
||||||
|
public boolean isRangeComplete(Header[] headers) {
|
||||||
|
// Find the Content-Range header
|
||||||
|
String contentRangeHeader = null;
|
||||||
|
for (var header : headers) {
|
||||||
|
if ("Content-Range".equalsIgnoreCase(header.getName())) {
|
||||||
|
contentRangeHeader = header.getValue();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return true if header is null or empty
|
||||||
|
if (contentRangeHeader == null || contentRangeHeader.isEmpty()) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Content-Range format: "bytes range-start-range-end/size"
|
||||||
|
// e.g., "bytes 0-1023/2048" or "bytes 0-1023/*"
|
||||||
|
|
||||||
|
// Get the part after "bytes "
|
||||||
|
String[] parts = contentRangeHeader.split(" ", 2);
|
||||||
|
if (parts.length < 2) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get the range and size parts (e.g., "0-1023/2048")
|
||||||
|
String rangeAndSize = parts[1];
|
||||||
|
String[] rangeAndSizeParts = rangeAndSize.split("/", 2);
|
||||||
|
if (rangeAndSizeParts.length < 2) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get the range (e.g., "0-1023")
|
||||||
|
String range = rangeAndSizeParts[0];
|
||||||
|
String[] rangeParts = range.split("-", 2);
|
||||||
|
if (rangeParts.length < 2) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get the size (e.g., "2048" or "*")
|
||||||
|
String size = rangeAndSizeParts[1];
|
||||||
|
|
||||||
|
// If size is "*", we don't know the total size, so return false
|
||||||
|
if ("*".equals(size)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse as long to handle large files
|
||||||
|
long rangeStart = Long.parseLong(rangeParts[0]);
|
||||||
|
long rangeEnd = Long.parseLong(rangeParts[1]);
|
||||||
|
long totalSize = Long.parseLong(size);
|
||||||
|
|
||||||
|
// Check if the range covers the entire resource
|
||||||
|
return rangeStart == 0 && rangeEnd == totalSize - 1;
|
||||||
|
|
||||||
|
} catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@@ -101,7 +223,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
|||||||
/** Pseudo-buffer for when we have an error */
|
/** Pseudo-buffer for when we have an error */
|
||||||
class ErrorBuffer extends WarcInputBuffer {
|
class ErrorBuffer extends WarcInputBuffer {
|
||||||
public ErrorBuffer() {
|
public ErrorBuffer() {
|
||||||
super(HttpHeaders.of(Map.of(), (k,v)->false));
|
super(new Header[0]);
|
||||||
|
|
||||||
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
||||||
}
|
}
|
||||||
@@ -118,17 +240,29 @@ class ErrorBuffer extends WarcInputBuffer {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void close() throws Exception {}
|
public void close() throws Exception {}
|
||||||
|
|
||||||
|
|
||||||
|
static Header[] suppressContentEncoding(Header[] headers) {
|
||||||
|
return Arrays.stream(headers).filter(header -> !"Content-Encoding".equalsIgnoreCase(header.getName())).toArray(Header[]::new);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Buffer for when we have the response in memory */
|
/** Buffer for when we have the response in memory */
|
||||||
class MemoryBuffer extends WarcInputBuffer {
|
class MemoryBuffer extends WarcInputBuffer {
|
||||||
byte[] data;
|
byte[] data;
|
||||||
public MemoryBuffer(HttpHeaders headers, InputStream responseStream, int size) {
|
public MemoryBuffer(Header[] headers, HttpGet request, Duration timeLimit, InputStream responseStream, int size) {
|
||||||
super(headers);
|
super(suppressContentEncoding(headers));
|
||||||
|
|
||||||
|
if (!isRangeComplete(headers)) {
|
||||||
|
truncationReason = WarcTruncationReason.LENGTH;
|
||||||
|
} else {
|
||||||
|
truncationReason = WarcTruncationReason.NOT_TRUNCATED;
|
||||||
|
}
|
||||||
|
|
||||||
var outputStream = new ByteArrayOutputStream(size);
|
var outputStream = new ByteArrayOutputStream(size);
|
||||||
|
|
||||||
copy(responseStream, outputStream);
|
copy(responseStream, request, outputStream, timeLimit);
|
||||||
|
|
||||||
data = outputStream.toByteArray();
|
data = outputStream.toByteArray();
|
||||||
}
|
}
|
||||||
@@ -152,40 +286,25 @@ class MemoryBuffer extends WarcInputBuffer {
|
|||||||
class FileBuffer extends WarcInputBuffer {
|
class FileBuffer extends WarcInputBuffer {
|
||||||
private final Path tempFile;
|
private final Path tempFile;
|
||||||
|
|
||||||
public FileBuffer(HttpHeaders headers, InputStream responseStream) throws IOException {
|
public FileBuffer(Header[] headers, HttpGet request, Duration timeLimit, InputStream responseStream) throws IOException {
|
||||||
super(suppressContentEncoding(headers));
|
super(suppressContentEncoding(headers));
|
||||||
|
|
||||||
|
if (!isRangeComplete(headers)) {
|
||||||
|
truncationReason = WarcTruncationReason.LENGTH;
|
||||||
|
} else {
|
||||||
|
truncationReason = WarcTruncationReason.NOT_TRUNCATED;
|
||||||
|
}
|
||||||
|
|
||||||
this.tempFile = Files.createTempFile("rsp", ".html");
|
this.tempFile = Files.createTempFile("rsp", ".html");
|
||||||
|
|
||||||
|
try (var out = Files.newOutputStream(tempFile)) {
|
||||||
if ("gzip".equalsIgnoreCase(headers.firstValue("Content-Encoding").orElse(""))) {
|
copy(responseStream, request, out, timeLimit);
|
||||||
try (var out = Files.newOutputStream(tempFile)) {
|
|
||||||
copy(new GZIPInputStream(responseStream), out);
|
|
||||||
}
|
|
||||||
catch (Exception ex) {
|
|
||||||
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else {
|
catch (Exception ex) {
|
||||||
try (var out = Files.newOutputStream(tempFile)) {
|
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
||||||
copy(responseStream, out);
|
|
||||||
}
|
|
||||||
catch (Exception ex) {
|
|
||||||
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static HttpHeaders suppressContentEncoding(HttpHeaders headers) {
|
|
||||||
return HttpHeaders.of(headers.map(), (k, v) -> {
|
|
||||||
if ("Content-Encoding".equalsIgnoreCase(k)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return !"Transfer-Encoding".equalsIgnoreCase(k);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public InputStream read() throws IOException {
|
public InputStream read() throws IOException {
|
||||||
return Files.newInputStream(tempFile);
|
return Files.newInputStream(tempFile);
|
||||||
}
|
}
|
||||||
|
@@ -1,6 +1,8 @@
|
|||||||
package nu.marginalia.crawl.fetcher.warc;
|
package nu.marginalia.crawl.fetcher.warc;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.hc.core5.http.ClassicHttpResponse;
|
||||||
|
import org.apache.hc.core5.http.Header;
|
||||||
|
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
import java.net.URLEncoder;
|
import java.net.URLEncoder;
|
||||||
@@ -8,6 +10,7 @@ import java.net.http.HttpClient;
|
|||||||
import java.net.http.HttpHeaders;
|
import java.net.http.HttpHeaders;
|
||||||
import java.net.http.HttpResponse;
|
import java.net.http.HttpResponse;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.time.Duration;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
@@ -17,7 +20,7 @@ import java.util.stream.Collectors;
|
|||||||
public class WarcProtocolReconstructor {
|
public class WarcProtocolReconstructor {
|
||||||
|
|
||||||
static String getHttpRequestString(String method,
|
static String getHttpRequestString(String method,
|
||||||
Map<String, List<String>> mainHeaders,
|
Header[] mainHeaders,
|
||||||
Map<String, List<String>> extraHeaders,
|
Map<String, List<String>> extraHeaders,
|
||||||
URI uri) {
|
URI uri) {
|
||||||
StringBuilder requestStringBuilder = new StringBuilder();
|
StringBuilder requestStringBuilder = new StringBuilder();
|
||||||
@@ -34,12 +37,13 @@ public class WarcProtocolReconstructor {
|
|||||||
|
|
||||||
Set<String> addedHeaders = new HashSet<>();
|
Set<String> addedHeaders = new HashSet<>();
|
||||||
|
|
||||||
mainHeaders.forEach((k, values) -> {
|
for (var header : mainHeaders) {
|
||||||
for (var value : values) {
|
String k = header.getName();
|
||||||
addedHeaders.add(k);
|
String v = header.getValue();
|
||||||
requestStringBuilder.append(capitalizeHeader(k)).append(": ").append(value).append("\r\n");
|
|
||||||
}
|
addedHeaders.add(k);
|
||||||
});
|
requestStringBuilder.append(capitalizeHeader(k)).append(": ").append(v).append("\r\n");
|
||||||
|
}
|
||||||
|
|
||||||
extraHeaders.forEach((k, values) -> {
|
extraHeaders.forEach((k, values) -> {
|
||||||
if (!addedHeaders.contains(k)) {
|
if (!addedHeaders.contains(k)) {
|
||||||
@@ -87,6 +91,12 @@ public class WarcProtocolReconstructor {
|
|||||||
return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
|
return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static String getResponseHeader(ClassicHttpResponse response, Duration responseDuration, long size) {
|
||||||
|
String headerString = getHeadersAsString(response.getHeaders(), responseDuration, size);
|
||||||
|
|
||||||
|
return response.getVersion().format() + " " + response.getCode() + " " + response.getReasonPhrase() + "\r\n" + headerString + "\r\n\r\n";
|
||||||
|
}
|
||||||
|
|
||||||
private static final Map<Integer, String> STATUS_CODE_MAP = Map.ofEntries(
|
private static final Map<Integer, String> STATUS_CODE_MAP = Map.ofEntries(
|
||||||
Map.entry(200, "OK"),
|
Map.entry(200, "OK"),
|
||||||
Map.entry(201, "Created"),
|
Map.entry(201, "Created"),
|
||||||
@@ -149,6 +159,39 @@ public class WarcProtocolReconstructor {
|
|||||||
return joiner.toString();
|
return joiner.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
static private String getHeadersAsString(Header[] headers, Duration responseDuration, long responseSize) {
|
||||||
|
StringJoiner joiner = new StringJoiner("\r\n");
|
||||||
|
|
||||||
|
for (var header : headers) {
|
||||||
|
String headerCapitalized = capitalizeHeader(header.getName());
|
||||||
|
|
||||||
|
// Omit pseudoheaders injected by the crawler itself
|
||||||
|
if (headerCapitalized.startsWith("X-Marginalia"))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
// Omit Transfer-Encoding and Content-Encoding headers
|
||||||
|
if (headerCapitalized.equals("Transfer-Encoding"))
|
||||||
|
continue;
|
||||||
|
if (headerCapitalized.equals("Content-Encoding"))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
|
||||||
|
// Since we're transparently decoding gzip, we need to update the Content-Length header
|
||||||
|
// to reflect the actual size of the response body. We'll do this at the end.
|
||||||
|
if (headerCapitalized.equals("Content-Length"))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
joiner.add(headerCapitalized + ": " + header.getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
joiner.add("X-Marginalia-Response-Time: " + responseDuration.toMillis());
|
||||||
|
joiner.add("Content-Length: " + responseSize);
|
||||||
|
|
||||||
|
return joiner.toString();
|
||||||
|
}
|
||||||
|
|
||||||
static private String getHeadersAsString(HttpHeaders headers, long responseSize) {
|
static private String getHeadersAsString(HttpHeaders headers, long responseSize) {
|
||||||
StringJoiner joiner = new StringJoiner("\r\n");
|
StringJoiner joiner = new StringJoiner("\r\n");
|
||||||
|
|
||||||
|
@@ -1,11 +1,16 @@
|
|||||||
package nu.marginalia.crawl.fetcher.warc;
|
package nu.marginalia.crawl.fetcher.warc;
|
||||||
|
|
||||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||||
import nu.marginalia.crawl.fetcher.Cookies;
|
import nu.marginalia.crawl.fetcher.DomainCookies;
|
||||||
|
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
||||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||||
|
import nu.marginalia.link_parser.LinkParser;
|
||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.body.HttpFetchResult;
|
import nu.marginalia.model.body.HttpFetchResult;
|
||||||
|
import org.apache.hc.client5.http.classic.HttpClient;
|
||||||
|
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||||
|
import org.apache.hc.core5.http.NameValuePair;
|
||||||
import org.jetbrains.annotations.Nullable;
|
import org.jetbrains.annotations.Nullable;
|
||||||
import org.netpreserve.jwarc.*;
|
import org.netpreserve.jwarc.*;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
@@ -14,10 +19,9 @@ import org.slf4j.LoggerFactory;
|
|||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.net.InetAddress;
|
import java.net.InetAddress;
|
||||||
|
import java.net.SocketTimeoutException;
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
import java.net.http.HttpClient;
|
|
||||||
import java.net.http.HttpResponse;
|
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
@@ -37,7 +41,7 @@ public class WarcRecorder implements AutoCloseable {
|
|||||||
static final int MAX_TIME = 30_000;
|
static final int MAX_TIME = 30_000;
|
||||||
|
|
||||||
/** Maximum (decompressed) size we'll save */
|
/** Maximum (decompressed) size we'll save */
|
||||||
static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 10 * 1024 * 1024);
|
static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 32 * 1024 * 1024);
|
||||||
|
|
||||||
private final WarcWriter writer;
|
private final WarcWriter writer;
|
||||||
private final Path warcFile;
|
private final Path warcFile;
|
||||||
@@ -48,22 +52,15 @@ public class WarcRecorder implements AutoCloseable {
|
|||||||
// Affix a version string in case we need to change the format in the future
|
// Affix a version string in case we need to change the format in the future
|
||||||
// in some way
|
// in some way
|
||||||
private final String warcRecorderVersion = "1.0";
|
private final String warcRecorderVersion = "1.0";
|
||||||
private final Cookies cookies;
|
private final LinkParser linkParser = new LinkParser();
|
||||||
/**
|
/**
|
||||||
* Create a new WarcRecorder that will write to the given file
|
* Create a new WarcRecorder that will write to the given file
|
||||||
*
|
*
|
||||||
* @param warcFile The file to write to
|
* @param warcFile The file to write to
|
||||||
*/
|
*/
|
||||||
public WarcRecorder(Path warcFile, HttpFetcherImpl fetcher) throws IOException {
|
public WarcRecorder(Path warcFile) throws IOException {
|
||||||
this.warcFile = warcFile;
|
this.warcFile = warcFile;
|
||||||
this.writer = new WarcWriter(warcFile);
|
this.writer = new WarcWriter(warcFile);
|
||||||
this.cookies = fetcher.getCookies();
|
|
||||||
}
|
|
||||||
|
|
||||||
public WarcRecorder(Path warcFile, Cookies cookies) throws IOException {
|
|
||||||
this.warcFile = warcFile;
|
|
||||||
this.writer = new WarcWriter(warcFile);
|
|
||||||
this.cookies = cookies;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -73,138 +70,181 @@ public class WarcRecorder implements AutoCloseable {
|
|||||||
public WarcRecorder() throws IOException {
|
public WarcRecorder() throws IOException {
|
||||||
this.warcFile = Files.createTempFile("warc", ".warc.gz");
|
this.warcFile = Files.createTempFile("warc", ".warc.gz");
|
||||||
this.writer = new WarcWriter(this.warcFile);
|
this.writer = new WarcWriter(this.warcFile);
|
||||||
this.cookies = new Cookies();
|
|
||||||
|
|
||||||
temporaryFile = true;
|
temporaryFile = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
public HttpFetchResult fetch(HttpClient client,
|
public HttpFetchResult fetch(HttpClient client,
|
||||||
java.net.http.HttpRequest request)
|
DomainCookies cookies,
|
||||||
|
HttpGet request)
|
||||||
throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
|
throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
|
||||||
{
|
{
|
||||||
URI requestUri = request.uri();
|
return fetch(client, cookies, request, Duration.ofMillis(MAX_TIME));
|
||||||
|
}
|
||||||
|
|
||||||
|
public HttpFetchResult fetch(HttpClient client,
|
||||||
|
DomainCookies cookies,
|
||||||
|
HttpGet request,
|
||||||
|
Duration timeout)
|
||||||
|
throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
|
||||||
|
{
|
||||||
|
URI requestUri = request.getUri();
|
||||||
|
|
||||||
WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
|
WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
|
||||||
WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
|
WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
|
||||||
|
|
||||||
Instant date = Instant.now();
|
Instant requestDate = Instant.now();
|
||||||
|
|
||||||
// Not entirely sure why we need to do this, but keeping it due to Chesterton's Fence
|
// Not entirely sure why we need to do this, but keeping it due to Chesterton's Fence
|
||||||
Map<String, List<String>> extraHeaders = new HashMap<>(request.headers().map());
|
Map<String, List<String>> extraHeaders = new HashMap<>(request.getHeaders().length);
|
||||||
|
|
||||||
HttpResponse<InputStream> response;
|
// Inject a range header to attempt to limit the size of the response
|
||||||
|
// to the maximum size we want to store, if the server supports it.
|
||||||
|
request.addHeader("Range", "bytes=0-"+MAX_SIZE);
|
||||||
|
cookies.paintRequest(request);
|
||||||
try {
|
try {
|
||||||
response = client.send(request, java.net.http.HttpResponse.BodyHandlers.ofInputStream());
|
return client.execute(request,response -> {
|
||||||
}
|
|
||||||
catch (Exception ex) {
|
try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, request, timeout);
|
||||||
logger.warn("Failed to fetch URL {}: {}", requestUri, ex.getMessage());
|
InputStream inputStream = inputBuffer.read()) {
|
||||||
|
|
||||||
|
Instant responseDate = Instant.now();
|
||||||
|
|
||||||
|
cookies.updateCookieStore(response);
|
||||||
|
|
||||||
|
// Build and write the request
|
||||||
|
|
||||||
|
WarcDigestBuilder requestDigestBuilder = new WarcDigestBuilder();
|
||||||
|
|
||||||
|
byte[] httpRequestString = WarcProtocolReconstructor
|
||||||
|
.getHttpRequestString(
|
||||||
|
request.getMethod(),
|
||||||
|
request.getHeaders(),
|
||||||
|
extraHeaders,
|
||||||
|
requestUri)
|
||||||
|
.getBytes();
|
||||||
|
|
||||||
|
requestDigestBuilder.update(httpRequestString);
|
||||||
|
|
||||||
|
WarcRequest warcRequest = new WarcRequest.Builder(requestUri)
|
||||||
|
.blockDigest(requestDigestBuilder.build())
|
||||||
|
.date(requestDate)
|
||||||
|
.body(MediaType.HTTP_REQUEST, httpRequestString)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
|
||||||
|
writer.write(warcRequest);
|
||||||
|
|
||||||
|
|
||||||
|
if (cookies.hasCookies()) {
|
||||||
|
response.addHeader("X-Has-Cookies", 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response,
|
||||||
|
Duration.between(requestDate, responseDate),
|
||||||
|
inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
|
||||||
|
|
||||||
|
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);
|
||||||
|
|
||||||
|
responseDataBuffer.put(responseHeaders);
|
||||||
|
responseDataBuffer.updateDigest(responseDigestBuilder, 0, responseHeaders.length);
|
||||||
|
|
||||||
|
int dataStart = responseDataBuffer.pos();
|
||||||
|
|
||||||
|
for (;;) {
|
||||||
|
int remainingLength = responseDataBuffer.remaining();
|
||||||
|
if (remainingLength == 0)
|
||||||
|
break;
|
||||||
|
|
||||||
|
int startPos = responseDataBuffer.pos();
|
||||||
|
|
||||||
|
int n = responseDataBuffer.readFrom(inputStream, remainingLength);
|
||||||
|
if (n < 0)
|
||||||
|
break;
|
||||||
|
|
||||||
|
responseDataBuffer.updateDigest(responseDigestBuilder, startPos, n);
|
||||||
|
responseDataBuffer.updateDigest(payloadDigestBuilder, startPos, n);
|
||||||
|
}
|
||||||
|
|
||||||
|
// with some http client libraries, that resolve redirects transparently, this might be different
|
||||||
|
// from the request URI, but currently we don't have transparent redirect resolution so it's always
|
||||||
|
// the same (though let's keep the variables separate in case this changes)
|
||||||
|
final URI responseUri = requestUri;
|
||||||
|
|
||||||
|
WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
|
||||||
|
.blockDigest(responseDigestBuilder.build())
|
||||||
|
.date(responseDate)
|
||||||
|
.concurrentTo(warcRequest.id())
|
||||||
|
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
|
||||||
|
|
||||||
|
InetAddress inetAddress = InetAddress.getByName(responseUri.getHost());
|
||||||
|
responseBuilder.ipAddress(inetAddress);
|
||||||
|
responseBuilder.payloadDigest(payloadDigestBuilder.build());
|
||||||
|
responseBuilder.truncated(inputBuffer.truncationReason());
|
||||||
|
|
||||||
|
// Build and write the response
|
||||||
|
|
||||||
|
var warcResponse = responseBuilder.build();
|
||||||
|
warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
|
||||||
|
writer.write(warcResponse);
|
||||||
|
|
||||||
|
if (Duration.between(requestDate, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
|
||||||
|
&& inputBuffer.size() < 2048
|
||||||
|
&& !requestUri.getPath().endsWith("robots.txt")) // don't bail on robots.txt
|
||||||
|
{
|
||||||
|
// Fast detection and mitigation of crawler traps that respond with slow
|
||||||
|
// small responses, with a high branching factor
|
||||||
|
|
||||||
|
// Note we bail *after* writing the warc records, this will effectively only
|
||||||
|
// prevent link extraction from the document.
|
||||||
|
|
||||||
|
logger.warn("URL {} took too long to fetch ({}s) and was too small for the effort ({}b)",
|
||||||
|
requestUri,
|
||||||
|
Duration.between(requestDate, Instant.now()).getSeconds(),
|
||||||
|
inputBuffer.size()
|
||||||
|
);
|
||||||
|
|
||||||
|
return new HttpFetchResult.ResultException(new IOException("Likely crawler trap"));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (response.getCode() == 301 || response.getCode() == 302 || response.getCode() == 307) {
|
||||||
|
// If the server responds with a redirect, we need to
|
||||||
|
// update the request URI to the new location
|
||||||
|
EdgeUrl redirectLocation = Optional.ofNullable(response.getFirstHeader("Location"))
|
||||||
|
.map(NameValuePair::getValue)
|
||||||
|
.flatMap(location -> linkParser.parseLink(new EdgeUrl(requestUri), location))
|
||||||
|
.orElse(null);
|
||||||
|
if (redirectLocation != null) {
|
||||||
|
// If the redirect location is a valid URL, hand it back to the caller as a redirect result
|
||||||
|
return new HttpFetchResult.ResultRedirect(redirectLocation);
|
||||||
|
} else {
|
||||||
|
// If the redirect location is not a valid URL, report it to the caller as an exception result
|
||||||
|
return new HttpFetchResult.ResultException(new IOException("Invalid redirect location: " + response.getFirstHeader("Location")));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
return new HttpFetchResult.ResultOk(responseUri,
|
||||||
|
response.getCode(),
|
||||||
|
inputBuffer.headers(),
|
||||||
|
inetAddress.getHostAddress(),
|
||||||
|
responseDataBuffer.data,
|
||||||
|
dataStart,
|
||||||
|
responseDataBuffer.length() - dataStart);
|
||||||
|
} catch (Exception ex) {
|
||||||
|
flagAsError(new EdgeUrl(requestUri), ex); // write a WARC record to indicate the error
|
||||||
|
logger.warn("Failed to fetch URL {}: {}", requestUri, ex.getMessage());
|
||||||
|
return new HttpFetchResult.ResultException(ex);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
// the client.execute() method will throw an exception if the request times out
|
||||||
|
// or on other IO exceptions, so we need to catch those here as well as having
|
||||||
|
// exception handling in the response handler
|
||||||
|
} catch (SocketTimeoutException ex) {
|
||||||
|
flagAsTimeout(new EdgeUrl(requestUri)); // write a WARC record to indicate the timeout
|
||||||
return new HttpFetchResult.ResultException(ex);
|
return new HttpFetchResult.ResultException(ex);
|
||||||
}
|
} catch (IOException ex) {
|
||||||
|
flagAsError(new EdgeUrl(requestUri), ex); // write a WARC record to indicate the error
|
||||||
|
|
||||||
try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response);
|
|
||||||
InputStream inputStream = inputBuffer.read())
|
|
||||||
{
|
|
||||||
if (cookies.hasCookies()) {
|
|
||||||
extraHeaders.put("X-Has-Cookies", List.of("1"));
|
|
||||||
}
|
|
||||||
|
|
||||||
byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
|
|
||||||
|
|
||||||
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);
|
|
||||||
|
|
||||||
responseDataBuffer.put(responseHeaders);
|
|
||||||
responseDataBuffer.updateDigest(responseDigestBuilder, 0, responseHeaders.length);
|
|
||||||
|
|
||||||
int dataStart = responseDataBuffer.pos();
|
|
||||||
|
|
||||||
for (;;) {
|
|
||||||
int remainingLength = responseDataBuffer.remaining();
|
|
||||||
if (remainingLength == 0)
|
|
||||||
break;
|
|
||||||
|
|
||||||
int startPos = responseDataBuffer.pos();
|
|
||||||
|
|
||||||
int n = responseDataBuffer.readFrom(inputStream, remainingLength);
|
|
||||||
if (n < 0)
|
|
||||||
break;
|
|
||||||
|
|
||||||
responseDataBuffer.updateDigest(responseDigestBuilder, startPos, n);
|
|
||||||
responseDataBuffer.updateDigest(payloadDigestBuilder, startPos, n);
|
|
||||||
}
|
|
||||||
|
|
||||||
// It looks like this might be the same as requestUri, but it's not;
|
|
||||||
// it's the URI after resolving redirects.
|
|
||||||
final URI responseUri = response.uri();
|
|
||||||
|
|
||||||
WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
|
|
||||||
.blockDigest(responseDigestBuilder.build())
|
|
||||||
.date(date)
|
|
||||||
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
|
|
||||||
|
|
||||||
InetAddress inetAddress = InetAddress.getByName(responseUri.getHost());
|
|
||||||
responseBuilder.ipAddress(inetAddress);
|
|
||||||
responseBuilder.payloadDigest(payloadDigestBuilder.build());
|
|
||||||
responseBuilder.truncated(inputBuffer.truncationReason());
|
|
||||||
|
|
||||||
// Build and write the response
|
|
||||||
|
|
||||||
var warcResponse = responseBuilder.build();
|
|
||||||
warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
|
|
||||||
writer.write(warcResponse);
|
|
||||||
|
|
||||||
// Build and write the request
|
|
||||||
|
|
||||||
WarcDigestBuilder requestDigestBuilder = new WarcDigestBuilder();
|
|
||||||
|
|
||||||
byte[] httpRequestString = WarcProtocolReconstructor
|
|
||||||
.getHttpRequestString(
|
|
||||||
response.request().method(),
|
|
||||||
response.request().headers().map(),
|
|
||||||
extraHeaders,
|
|
||||||
requestUri)
|
|
||||||
.getBytes();
|
|
||||||
|
|
||||||
requestDigestBuilder.update(httpRequestString);
|
|
||||||
|
|
||||||
WarcRequest warcRequest = new WarcRequest.Builder(requestUri)
|
|
||||||
.blockDigest(requestDigestBuilder.build())
|
|
||||||
.date(date)
|
|
||||||
.body(MediaType.HTTP_REQUEST, httpRequestString)
|
|
||||||
.concurrentTo(warcResponse.id())
|
|
||||||
.build();
|
|
||||||
|
|
||||||
warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
|
|
||||||
writer.write(warcRequest);
|
|
||||||
|
|
||||||
if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
|
|
||||||
&& inputBuffer.size() < 2048
|
|
||||||
&& !request.uri().getPath().endsWith("robots.txt")) // don't bail on robots.txt
|
|
||||||
{
|
|
||||||
// Fast detection and mitigation of crawler traps that respond with slow
|
|
||||||
// small responses, with a high branching factor
|
|
||||||
|
|
||||||
// Note we bail *after* writing the warc records, this will effectively only
|
|
||||||
// prevent link extraction from the document.
|
|
||||||
|
|
||||||
logger.warn("URL {} took too long to fetch ({}s) and was too small for the effort ({}b)",
|
|
||||||
requestUri,
|
|
||||||
Duration.between(date, Instant.now()).getSeconds(),
|
|
||||||
inputBuffer.size()
|
|
||||||
);
|
|
||||||
|
|
||||||
return new HttpFetchResult.ResultException(new IOException("Likely crawler trap"));
|
|
||||||
}
|
|
||||||
|
|
||||||
return new HttpFetchResult.ResultOk(responseUri,
|
|
||||||
response.statusCode(),
|
|
||||||
inputBuffer.headers(),
|
|
||||||
inetAddress.getHostAddress(),
|
|
||||||
responseDataBuffer.data,
|
|
||||||
dataStart,
|
|
||||||
responseDataBuffer.length() - dataStart);
|
|
||||||
}
|
|
||||||
catch (Exception ex) {
|
|
||||||
logger.warn("Failed to fetch URL {}: {}", requestUri, ex.getMessage());
|
logger.warn("Failed to fetch URL {}: {}", requestUri, ex.getMessage());
|
||||||
return new HttpFetchResult.ResultException(ex);
|
return new HttpFetchResult.ResultException(ex);
|
||||||
}
|
}
|
||||||
@@ -214,7 +254,7 @@ public class WarcRecorder implements AutoCloseable {
|
|||||||
writer.write(item);
|
writer.write(item);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags contentTags) {
|
private void saveOldResponse(EdgeUrl url, DomainCookies domainCookies, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags contentTags) {
|
||||||
try {
|
try {
|
||||||
WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
|
WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
|
||||||
WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
|
WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
|
||||||
@@ -275,7 +315,7 @@ public class WarcRecorder implements AutoCloseable {
|
|||||||
.date(Instant.now())
|
.date(Instant.now())
|
||||||
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
|
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
|
||||||
|
|
||||||
if (cookies.hasCookies()) {
|
if (domainCookies.hasCookies() || (headers != null && headers.contains("Set-Cookie:"))) {
|
||||||
builder.addHeader("X-Has-Cookies", "1");
|
builder.addHeader("X-Has-Cookies", "1");
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -295,8 +335,8 @@ public class WarcRecorder implements AutoCloseable {
|
|||||||
* an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this
|
* an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this
|
||||||
* scenario we want to record the data as it was in the previous crawl, but not re-fetch it.
|
* scenario we want to record the data as it was in the previous crawl, but not re-fetch it.
|
||||||
*/
|
*/
|
||||||
public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags ctags) {
|
public void writeReferenceCopy(EdgeUrl url, DomainCookies cookies, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags ctags) {
|
||||||
saveOldResponse(url, contentType, statusCode, documentBody, headers, ctags);
|
saveOldResponse(url, cookies, contentType, statusCode, documentBody, headers, ctags);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void writeWarcinfoHeader(String ip, EdgeDomain domain, HttpFetcherImpl.DomainProbeResult result) throws IOException {
|
public void writeWarcinfoHeader(String ip, EdgeDomain domain, HttpFetcherImpl.DomainProbeResult result) throws IOException {
|
||||||
@@ -316,6 +356,9 @@ public class WarcRecorder implements AutoCloseable {
|
|||||||
case HttpFetcherImpl.DomainProbeResult.Ok ok:
|
case HttpFetcherImpl.DomainProbeResult.Ok ok:
|
||||||
fields.put("X-WARC-Probe-Status", List.of("OK"));
|
fields.put("X-WARC-Probe-Status", List.of("OK"));
|
||||||
break;
|
break;
|
||||||
|
case HttpFetcher.DomainProbeResult.RedirectSameDomain_Internal redirectSameDomain:
|
||||||
|
fields.put("X-WARC-Probe-Status", List.of("REDIR-INTERNAL"));
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
var warcinfo = new Warcinfo.Builder()
|
var warcinfo = new Warcinfo.Builder()
|
||||||
|
@@ -3,6 +3,7 @@ package nu.marginalia.crawl.logic;
|
|||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Optional;
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
import java.util.concurrent.Semaphore;
|
import java.util.concurrent.Semaphore;
|
||||||
|
|
||||||
@@ -19,8 +20,22 @@ public class DomainLocks {
|
|||||||
* and may be held by another thread. The caller is responsible for locking and releasing the lock.
|
* and may be held by another thread. The caller is responsible for locking and releasing the lock.
|
||||||
*/
|
*/
|
||||||
public DomainLock lockDomain(EdgeDomain domain) throws InterruptedException {
|
public DomainLock lockDomain(EdgeDomain domain) throws InterruptedException {
|
||||||
return new DomainLock(domain.toString(),
|
var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
|
||||||
locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits));
|
|
||||||
|
sem.acquire();
|
||||||
|
|
||||||
|
return new DomainLock(sem);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Optional<DomainLock> tryLockDomain(EdgeDomain domain) {
|
||||||
|
var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
|
||||||
|
if (sem.tryAcquire(1)) {
|
||||||
|
return Optional.of(new DomainLock(sem));
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// We don't have a lock, so we return an empty optional
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private Semaphore defaultPermits(String topDomain) {
|
private Semaphore defaultPermits(String topDomain) {
|
||||||
@@ -28,23 +43,27 @@ public class DomainLocks {
|
|||||||
return new Semaphore(16);
|
return new Semaphore(16);
|
||||||
if (topDomain.equals("blogspot.com"))
|
if (topDomain.equals("blogspot.com"))
|
||||||
return new Semaphore(8);
|
return new Semaphore(8);
|
||||||
|
if (topDomain.equals("tumblr.com"))
|
||||||
|
return new Semaphore(8);
|
||||||
if (topDomain.equals("neocities.org"))
|
if (topDomain.equals("neocities.org"))
|
||||||
return new Semaphore(4);
|
return new Semaphore(8);
|
||||||
if (topDomain.equals("github.io"))
|
if (topDomain.equals("github.io"))
|
||||||
return new Semaphore(4);
|
return new Semaphore(8);
|
||||||
|
|
||||||
|
// Substack really dislikes broad-scale crawlers, so we need to be careful
|
||||||
|
// to not get blocked.
|
||||||
if (topDomain.equals("substack.com")) {
|
if (topDomain.equals("substack.com")) {
|
||||||
return new Semaphore(1);
|
return new Semaphore(1);
|
||||||
}
|
}
|
||||||
if (topDomain.endsWith(".edu")) {
|
|
||||||
return new Semaphore(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
return new Semaphore(2);
|
return new Semaphore(2);
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean canLock(EdgeDomain domain) {
|
/** Returns true if the domain is lockable, i.e. if it is not already locked by another thread.
|
||||||
|
* (this is just a hint, and does not guarantee that the domain is actually lockable any time
|
||||||
|
* after this method returns true)
|
||||||
|
*/
|
||||||
|
public boolean isLockableHint(EdgeDomain domain) {
|
||||||
Semaphore sem = locks.get(domain.topDomain.toLowerCase());
|
Semaphore sem = locks.get(domain.topDomain.toLowerCase());
|
||||||
if (null == sem)
|
if (null == sem)
|
||||||
return true;
|
return true;
|
||||||
@@ -53,22 +72,16 @@ public class DomainLocks {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static class DomainLock implements AutoCloseable {
|
public static class DomainLock implements AutoCloseable {
|
||||||
private final String domainName;
|
|
||||||
private final Semaphore semaphore;
|
private final Semaphore semaphore;
|
||||||
|
|
||||||
DomainLock(String domainName, Semaphore semaphore) throws InterruptedException {
|
DomainLock(Semaphore semaphore) {
|
||||||
this.domainName = domainName;
|
|
||||||
this.semaphore = semaphore;
|
this.semaphore = semaphore;
|
||||||
|
|
||||||
Thread.currentThread().setName("crawling:" + domainName + " [await domain lock]");
|
|
||||||
semaphore.acquire();
|
|
||||||
Thread.currentThread().setName("crawling:" + domainName);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void close() throws Exception {
|
public void close() throws Exception {
|
||||||
semaphore.release();
|
semaphore.release();
|
||||||
Thread.currentThread().setName("crawling:" + domainName + " [wrapping up]");
|
Thread.currentThread().setName("[idle]");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -3,6 +3,7 @@ package nu.marginalia.crawl.retreival;
|
|||||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||||
|
|
||||||
import java.time.Duration;
|
import java.time.Duration;
|
||||||
|
import java.util.concurrent.ThreadLocalRandom;
|
||||||
|
|
||||||
import static java.lang.Math.max;
|
import static java.lang.Math.max;
|
||||||
import static java.lang.Math.min;
|
import static java.lang.Math.min;
|
||||||
@@ -50,15 +51,20 @@ public class CrawlDelayTimer {
|
|||||||
waitFetchDelay(0);
|
waitFetchDelay(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void waitFetchDelay(Duration spentTime) {
|
||||||
|
waitFetchDelay(spentTime.toMillis());
|
||||||
|
}
|
||||||
|
|
||||||
public void waitFetchDelay(long spentTime) {
|
public void waitFetchDelay(long spentTime) {
|
||||||
long sleepTime = delayTime;
|
long sleepTime = delayTime;
|
||||||
|
|
||||||
|
long jitter = ThreadLocalRandom.current().nextLong(0, 150);
|
||||||
try {
|
try {
|
||||||
if (sleepTime >= 1) {
|
if (sleepTime >= 1) {
|
||||||
if (spentTime > sleepTime)
|
if (spentTime > sleepTime)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
Thread.sleep(min(sleepTime - spentTime, 5000));
|
Thread.sleep(min(sleepTime - spentTime, 5000) + jitter);
|
||||||
} else {
|
} else {
|
||||||
// When no crawl delay is specified, lean toward twice the fetch+process time,
|
// When no crawl delay is specified, lean toward twice the fetch+process time,
|
||||||
// within sane limits. This means slower servers get slower crawling, and faster
|
// within sane limits. This means slower servers get slower crawling, and faster
|
||||||
@@ -71,17 +77,17 @@ public class CrawlDelayTimer {
|
|||||||
if (spentTime > sleepTime)
|
if (spentTime > sleepTime)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
Thread.sleep(sleepTime - spentTime);
|
Thread.sleep(sleepTime - spentTime + jitter);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (slowDown) {
|
if (slowDown) {
|
||||||
// Additional delay when the server is signalling it wants slower requests
|
// Additional delay when the server is signalling it wants slower requests
|
||||||
Thread.sleep(DEFAULT_CRAWL_DELAY_MIN_MS);
|
Thread.sleep(DEFAULT_CRAWL_DELAY_MIN_MS + jitter);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (InterruptedException e) {
|
catch (InterruptedException e) {
|
||||||
Thread.currentThread().interrupt();
|
Thread.currentThread().interrupt();
|
||||||
throw new RuntimeException();
|
throw new RuntimeException("Interrupted", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -0,0 +1,42 @@
|
|||||||
|
package nu.marginalia.crawl.retreival;
|
||||||
|
|
||||||
|
import java.time.Duration;
|
||||||
|
import java.time.Instant;
|
||||||
|
import java.util.concurrent.Semaphore;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class is used to stagger the rate at which connections are created.
|
||||||
|
* <p></p>
|
||||||
|
* It is used to ensure that we do not create too many connections at once,
|
||||||
|
* which can lead to network congestion and other issues. Since the connections
|
||||||
|
* tend to be very long-lived, we can afford to wait a bit before creating the next
|
||||||
|
* even if it adds a bit of build-up time when the crawl starts.
|
||||||
|
*/
|
||||||
|
public class CrawlerConnectionThrottle {
|
||||||
|
private Instant lastCrawlStart = Instant.EPOCH;
|
||||||
|
private final Semaphore launchSemaphore = new Semaphore(1);
|
||||||
|
|
||||||
|
private final Duration launchInterval;
|
||||||
|
|
||||||
|
public CrawlerConnectionThrottle(Duration launchInterval) {
|
||||||
|
this.launchInterval = launchInterval;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void waitForConnectionPermission() throws InterruptedException {
|
||||||
|
try {
|
||||||
|
launchSemaphore.acquire();
|
||||||
|
Instant nextPermittedLaunch = lastCrawlStart.plus(launchInterval);
|
||||||
|
|
||||||
|
if (nextPermittedLaunch.isAfter(Instant.now())) {
|
||||||
|
long waitTime = Duration.between(Instant.now(), nextPermittedLaunch).toMillis();
|
||||||
|
TimeUnit.MILLISECONDS.sleep(waitTime);
|
||||||
|
}
|
||||||
|
|
||||||
|
lastCrawlStart = Instant.now();
|
||||||
|
}
|
||||||
|
finally {
|
||||||
|
launchSemaphore.release();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@@ -6,8 +6,8 @@ import nu.marginalia.contenttype.ContentType;
|
|||||||
import nu.marginalia.crawl.CrawlerMain;
|
import nu.marginalia.crawl.CrawlerMain;
|
||||||
import nu.marginalia.crawl.DomainStateDb;
|
import nu.marginalia.crawl.DomainStateDb;
|
||||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||||
|
import nu.marginalia.crawl.fetcher.DomainCookies;
|
||||||
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
||||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
|
||||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||||
import nu.marginalia.crawl.logic.LinkFilterSelector;
|
import nu.marginalia.crawl.logic.LinkFilterSelector;
|
||||||
import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
|
import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
|
||||||
@@ -26,14 +26,16 @@ import java.io.IOException;
|
|||||||
import java.net.InetAddress;
|
import java.net.InetAddress;
|
||||||
import java.net.UnknownHostException;
|
import java.net.UnknownHostException;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
import java.time.Duration;
|
||||||
|
import java.time.Instant;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Objects;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
public class CrawlerRetreiver implements AutoCloseable {
|
public class CrawlerRetreiver implements AutoCloseable {
|
||||||
|
|
||||||
private static final int MAX_ERRORS = 20;
|
private static final int MAX_ERRORS = 20;
|
||||||
private static final int HTTP_429_RETRY_LIMIT = 1; // Retry 429s once
|
|
||||||
|
|
||||||
private final HttpFetcher fetcher;
|
private final HttpFetcher fetcher;
|
||||||
|
|
||||||
@@ -50,6 +52,11 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
private final DomainStateDb domainStateDb;
|
private final DomainStateDb domainStateDb;
|
||||||
private final WarcRecorder warcRecorder;
|
private final WarcRecorder warcRecorder;
|
||||||
private final CrawlerRevisitor crawlerRevisitor;
|
private final CrawlerRevisitor crawlerRevisitor;
|
||||||
|
private final DomainCookies cookies = new DomainCookies();
|
||||||
|
|
||||||
|
private static final CrawlerConnectionThrottle connectionThrottle = new CrawlerConnectionThrottle(
|
||||||
|
Duration.ofSeconds(1) // pace the connections to avoid network congestion at startup
|
||||||
|
);
|
||||||
|
|
||||||
int errorCount = 0;
|
int errorCount = 0;
|
||||||
|
|
||||||
@@ -90,6 +97,11 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
|
|
||||||
public int crawlDomain(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
|
public int crawlDomain(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
|
||||||
try (oldCrawlData) {
|
try (oldCrawlData) {
|
||||||
|
|
||||||
|
// Wait for permission to open a connection to avoid network congestion
|
||||||
|
// from hundreds/thousands of TCP handshakes
|
||||||
|
connectionThrottle.waitForConnectionPermission();
|
||||||
|
|
||||||
// Do an initial domain probe to determine the root URL
|
// Do an initial domain probe to determine the root URL
|
||||||
var probeResult = probeRootUrl();
|
var probeResult = probeRootUrl();
|
||||||
|
|
||||||
@@ -108,15 +120,24 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(probedUrl, delayTimer);
|
DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(probedUrl, delayTimer);
|
||||||
domainStateDb.save(summaryRecord);
|
domainStateDb.save(summaryRecord);
|
||||||
|
|
||||||
|
if (Thread.interrupted()) {
|
||||||
|
// There's a small chance we're interrupted during the sniffing portion
|
||||||
|
throw new InterruptedException();
|
||||||
|
}
|
||||||
|
|
||||||
|
Instant recrawlStart = Instant.now();
|
||||||
|
CrawlerRevisitor.RecrawlMetadata recrawlMetadata = crawlerRevisitor.recrawl(oldCrawlData, cookies, robotsRules, delayTimer);
|
||||||
|
Duration recrawlTime = Duration.between(recrawlStart, Instant.now());
|
||||||
|
|
||||||
// Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
|
// Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
|
||||||
if (crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer) > 0) {
|
if (recrawlMetadata.size() > 0) {
|
||||||
// If we have reference data, we will always grow the crawl depth a bit
|
// If we have reference data, we will always grow the crawl depth a bit
|
||||||
crawlFrontier.increaseDepth(1.5, 2500);
|
crawlFrontier.increaseDepth(1.5, 2500);
|
||||||
}
|
}
|
||||||
|
|
||||||
oldCrawlData.close(); // proactively close the crawl data reference here to not hold onto expensive resources
|
oldCrawlData.close(); // proactively close the crawl data reference here to not hold onto expensive resources
|
||||||
|
|
||||||
yield crawlDomain(probedUrl, robotsRules, delayTimer, domainLinks);
|
yield crawlDomain(probedUrl, robotsRules, delayTimer, domainLinks, recrawlMetadata, recrawlTime);
|
||||||
}
|
}
|
||||||
case HttpFetcher.DomainProbeResult.Redirect(EdgeDomain domain1) -> {
|
case HttpFetcher.DomainProbeResult.Redirect(EdgeDomain domain1) -> {
|
||||||
domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, "Redirect", domain1.toString()));
|
domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, "Redirect", domain1.toString()));
|
||||||
@@ -126,6 +147,10 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, status.toString(), desc));
|
domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, status.toString(), desc));
|
||||||
yield 1;
|
yield 1;
|
||||||
}
|
}
|
||||||
|
default -> {
|
||||||
|
logger.error("Unexpected domain probe result {}", probeResult);
|
||||||
|
yield 1;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
@@ -138,17 +163,29 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
private int crawlDomain(EdgeUrl rootUrl,
|
private int crawlDomain(EdgeUrl rootUrl,
|
||||||
SimpleRobotRules robotsRules,
|
SimpleRobotRules robotsRules,
|
||||||
CrawlDelayTimer delayTimer,
|
CrawlDelayTimer delayTimer,
|
||||||
DomainLinks domainLinks) {
|
DomainLinks domainLinks,
|
||||||
|
CrawlerRevisitor.RecrawlMetadata recrawlMetadata,
|
||||||
|
Duration recrawlTime) {
|
||||||
|
|
||||||
|
Instant crawlStart = Instant.now();
|
||||||
|
|
||||||
// Add external links to the crawl frontier
|
// Add external links to the crawl frontier
|
||||||
crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));
|
crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));
|
||||||
|
|
||||||
// Fetch sitemaps
|
// Fetch sitemaps
|
||||||
for (var sitemap : robotsRules.getSitemaps()) {
|
for (var sitemap : robotsRules.getSitemaps()) {
|
||||||
crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
|
|
||||||
|
// Validate the sitemap URL and check if it belongs to the domain as the root URL
|
||||||
|
if (EdgeUrl.parse(sitemap)
|
||||||
|
.map(url -> url.getDomain().equals(rootUrl.domain))
|
||||||
|
.orElse(false)) {
|
||||||
|
|
||||||
|
crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int crawlerAdditions = 0;
|
||||||
|
|
||||||
while (!crawlFrontier.isEmpty()
|
while (!crawlFrontier.isEmpty()
|
||||||
&& !crawlFrontier.isCrawlDepthReached()
|
&& !crawlFrontier.isCrawlDepthReached()
|
||||||
&& errorCount < MAX_ERRORS
|
&& errorCount < MAX_ERRORS
|
||||||
@@ -180,7 +217,11 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
continue;
|
continue;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
fetchContentWithReference(top, delayTimer, DocumentWithReference.empty());
|
var result = fetchContentWithReference(top, delayTimer, DocumentWithReference.empty());
|
||||||
|
|
||||||
|
if (result.isOk()) {
|
||||||
|
crawlerAdditions++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
catch (InterruptedException ex) {
|
catch (InterruptedException ex) {
|
||||||
Thread.currentThread().interrupt();
|
Thread.currentThread().interrupt();
|
||||||
@@ -188,6 +229,17 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Duration crawlTime = Duration.between(crawlStart, Instant.now());
|
||||||
|
domainStateDb.save(new DomainStateDb.CrawlMeta(
|
||||||
|
domain,
|
||||||
|
Instant.now(),
|
||||||
|
recrawlTime,
|
||||||
|
crawlTime,
|
||||||
|
recrawlMetadata.errors(),
|
||||||
|
crawlerAdditions,
|
||||||
|
recrawlMetadata.size() + crawlerAdditions
|
||||||
|
));
|
||||||
|
|
||||||
return crawlFrontier.visitedSize();
|
return crawlFrontier.visitedSize();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -216,17 +268,29 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
return domainProbeResult;
|
return domainProbeResult;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
private DomainStateDb.SummaryRecord sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) {
|
private DomainStateDb.SummaryRecord sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) {
|
||||||
Optional<String> feedLink = Optional.empty();
|
Optional<String> feedLink = Optional.empty();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
var url = rootUrl.withPathAndParam("/", null);
|
var url = rootUrl.withPathAndParam("/", null);
|
||||||
|
|
||||||
HttpFetchResult result = fetchWithRetry(url, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
|
HttpFetchResult result = fetcher.fetchContent(url, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
|
||||||
timer.waitFetchDelay(0);
|
timer.waitFetchDelay(0);
|
||||||
|
|
||||||
if (!(result instanceof HttpFetchResult.ResultOk ok))
|
if (result instanceof HttpFetchResult.ResultRedirect(EdgeUrl location)) {
|
||||||
|
if (Objects.equals(location.domain, url.domain)) {
|
||||||
|
// TODO: Follow the redirect to the new location and sniff the document
|
||||||
|
crawlFrontier.addFirst(location);
|
||||||
|
}
|
||||||
|
|
||||||
return DomainStateDb.SummaryRecord.forSuccess(domain);
|
return DomainStateDb.SummaryRecord.forSuccess(domain);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!(result instanceof HttpFetchResult.ResultOk ok)) {
|
||||||
|
return DomainStateDb.SummaryRecord.forSuccess(domain);
|
||||||
|
}
|
||||||
|
|
||||||
var optDoc = ok.parseDocument();
|
var optDoc = ok.parseDocument();
|
||||||
if (optDoc.isEmpty())
|
if (optDoc.isEmpty())
|
||||||
@@ -275,7 +339,7 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
|
|
||||||
// Grab the favicon if it exists
|
// Grab the favicon if it exists
|
||||||
|
|
||||||
if (fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty()) instanceof HttpFetchResult.ResultOk iconResult) {
|
if (fetcher.fetchContent(faviconUrl, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED) instanceof HttpFetchResult.ResultOk iconResult) {
|
||||||
String contentType = iconResult.header("Content-Type");
|
String contentType = iconResult.header("Content-Type");
|
||||||
byte[] iconData = iconResult.getBodyBytes();
|
byte[] iconData = iconResult.getBodyBytes();
|
||||||
|
|
||||||
@@ -289,6 +353,10 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
logger.error("Error configuring link filter", ex);
|
logger.error("Error configuring link filter", ex);
|
||||||
|
if (Thread.interrupted()) {
|
||||||
|
Thread.currentThread().interrupt();
|
||||||
|
return DomainStateDb.SummaryRecord.forError(domain, "Crawler Interrupted", ex.getMessage());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
finally {
|
finally {
|
||||||
crawlFrontier.addVisited(rootUrl);
|
crawlFrontier.addVisited(rootUrl);
|
||||||
@@ -316,7 +384,7 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
);
|
);
|
||||||
|
|
||||||
private Optional<String> guessFeedUrl(CrawlDelayTimer timer) throws InterruptedException {
|
private Optional<String> guessFeedUrl(CrawlDelayTimer timer) throws InterruptedException {
|
||||||
var oldDomainStateRecord = domainStateDb.get(domain);
|
var oldDomainStateRecord = domainStateDb.getSummary(domain);
|
||||||
|
|
||||||
// If we are already aware of an old feed URL, then we can just revalidate it
|
// If we are already aware of an old feed URL, then we can just revalidate it
|
||||||
if (oldDomainStateRecord.isPresent()) {
|
if (oldDomainStateRecord.isPresent()) {
|
||||||
@@ -341,7 +409,7 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
if (parsedOpt.isEmpty())
|
if (parsedOpt.isEmpty())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
HttpFetchResult result = fetchWithRetry(parsedOpt.get(), timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
|
HttpFetchResult result = fetcher.fetchContent(parsedOpt.get(), warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
|
||||||
timer.waitFetchDelay(0);
|
timer.waitFetchDelay(0);
|
||||||
|
|
||||||
if (!(result instanceof HttpFetchResult.ResultOk ok)) {
|
if (!(result instanceof HttpFetchResult.ResultOk ok)) {
|
||||||
@@ -367,112 +435,63 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
CrawlDelayTimer timer,
|
CrawlDelayTimer timer,
|
||||||
DocumentWithReference reference) throws InterruptedException
|
DocumentWithReference reference) throws InterruptedException
|
||||||
{
|
{
|
||||||
logger.debug("Fetching {}", top);
|
|
||||||
|
|
||||||
long startTime = System.currentTimeMillis();
|
|
||||||
var contentTags = reference.getContentTags();
|
var contentTags = reference.getContentTags();
|
||||||
|
|
||||||
HttpFetchResult fetchedDoc = fetchWithRetry(top, timer, HttpFetcher.ProbeType.FULL, contentTags);
|
HttpFetchResult fetchedDoc = fetcher.fetchContent(top, warcRecorder, cookies, timer, contentTags, HttpFetcher.ProbeType.FULL);
|
||||||
|
timer.waitFetchDelay();
|
||||||
|
|
||||||
|
if (Thread.interrupted()) {
|
||||||
|
Thread.currentThread().interrupt();
|
||||||
|
throw new InterruptedException();
|
||||||
|
}
|
||||||
|
|
||||||
// Parse the document and enqueue links
|
// Parse the document and enqueue links
|
||||||
try {
|
try {
|
||||||
if (fetchedDoc instanceof HttpFetchResult.ResultOk ok) {
|
switch (fetchedDoc) {
|
||||||
var docOpt = ok.parseDocument();
|
case HttpFetchResult.ResultOk ok -> {
|
||||||
if (docOpt.isPresent()) {
|
var docOpt = ok.parseDocument();
|
||||||
var doc = docOpt.get();
|
if (docOpt.isPresent()) {
|
||||||
|
var doc = docOpt.get();
|
||||||
|
|
||||||
var responseUrl = new EdgeUrl(ok.uri());
|
var responseUrl = new EdgeUrl(ok.uri());
|
||||||
|
|
||||||
crawlFrontier.enqueueLinksFromDocument(responseUrl, doc);
|
crawlFrontier.enqueueLinksFromDocument(responseUrl, doc);
|
||||||
crawlFrontier.addVisited(responseUrl);
|
crawlFrontier.addVisited(responseUrl);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
case HttpFetchResult.Result304Raw ref when reference.doc() != null ->
|
||||||
else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) {
|
{
|
||||||
var doc = reference.doc();
|
var doc = reference.doc();
|
||||||
|
|
||||||
warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBodyBytes, doc.headers, contentTags);
|
warcRecorder.writeReferenceCopy(top, cookies, doc.contentType, doc.httpStatus, doc.documentBodyBytes, doc.headers, contentTags);
|
||||||
|
|
||||||
fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
|
fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
|
||||||
new ContentType(doc.contentType, "UTF-8"),
|
new ContentType(doc.contentType, "UTF-8"),
|
||||||
doc.documentBodyBytes);
|
doc.documentBodyBytes);
|
||||||
|
|
||||||
if (doc.documentBodyBytes != null) {
|
if (doc.documentBodyBytes != null) {
|
||||||
var parsed = doc.parseBody();
|
var parsed = doc.parseBody();
|
||||||
|
|
||||||
crawlFrontier.enqueueLinksFromDocument(top, parsed);
|
crawlFrontier.enqueueLinksFromDocument(top, parsed);
|
||||||
crawlFrontier.addVisited(top);
|
crawlFrontier.addVisited(top);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
case HttpFetchResult.ResultRedirect(EdgeUrl location) -> {
|
||||||
else if (fetchedDoc instanceof HttpFetchResult.ResultException) {
|
if (Objects.equals(location.domain, top.domain)) {
|
||||||
errorCount ++;
|
crawlFrontier.addFirst(location);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case HttpFetchResult.ResultException ex -> errorCount++;
|
||||||
|
default -> {} // Ignore other types
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
logger.error("Error parsing document {}", top, ex);
|
logger.error("Error parsing document {}", top, ex);
|
||||||
}
|
}
|
||||||
|
|
||||||
timer.waitFetchDelay(System.currentTimeMillis() - startTime);
|
|
||||||
|
|
||||||
return fetchedDoc;
|
return fetchedDoc;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Fetch a document and retry on 429s */
|
|
||||||
private HttpFetchResult fetchWithRetry(EdgeUrl url,
|
|
||||||
CrawlDelayTimer timer,
|
|
||||||
HttpFetcher.ProbeType probeType,
|
|
||||||
ContentTags contentTags) throws InterruptedException {
|
|
||||||
|
|
||||||
long probeStart = System.currentTimeMillis();
|
|
||||||
|
|
||||||
if (probeType == HttpFetcher.ProbeType.FULL) {
|
|
||||||
retryLoop:
|
|
||||||
for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
|
|
||||||
try {
|
|
||||||
var probeResult = fetcher.probeContentType(url, warcRecorder, contentTags);
|
|
||||||
|
|
||||||
switch (probeResult) {
|
|
||||||
case HttpFetcher.ContentTypeProbeResult.Ok(EdgeUrl resolvedUrl):
|
|
||||||
url = resolvedUrl; // If we were redirected while probing, use the final URL for fetching
|
|
||||||
break retryLoop;
|
|
||||||
case HttpFetcher.ContentTypeProbeResult.BadContentType badContentType:
|
|
||||||
return new HttpFetchResult.ResultNone();
|
|
||||||
case HttpFetcher.ContentTypeProbeResult.BadContentType.Timeout timeout:
|
|
||||||
return new HttpFetchResult.ResultException(timeout.ex());
|
|
||||||
case HttpFetcher.ContentTypeProbeResult.Exception exception:
|
|
||||||
return new HttpFetchResult.ResultException(exception.ex());
|
|
||||||
default: // should be unreachable
|
|
||||||
throw new IllegalStateException("Unknown probe result");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
catch (HttpFetcherImpl.RateLimitException ex) {
|
|
||||||
timer.waitRetryDelay(ex);
|
|
||||||
}
|
|
||||||
catch (Exception ex) {
|
|
||||||
logger.warn("Failed to fetch {}", url, ex);
|
|
||||||
return new HttpFetchResult.ResultException(ex);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
timer.waitFetchDelay(System.currentTimeMillis() - probeStart);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
|
|
||||||
try {
|
|
||||||
return fetcher.fetchContent(url, warcRecorder, contentTags, probeType);
|
|
||||||
}
|
|
||||||
catch (HttpFetcherImpl.RateLimitException ex) {
|
|
||||||
timer.waitRetryDelay(ex);
|
|
||||||
}
|
|
||||||
catch (Exception ex) {
|
|
||||||
logger.warn("Failed to fetch {}", url, ex);
|
|
||||||
return new HttpFetchResult.ResultException(ex);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return new HttpFetchResult.ResultNone();
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean isAllowedProtocol(String proto) {
|
private boolean isAllowedProtocol(String proto) {
|
||||||
return proto.equalsIgnoreCase("http")
|
return proto.equalsIgnoreCase("http")
|
||||||
|| proto.equalsIgnoreCase("https");
|
|| proto.equalsIgnoreCase("https");
|
||||||
|
@@ -55,6 +55,9 @@ public class DomainCrawlFrontier {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public EdgeDomain getDomain() {
|
||||||
|
return thisDomain;
|
||||||
|
}
|
||||||
/** Increase the depth of the crawl by a factor. If the current depth is smaller
|
/** Increase the depth of the crawl by a factor. If the current depth is smaller
|
||||||
* than the number of already visited documents, the base depth will be adjusted
|
* than the number of already visited documents, the base depth will be adjusted
|
||||||
* to the visited count first.
|
* to the visited count first.
|
||||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.crawl.retreival.revisit;
|
|||||||
|
|
||||||
import crawlercommons.robots.SimpleRobotRules;
|
import crawlercommons.robots.SimpleRobotRules;
|
||||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||||
|
import nu.marginalia.crawl.fetcher.DomainCookies;
|
||||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||||
import nu.marginalia.crawl.retreival.CrawlDataReference;
|
import nu.marginalia.crawl.retreival.CrawlDataReference;
|
||||||
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
|
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
|
||||||
@@ -10,6 +11,8 @@ import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
|
|||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.body.HttpFetchResult;
|
import nu.marginalia.model.body.HttpFetchResult;
|
||||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
@@ -18,10 +21,13 @@ import java.io.IOException;
|
|||||||
* E-Tag and Last-Modified headers.
|
* E-Tag and Last-Modified headers.
|
||||||
*/
|
*/
|
||||||
public class CrawlerRevisitor {
|
public class CrawlerRevisitor {
|
||||||
|
|
||||||
private final DomainCrawlFrontier crawlFrontier;
|
private final DomainCrawlFrontier crawlFrontier;
|
||||||
private final CrawlerRetreiver crawlerRetreiver;
|
private final CrawlerRetreiver crawlerRetreiver;
|
||||||
private final WarcRecorder warcRecorder;
|
private final WarcRecorder warcRecorder;
|
||||||
|
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(CrawlerRevisitor.class);
|
||||||
|
|
||||||
public CrawlerRevisitor(DomainCrawlFrontier crawlFrontier,
|
public CrawlerRevisitor(DomainCrawlFrontier crawlFrontier,
|
||||||
CrawlerRetreiver crawlerRetreiver,
|
CrawlerRetreiver crawlerRetreiver,
|
||||||
WarcRecorder warcRecorder) {
|
WarcRecorder warcRecorder) {
|
||||||
@@ -31,7 +37,8 @@ public class CrawlerRevisitor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/** Performs a re-crawl of old documents, comparing etags and last-modified */
|
/** Performs a re-crawl of old documents, comparing etags and last-modified */
|
||||||
public int recrawl(CrawlDataReference oldCrawlData,
|
public RecrawlMetadata recrawl(CrawlDataReference oldCrawlData,
|
||||||
|
DomainCookies cookies,
|
||||||
SimpleRobotRules robotsRules,
|
SimpleRobotRules robotsRules,
|
||||||
CrawlDelayTimer delayTimer)
|
CrawlDelayTimer delayTimer)
|
||||||
throws InterruptedException {
|
throws InterruptedException {
|
||||||
@@ -39,6 +46,7 @@ public class CrawlerRevisitor {
|
|||||||
int retained = 0;
|
int retained = 0;
|
||||||
int errors = 0;
|
int errors = 0;
|
||||||
int skipped = 0;
|
int skipped = 0;
|
||||||
|
int size = 0;
|
||||||
|
|
||||||
for (CrawledDocument doc : oldCrawlData) {
|
for (CrawledDocument doc : oldCrawlData) {
|
||||||
if (errors > 20) {
|
if (errors > 20) {
|
||||||
@@ -46,6 +54,10 @@ public class CrawlerRevisitor {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (Thread.interrupted()) {
|
||||||
|
throw new InterruptedException();
|
||||||
|
}
|
||||||
|
|
||||||
var urlMaybe = EdgeUrl.parse(doc.url);
|
var urlMaybe = EdgeUrl.parse(doc.url);
|
||||||
if (urlMaybe.isEmpty())
|
if (urlMaybe.isEmpty())
|
||||||
continue;
|
continue;
|
||||||
@@ -62,7 +74,7 @@ public class CrawlerRevisitor {
|
|||||||
|
|
||||||
// If the reference document is empty or the HTTP status is not 200, we'll skip it since it's
|
// If the reference document is empty or the HTTP status is not 200, we'll skip it since it's
|
||||||
// unlikely to produce anything meaningful for us.
|
// unlikely to produce anything meaningful for us.
|
||||||
if (doc.httpStatus != 200)
|
if (doc.httpStatus != 200 && doc.httpStatus != 206)
|
||||||
continue;
|
continue;
|
||||||
if (!doc.hasBody())
|
if (!doc.hasBody())
|
||||||
continue;
|
continue;
|
||||||
@@ -78,6 +90,7 @@ public class CrawlerRevisitor {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
size++;
|
||||||
|
|
||||||
double skipProb;
|
double skipProb;
|
||||||
|
|
||||||
@@ -121,6 +134,7 @@ public class CrawlerRevisitor {
|
|||||||
}
|
}
|
||||||
// Add a WARC record so we don't repeat this
|
// Add a WARC record so we don't repeat this
|
||||||
warcRecorder.writeReferenceCopy(url,
|
warcRecorder.writeReferenceCopy(url,
|
||||||
|
cookies,
|
||||||
doc.contentType,
|
doc.contentType,
|
||||||
doc.httpStatus,
|
doc.httpStatus,
|
||||||
doc.documentBodyBytes,
|
doc.documentBodyBytes,
|
||||||
@@ -145,11 +159,15 @@ public class CrawlerRevisitor {
|
|||||||
else if (result instanceof HttpFetchResult.ResultException) {
|
else if (result instanceof HttpFetchResult.ResultException) {
|
||||||
errors++;
|
errors++;
|
||||||
}
|
}
|
||||||
|
|
||||||
recrawled++;
|
recrawled++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return recrawled;
|
logger.info("Recrawl summary {}: {} recrawled, {} retained, {} errors, {} skipped",
|
||||||
|
crawlFrontier.getDomain(), recrawled, retained, errors, skipped);
|
||||||
|
|
||||||
|
return new RecrawlMetadata(size, errors, skipped);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public record RecrawlMetadata(int size, int errors, int skipped) {}
|
||||||
}
|
}
|
||||||
|
@@ -6,6 +6,7 @@ import nu.marginalia.model.body.HttpFetchResult;
|
|||||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||||
|
|
||||||
import javax.annotation.Nullable;
|
import javax.annotation.Nullable;
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
public record DocumentWithReference(
|
public record DocumentWithReference(
|
||||||
@Nullable CrawledDocument doc,
|
@Nullable CrawledDocument doc,
|
||||||
@@ -33,8 +34,22 @@ public record DocumentWithReference(
|
|||||||
return false;
|
return false;
|
||||||
if (doc == null)
|
if (doc == null)
|
||||||
return false;
|
return false;
|
||||||
if (doc.documentBodyBytes.length == 0)
|
if (doc.documentBodyBytes.length == 0) {
|
||||||
return false;
|
if (doc.httpStatus < 300) {
|
||||||
|
return resultOk.bytesLength() == 0;
|
||||||
|
}
|
||||||
|
else if (doc.httpStatus == 301 || doc.httpStatus == 302 || doc.httpStatus == 307) {
|
||||||
|
@Nullable
|
||||||
|
String docLocation = doc.getHeader("Location");
|
||||||
|
@Nullable
|
||||||
|
String resultLocation = resultOk.header("Location");
|
||||||
|
|
||||||
|
return Objects.equals(docLocation, resultLocation);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return doc.httpStatus == resultOk.statusCode();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return CrawlDataReference.isContentBodySame(doc.documentBodyBytes, resultOk.bytesRaw());
|
return CrawlDataReference.isContentBodySame(doc.documentBodyBytes, resultOk.bytesRaw());
|
||||||
}
|
}
|
||||||
@@ -43,7 +58,7 @@ public record DocumentWithReference(
|
|||||||
if (null == doc)
|
if (null == doc)
|
||||||
return ContentTags.empty();
|
return ContentTags.empty();
|
||||||
|
|
||||||
if (doc.documentBodyBytes.length == 0 || doc.httpStatus != 200)
|
if (doc.documentBodyBytes.length == 0 || (doc.httpStatus != 200 && doc.httpStatus != 206))
|
||||||
return ContentTags.empty();
|
return ContentTags.empty();
|
||||||
|
|
||||||
String lastmod = doc.getLastModified();
|
String lastmod = doc.getLastModified();
|
||||||
|
@@ -41,6 +41,8 @@ dependencies {
|
|||||||
implementation libs.snakeyaml
|
implementation libs.snakeyaml
|
||||||
implementation libs.zstd
|
implementation libs.zstd
|
||||||
|
|
||||||
|
implementation libs.bundles.httpcomponents
|
||||||
|
|
||||||
testImplementation libs.bundles.slf4j.test
|
testImplementation libs.bundles.slf4j.test
|
||||||
testImplementation libs.bundles.junit
|
testImplementation libs.bundles.junit
|
||||||
testImplementation libs.mockito
|
testImplementation libs.mockito
|
||||||
|
@@ -1,22 +1,32 @@
|
|||||||
package nu.marginalia;
|
package nu.marginalia;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
public class ContentTypes {
|
public class ContentTypes {
|
||||||
public static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
|
public static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
|
||||||
"application/xhtml",
|
"application/xhtml",
|
||||||
"text/html",
|
"text/html",
|
||||||
|
"text/markdown",
|
||||||
|
"text/x-markdown",
|
||||||
|
"application/pdf",
|
||||||
"image/x-icon",
|
"image/x-icon",
|
||||||
"text/plain");
|
"text/plain");
|
||||||
|
|
||||||
public static boolean isAccepted(String contentTypeHeader) {
|
public static boolean isAccepted(String contentTypeHeader) {
|
||||||
String lcHeader = contentTypeHeader.toLowerCase();
|
String lcHeader = StringUtils.substringBefore(contentTypeHeader.toLowerCase(), ';');
|
||||||
for (var type : acceptedContentTypes) {
|
for (var type : acceptedContentTypes) {
|
||||||
if (lcHeader.startsWith(type)) {
|
if (lcHeader.equals(type)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static boolean isBinary(String contentTypeHeader) {
|
||||||
|
String lcHeader = StringUtils.substringBefore(contentTypeHeader.toLowerCase(), ';');
|
||||||
|
return lcHeader.startsWith("application/pdf");
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@@ -42,18 +42,20 @@ public interface SerializableCrawlDataStream extends AutoCloseable {
|
|||||||
{
|
{
|
||||||
|
|
||||||
String fileName = fullPath.getFileName().toString();
|
String fileName = fullPath.getFileName().toString();
|
||||||
if (fileName.endsWith(".parquet")) {
|
|
||||||
|
if (fileName.endsWith(".slop.zip")) {
|
||||||
try {
|
try {
|
||||||
return new ParquetSerializableCrawlDataStream(fullPath);
|
return new SlopSerializableCrawlDataStream(fullPath);
|
||||||
} catch (Exception ex) {
|
} catch (Exception ex) {
|
||||||
logger.error("Error reading domain data from " + fullPath, ex);
|
logger.error("Error reading domain data from " + fullPath, ex);
|
||||||
return SerializableCrawlDataStream.empty();
|
return SerializableCrawlDataStream.empty();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fileName.endsWith(".slop.zip")) {
|
else if (fileName.endsWith(".parquet")) {
|
||||||
|
logger.error("Opening deprecated parquet-style crawl data stream", new Exception());
|
||||||
try {
|
try {
|
||||||
return new SlopSerializableCrawlDataStream(fullPath);
|
return new ParquetSerializableCrawlDataStream(fullPath);
|
||||||
} catch (Exception ex) {
|
} catch (Exception ex) {
|
||||||
logger.error("Error reading domain data from " + fullPath, ex);
|
logger.error("Error reading domain data from " + fullPath, ex);
|
||||||
return SerializableCrawlDataStream.empty();
|
return SerializableCrawlDataStream.empty();
|
||||||
|
@@ -148,6 +148,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
|
|||||||
nextRecord.body,
|
nextRecord.body,
|
||||||
// this field isn't actually used, maybe we can skip calculating it?
|
// this field isn't actually used, maybe we can skip calculating it?
|
||||||
nextRecord.cookies,
|
nextRecord.cookies,
|
||||||
|
-1,
|
||||||
lastModified,
|
lastModified,
|
||||||
etag));
|
etag));
|
||||||
}
|
}
|
||||||
|
@@ -37,8 +37,12 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
|
|||||||
public boolean filter(String url, int status, String contentType) {
|
public boolean filter(String url, int status, String contentType) {
|
||||||
String ctLc = contentType.toLowerCase();
|
String ctLc = contentType.toLowerCase();
|
||||||
|
|
||||||
|
// Permit all plain text content types
|
||||||
if (ctLc.startsWith("text/"))
|
if (ctLc.startsWith("text/"))
|
||||||
return true;
|
return true;
|
||||||
|
// PDF
|
||||||
|
else if (ctLc.startsWith("application/pdf"))
|
||||||
|
return true;
|
||||||
else if (ctLc.startsWith("x-marginalia/"))
|
else if (ctLc.startsWith("x-marginalia/"))
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
@@ -162,6 +166,7 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
|
|||||||
nextRecord.body(),
|
nextRecord.body(),
|
||||||
// this field isn't actually used, maybe we can skip calculating it?
|
// this field isn't actually used, maybe we can skip calculating it?
|
||||||
nextRecord.cookies(),
|
nextRecord.cookies(),
|
||||||
|
nextRecord.requestTimeMs(),
|
||||||
null,
|
null,
|
||||||
null));
|
null));
|
||||||
}
|
}
|
||||||
|
@@ -10,7 +10,7 @@ import java.util.regex.Pattern;
|
|||||||
|
|
||||||
public class ContentTypeLogic {
|
public class ContentTypeLogic {
|
||||||
|
|
||||||
private static final Predicate<String> probableHtmlPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md)$").asMatchPredicate();
|
private static final Predicate<String> probableGoodPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md|pdf)$").asMatchPredicate();
|
||||||
private static final Predicate<String> probableBinaryPattern = Pattern.compile("^.*\\.[a-z]+$").asMatchPredicate();
|
private static final Predicate<String> probableBinaryPattern = Pattern.compile("^.*\\.[a-z]+$").asMatchPredicate();
|
||||||
private static final Set<String> blockedContentTypes = Set.of("text/css", "text/javascript");
|
private static final Set<String> blockedContentTypes = Set.of("text/css", "text/javascript");
|
||||||
private static final List<String> acceptedContentTypePrefixes = List.of(
|
private static final List<String> acceptedContentTypePrefixes = List.of(
|
||||||
@@ -22,6 +22,7 @@ public class ContentTypeLogic {
|
|||||||
"application/rss+xml",
|
"application/rss+xml",
|
||||||
"application/x-rss+xml",
|
"application/x-rss+xml",
|
||||||
"application/rdf+xml",
|
"application/rdf+xml",
|
||||||
|
"application/pdf",
|
||||||
"x-rss+xml"
|
"x-rss+xml"
|
||||||
);
|
);
|
||||||
private boolean allowAllContentTypes = false;
|
private boolean allowAllContentTypes = false;
|
||||||
@@ -34,7 +35,7 @@ public class ContentTypeLogic {
|
|||||||
public boolean isUrlLikeBinary(EdgeUrl url) {
|
public boolean isUrlLikeBinary(EdgeUrl url) {
|
||||||
String pathLowerCase = url.path.toLowerCase();
|
String pathLowerCase = url.path.toLowerCase();
|
||||||
|
|
||||||
if (probableHtmlPattern.test(pathLowerCase))
|
if (probableGoodPattern.test(pathLowerCase))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
return probableBinaryPattern.test(pathLowerCase);
|
return probableBinaryPattern.test(pathLowerCase);
|
||||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user