mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
Compare commits
72 Commits
deploy-012
...
deploy-017
fd5af01629
d4c43c7a79
18700e1919
120b431998
71dad99326
c1e8afdf86
fa32dddc24
a266fcbf30
6e47e58e0e
9dc43d8b4a
83967e3305
4db980a291
089b177868
9c8e9a68d5
413d5cc788
58539b92ac
fe72f16df1
b49a244a2e
3f0b4c010f
c6e0cd93f7
80a7ccb080
54dec347c4
d6ee3f0785
8be88afcf3
0e3c00d3e1
4279a7f1aa
251006d4f9
c3e99dc12a
aaaa2de022
fc1388422a
b07080db16
e9d86dca4a
1d693f0efa
5874a163dc
5ec7a1deab
7fea2808ed
8da74484f0
923d5a7234
58f88749b8
77f727a5ba
667cfb53dc
fe36d4ed20
acf4bef98d
2a737c34bb
90a577af82
f0c9b935d8
7b5493dd51
c246a59158
0b99781d24
39db9620c1
1781599363
6b2d18fb9b
59b1d200ab
897010a2cf
602af7a77e
a7d91c8527
7151602124
884e33bd4a
e84d5c497a
2d2d3e2466
647dd9b12f
de4e2849ce
3c43f1954e
fa2462ec39
f4ad7145db
068b450180
05b909a21f
3d179cddce
1a2aae496a
353cdffb3f
2e3f1313c7
58e6f141ce
@@ -5,7 +5,7 @@ plugins {

     // This is a workaround for a bug in the Jib plugin that causes it to stall randomly
     // https://github.com/GoogleContainerTools/jib/issues/3347
-    id 'com.google.cloud.tools.jib' version '3.4.4' apply(false)
+    id 'com.google.cloud.tools.jib' version '3.4.5' apply(false)
 }

 group 'marginalia'
@@ -47,7 +47,7 @@ ext {
     dockerImageBase='container-registry.oracle.com/graalvm/jdk:24'
     dockerImageTag='latest'
     dockerImageRegistry='marginalia'
-    jibVersion = '3.4.4'
+    jibVersion = '3.4.5'
 }

 idea {
@@ -1,16 +1,14 @@
 package nu.marginalia.model;

 import nu.marginalia.util.QueryParams;
+import org.apache.commons.lang3.StringUtils;

 import javax.annotation.Nullable;
 import java.io.Serializable;
-import java.net.MalformedURLException;
-import java.net.URI;
-import java.net.URISyntaxException;
-import java.net.URL;
+import java.net.*;
 import java.nio.charset.StandardCharsets;
 import java.util.Objects;
 import java.util.Optional;
-import java.util.regex.Pattern;

 public class EdgeUrl implements Serializable {
     public final String proto;
@@ -33,7 +31,7 @@ public class EdgeUrl implements Serializable {

     private static URI parseURI(String url) throws URISyntaxException {
         try {
-            return new URI(urlencodeFixer(url));
+            return EdgeUriFactory.parseURILenient(url);
         } catch (URISyntaxException ex) {
             throw new URISyntaxException("Failed to parse URI '" + url + "'", ex.getMessage());
         }
@@ -51,58 +49,6 @@ public class EdgeUrl implements Serializable {
         }
     }

-    private static Pattern badCharPattern = Pattern.compile("[ \t\n\"<>\\[\\]()',|]");
-
-    /* Java's URI parser is a bit too strict in throwing exceptions when there's an error.
-
-       Here on the Internet, standards are like the picture on the box of the frozen pizza,
-       and what you get is more like what's on the inside, we try to patch things instead,
-       just give it a best-effort attempt att cleaning out broken or unnecessary constructions
-       like bad or missing URLEncoding
-     */
-    public static String urlencodeFixer(String url) throws URISyntaxException {
-        var s = new StringBuilder();
-        String goodChars = "&.?:/-;+$#";
-        String hexChars = "0123456789abcdefABCDEF";
-
-        int pathIdx = findPathIdx(url);
-        if (pathIdx < 0) { // url looks like http://marginalia.nu
-            return url + "/";
-        }
-        s.append(url, 0, pathIdx);
-
-        // We don't want the fragment, and multiple fragments breaks the Java URIParser for some reason
-        int end = url.indexOf("#");
-        if (end < 0) end = url.length();
-
-        for (int i = pathIdx; i < end; i++) {
-            int c = url.charAt(i);
-
-            if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
-                s.appendCodePoint(c);
-            } else if (c == '%' && i + 2 < end) {
-                int cn = url.charAt(i + 1);
-                int cnn = url.charAt(i + 2);
-                if (hexChars.indexOf(cn) >= 0 && hexChars.indexOf(cnn) >= 0) {
-                    s.appendCodePoint(c);
-                } else {
-                    s.append("%25");
-                }
-            } else {
-                s.append(String.format("%%%02X", c));
-            }
-        }
-
-        return s.toString();
-    }
-
-    private static int findPathIdx(String url) throws URISyntaxException {
-        int colonIdx = url.indexOf(':');
-        if (colonIdx < 0 || colonIdx + 2 >= url.length()) {
-            throw new URISyntaxException(url, "Lacking protocol");
-        }
-        return url.indexOf('/', colonIdx + 2);
-    }
-
     public EdgeUrl(URI URI) {
         try {
@@ -166,11 +112,32 @@ public class EdgeUrl implements Serializable {
             sb.append(port);
         }

-        sb.append(path);
+        EdgeUriFactory.urlencodePath(sb, path);

         if (param != null) {
-            sb.append('?');
-            sb.append(param);
+            EdgeUriFactory.urlencodeQuery(sb, param);
+        }
+
+        return sb.toString();
+    }
+
+    public String toDisplayString() {
+        StringBuilder sb = new StringBuilder(256);
+
+        sb.append(proto);
+        sb.append("://");
+        sb.append(domain);
+
+        if (port != null) {
+            sb.append(':');
+            sb.append(port);
+        }
+
+        sb.append(path);
+
+        if (param != null) {
+            sb.append('?').append(param);
         }

         return sb.toString();
@@ -247,3 +214,244 @@ public class EdgeUrl implements Serializable {
     }

 }
+
+class EdgeUriFactory {
+    public static URI parseURILenient(String url) throws URISyntaxException {
+
+        if (shouldOmitUrlencodeRepair(url)) {
+            try {
+                return new URI(url);
+            }
+            catch (URISyntaxException ex) {
+                // ignore and run the lenient parser
+            }
+        }
+
+        var s = new StringBuilder(url.length()+8);
+
+        int pathIdx = findPathIdx(url);
+        if (pathIdx < 0) { // url looks like http://marginalia.nu
+            return new URI(url + "/");
+        }
+        s.append(url, 0, pathIdx);
+
+        // We don't want the fragment, and multiple fragments breaks the Java URIParser for some reason
+        int end = url.indexOf("#");
+        if (end < 0) end = url.length();
+
+        int queryIdx = url.indexOf('?');
+        if (queryIdx < 0) queryIdx = end;
+
+        urlencodePath(s, url.substring(pathIdx, queryIdx));
+        if (queryIdx < end) {
+            urlencodeQuery(s, url.substring(queryIdx + 1, end));
+        }
+        return new URI(s.toString());
+    }
+
+    /** Break apart the path element of an URI into its components, and then
+     * urlencode any component that needs it, and recombine it into a single
+     * path element again.
+     */
+    public static void urlencodePath(StringBuilder sb, String path) {
+        if (path == null || path.isEmpty()) {
+            return;
+        }
+
+        String[] pathParts = StringUtils.split(path, '/');
+        if (pathParts.length == 0) {
+            sb.append('/');
+            return;
+        }
+
+        boolean shouldUrlEncode = false;
+        for (String pathPart : pathParts) {
+            if (pathPart.isEmpty()) continue;
+
+            if (needsUrlEncode(pathPart)) {
+                shouldUrlEncode = true;
+                break;
+            }
+        }
+
+        for (String pathPart : pathParts) {
+            if (pathPart.isEmpty()) continue;
+
+            if (shouldUrlEncode) {
+                sb.append('/');
+                sb.append(URLEncoder.encode(pathPart, StandardCharsets.UTF_8).replace("+", "%20"));
+            } else {
+                sb.append('/');
+                sb.append(pathPart);
+            }
+        }
+
+        if (path.endsWith("/")) {
+            sb.append('/');
+        }
+    }
+
+    /** Break apart the query element of a URI into its components, and then
+     * urlencode any component that needs it, and recombine it into a single
+     * query element again.
+     */
+    public static void urlencodeQuery(StringBuilder sb, String param) {
+        if (param == null || param.isEmpty()) {
+            return;
+        }
+
+        String[] queryParts = StringUtils.split(param, '&');
+
+        boolean shouldUrlEncode = false;
+        for (String queryPart : queryParts) {
+            if (queryPart.isEmpty()) continue;
+
+            if (needsUrlEncode(queryPart)) {
+                shouldUrlEncode = true;
+                break;
+            }
+        }
+
+        boolean first = true;
+        for (String queryPart : queryParts) {
+            if (queryPart.isEmpty()) continue;
+
+            if (first) {
+                sb.append('?');
+                first = false;
+            } else {
+                sb.append('&');
+            }
+
+            if (shouldUrlEncode) {
+                int idx = queryPart.indexOf('=');
+                if (idx < 0) {
+                    sb.append(URLEncoder.encode(queryPart, StandardCharsets.UTF_8));
+                } else {
+                    sb.append(URLEncoder.encode(queryPart.substring(0, idx), StandardCharsets.UTF_8));
+                    sb.append('=');
+                    sb.append(URLEncoder.encode(queryPart.substring(idx + 1), StandardCharsets.UTF_8));
+                }
+            } else {
+                sb.append(queryPart);
+            }
+        }
+    }
+
+    /** Test if the url element needs URL encoding.
+     * <p></p>
+     * Note we may have been given an already encoded path element,
+     * so we include % and + in the list of good characters
+     */
+    static boolean needsUrlEncode(String urlElement) {
+        for (int i = 0; i < urlElement.length(); i++) {
+            char c = urlElement.charAt(i);
+
+            if (isUrlSafe(c)) continue;
+            if ("+".indexOf(c) >= 0) continue;
+            if (c == '%' && i + 2 < urlElement.length()) {
+                char c1 = urlElement.charAt(i + 1);
+                char c2 = urlElement.charAt(i + 2);
+                if (isHexDigit(c1) && isHexDigit(c2)) {
+                    i += 2;
+                    continue;
+                }
+            }
+
+            return true;
+        }
+
+        return false;
+    }
+
+    static boolean isUrlSafe(int c) {
+        if (c >= 'a' && c <= 'z') return true;
+        if (c >= 'A' && c <= 'Z') return true;
+        if (c >= '0' && c <= '9') return true;
+        if (c == '-' || c == '_' || c == '.' || c == '~') return true;
+
+        return false;
+    }
+
+    /** Test if the URL is a valid URL that does not need to be
+     * urlencoded.
+     * <p></p>
+     * This is a very simple heuristic test that does not guarantee
+     * that the URL is valid, but it will identify cases where we
+     * are fairly certain that the URL does not need encoding,
+     * so we can skip a bunch of allocations and string operations
+     * that would otherwise be needed to fix the URL.
+     */
+    static boolean shouldOmitUrlencodeRepair(String url) {
+        int idx = 0;
+        final int len = url.length();
+
+        // Validate the scheme
+        while (idx < len - 2) {
+            char c = url.charAt(idx++);
+            if (c == ':') break;
+            if (!isAsciiAlphabetic(c)) return false;
+        }
+        if (url.charAt(idx++) != '/') return false;
+        if (url.charAt(idx++) != '/') return false;
+
+        // Validate the authority
+        while (idx < len) {
+            char c = url.charAt(idx++);
+            if (c == '/') break;
+            if (c == ':') continue;
+            if (c == '@') continue;
+            if (!isUrlSafe(c)) return false;
+        }
+
+        // Validate the path
+        if (idx >= len) return true;
+
+        while (idx < len) {
+            char c = url.charAt(idx++);
+            if (c == '?') break;
+            if (c == '/') continue;
+            if (c == '#') return true;
+            if (!isUrlSafe(c)) return false;
+        }
+
+        if (idx >= len) return true;
+
+        // Validate the query
+        while (idx < len) {
+            char c = url.charAt(idx++);
+            if (c == '&') continue;
+            if (c == '=') continue;
+            if (c == '#') return true;
+            if (!isUrlSafe(c)) return false;
+        }
+
+        return true;
+    }
+
+    private static boolean isAsciiAlphabetic(int c) {
+        return (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
+    }
+
+    private static boolean isHexDigit(int c) {
+        return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
+    }
+
+    /** Find the index of the path element in a URL.
+     * <p></p>
+     * The path element starts after the scheme and authority part of the URL,
+     * which is everything up to and including the first slash after the colon.
+     */
+    private static int findPathIdx(String url) throws URISyntaxException {
+        int colonIdx = url.indexOf(':');
+        if (colonIdx < 0 || colonIdx + 3 >= url.length()) {
+            throw new URISyntaxException(url, "Lacking scheme");
+        }
+        return url.indexOf('/', colonIdx + 3);
+    }
+}
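The EdgeUriFactory above follows a two-tier pattern: a cheap single-pass scan (shouldOmitUrlencodeRepair) lets already well-formed URLs bypass the repair entirely, and only malformed URLs pay for per-component re-encoding. A minimal standalone sketch of the same idea, assuming JDK-only APIs; the class and method names here are hypothetical and are not part of the commit:

import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;

class LenientUrls {
    // Fast path: try the strict parser first; only repair on failure.
    static URI parse(String url) throws URISyntaxException {
        try {
            return new URI(url); // most URLs are already well-formed
        } catch (URISyntaxException e) {
            return new URI(reencodePath(url));
        }
    }

    // Slow path: re-encode each path segment, preserving '/' separators.
    private static String reencodePath(String url) throws URISyntaxException {
        int schemeEnd = url.indexOf("://");
        if (schemeEnd < 0) throw new URISyntaxException(url, "Lacking scheme");
        int pathStart = url.indexOf('/', schemeEnd + 3);
        if (pathStart < 0) return url + "/"; // e.g. http://marginalia.nu

        StringBuilder sb = new StringBuilder(url.substring(0, pathStart));
        for (String segment : url.substring(pathStart).split("/", -1)) {
            if (!segment.isEmpty()) {
                // URLEncoder targets form encoding, so spaces become '+';
                // translate them back to %20 for a path context
                sb.append('/').append(URLEncoder.encode(segment, StandardCharsets.UTF_8)
                        .replace("+", "%20"));
            }
        }
        if (url.endsWith("/")) sb.append('/');
        return sb.toString();
    }
}

Unlike the real EdgeUriFactory, this sketch re-encodes unconditionally on the slow path, so an already-escaped sequence like %22 would be double-encoded; the committed code avoids that by treating a valid %XX triplet as safe in needsUrlEncode.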
@@ -1,6 +1,6 @@
 package nu.marginalia.model;

-import nu.marginalia.model.EdgeUrl;
+import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;

 import java.net.URISyntaxException;
@@ -21,25 +21,70 @@ class EdgeUrlTest {
             new EdgeUrl("https://memex.marginalia.nu/#here")
         );
     }

     @Test
-    public void testParam() throws URISyntaxException {
-        System.out.println(new EdgeUrl("https://memex.marginalia.nu/index.php?id=1").toString());
-        System.out.println(new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
-    }
-    @Test
-    void urlencodeFixer() throws URISyntaxException {
-        System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/#heredoc"));
-        System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign"));
-        System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%22-sign"));
-        System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/\n \"huh\""));
+    void testUriFromString() throws URISyntaxException {
+        // We test these URLs several times as we perform URLEncode-fixing both when parsing the URL and when
+        // converting it back to a string, we want to ensure there is no changes along the way.
+
+        Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/").getPath());
+        Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/").toString());
+        Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/").toString());
+
+        Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").getPath());
+        Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").toString());
+        Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/#heredoc").toString());
+
+        Assertions.assertEquals("/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").getPath());
+        Assertions.assertEquals("https://www.example.com/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").toString());
+        Assertions.assertEquals("https://www.example.com/trailingslash/", new EdgeUrl("https://www.example.com/trailingslash/").toString());
+
+        Assertions.assertEquals("/%-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").getPath());
+        Assertions.assertEquals("https://www.example.com/%25-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").toString());
+        Assertions.assertEquals("https://www.example.com/%25-sign", new EdgeUrl("https://www.example.com/%-sign").toString());
+
+        Assertions.assertEquals("/%-sign/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com//%-sign/\"-sign").getPath());
+        Assertions.assertEquals("https://www.example.com/%25-sign/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com//%-sign/\"-sign").toString());
+        Assertions.assertEquals("https://www.example.com/%25-sign/%22-sign", new EdgeUrl("https://www.example.com//%-sign/\"-sign").toString());
+
+        Assertions.assertEquals("/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").getPath());
+        Assertions.assertEquals("https://www.example.com/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").toString());
+        Assertions.assertEquals("https://www.example.com/%22-sign", new EdgeUrl("https://www.example.com/%22-sign").toString());
+
+        Assertions.assertEquals("/\n \"huh\"", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").getPath());
+        Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").toString());
+        Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", new EdgeUrl("https://www.example.com/\n \"huh\"").toString());
+
+        Assertions.assertEquals("/wiki/Sámi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").getPath());
+        Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").toString());
+        Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", new EdgeUrl("https://en.wikipedia.org/wiki/Sámi").toString());
+
+        Assertions.assertEquals("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k", new EdgeUrl("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k").toString());
     }

     @Test
     void testParms() throws URISyntaxException {
-        System.out.println(new EdgeUrl("https://search.marginalia.nu/?id=123"));
-        System.out.println(new EdgeUrl("https://search.marginalia.nu/?t=123"));
-        System.out.println(new EdgeUrl("https://search.marginalia.nu/?v=123"));
-        System.out.println(new EdgeUrl("https://search.marginalia.nu/?m=123"));
-        System.out.println(new EdgeUrl("https://search.marginalia.nu/?follow=123"));
+        Assertions.assertEquals("id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").param);
+        Assertions.assertEquals("https://search.marginalia.nu/?id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").toString());
+
+        Assertions.assertEquals("t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").param);
+        Assertions.assertEquals("https://search.marginalia.nu/?t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").toString());
+
+        Assertions.assertEquals("v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").param);
+        Assertions.assertEquals("https://search.marginalia.nu/?v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").toString());
+
+        Assertions.assertEquals("id=1", new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").param);
+        Assertions.assertEquals("https://memex.marginalia.nu/showthread.php?id=1",
+                new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
+
+        Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").param);
+        Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").toString());
+
+        Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").param);
+        Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").toString());
+
+        Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?m=123").param);
+        Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?follow=123").param);
     }
 }
@@ -59,16 +59,13 @@ public class ProcessAdHocTaskHeartbeatImpl implements AutoCloseable, ProcessAdHocTaskHeartbeat {
      */
     @Override
     public void progress(String step, int stepProgress, int stepCount) {
+        int lastProgress = this.progress;
         this.step = step;

-
-        // off by one since we calculate the progress based on the number of steps,
-        // and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
-        // final progress being 80% and not 100%)
-
         this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);

-        logger.info("ProcessTask {} progress: {}%", taskBase, progress);
+        if (this.progress / 10 != lastProgress / 10) {
+            logger.info("ProcessTask {} progress: {}%", taskBase, progress);
+        }
     }

     /** Wrap a collection to provide heartbeat progress updates as it's iterated through */
@@ -57,16 +57,13 @@ public class ServiceAdHocTaskHeartbeatImpl implements AutoCloseable, ServiceAdHocTaskHeartbeat {
      */
     @Override
     public void progress(String step, int stepProgress, int stepCount) {
+        int lastProgress = this.progress;
         this.step = step;

-
-        // off by one since we calculate the progress based on the number of steps,
-        // and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
-        // final progress being 80% and not 100%)
-
         this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);

-        logger.info("ServiceTask {} progress: {}%", taskBase, progress);
+        if (this.progress / 10 != lastProgress / 10) {
+            logger.info("ProcessTask {} progress: {}%", taskBase, progress);
+        }
     }

     public void shutDown() {
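Both heartbeat implementations now apply the same decile gate: a progress line is emitted only when integer division by ten changes, so a task logs at most around ten lines no matter how many steps it has. A tiny self-contained illustration of the gate (hypothetical demo class, not from the repository):

import java.lang.Math;

public class ProgressGateDemo {
    public static void main(String[] args) {
        int lastProgress = 0;
        int stepCount = 137;

        for (int step = 1; step <= stepCount; step++) {
            int progress = (int) Math.round(100. * step / (double) stepCount);

            // Log only when we cross into a new decile (10%, 20%, ...)
            if (progress / 10 != lastProgress / 10) {
                System.out.printf("progress: %d%%%n", progress);
            }
            lastProgress = progress;
        }
    }
}

Running this prints one line per decile rather than 137 lines, which is the whole point of the change: the per-step log spam is replaced by a bounded summary.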
@@ -122,6 +122,11 @@ public class JoobyService {
         // single digit percentage difference since HTML already compresses very well with level = 1.
         options.setCompressionLevel(1);

+        // Set a cap on the number of worker threads, as Jooby's default value does not seem to consider
+        // multi-tenant servers with high thread counts, and spins up an exorbitant number of threads in that
+        // scenario
+        options.setWorkerThreads(Math.min(128, options.getWorkerThreads()));
+
         jooby.setServerOptions(options);
@@ -48,12 +48,13 @@ public class ExecutorExportClient {
         return msgId;
     }

-    public void exportSampleData(int node, FileStorageId fid, int size, String name) {
+    public void exportSampleData(int node, FileStorageId fid, int size, String ctFilter, String name) {
         channelPool.call(ExecutorExportApiBlockingStub::exportSampleData)
                 .forNode(node)
                 .run(RpcExportSampleData.newBuilder()
                         .setFileStorageId(fid.id())
                         .setSize(size)
+                        .setCtFilter(ctFilter)
                         .setName(name)
                         .build());
     }
@@ -100,6 +100,7 @@ message RpcExportSampleData {
     int64 fileStorageId = 1;
     int32 size = 2;
     string name = 3;
+    string ctFilter = 4;
 }
 message RpcDownloadSampleData {
     string sampleSet = 1;
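Adding ctFilter as field 4 is a wire-compatible proto3 change: messages produced by callers that never set it still decode, and the server reads the field at its default. A sketch of that behavior, assuming the standard protobuf-java codegen for the message above (the import of the generated class is omitted):

class CtFilterCompatDemo {
    public static void main(String[] args) {
        // An older-style caller that never sets ctFilter...
        RpcExportSampleData legacy = RpcExportSampleData.newBuilder()
                .setFileStorageId(1L)
                .setSize(1000)
                .setName("sample")
                .build();

        // ...still builds and serializes; proto3 reports the unset
        // string field as the empty string rather than null.
        System.out.println(legacy.getCtFilter().isEmpty()); // prints: true
    }
}

This is presumably why the server-side code can pass request.getCtFilter() through unconditionally: an empty string stands in for "no content-type filter".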
@@ -8,6 +8,7 @@ import nu.marginalia.actor.state.ActorResumeBehavior;
 import nu.marginalia.actor.state.ActorStep;
 import nu.marginalia.actor.state.Resume;
 import nu.marginalia.service.control.ServiceEventLog;
+import nu.marginalia.service.control.ServiceHeartbeat;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorage;
 import nu.marginalia.storage.model.FileStorageId;
@@ -19,6 +20,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.io.*;
+import java.net.HttpURLConnection;
 import java.net.MalformedURLException;
 import java.net.URI;
 import java.net.URL;
@@ -32,6 +34,7 @@ public class DownloadSampleActor extends RecordActorPrototype {

     private final FileStorageService storageService;
     private final ServiceEventLog eventLog;
+    private final ServiceHeartbeat heartbeat;
     private final Logger logger = LoggerFactory.getLogger(getClass());

     @Resume(behavior = ActorResumeBehavior.ERROR)
@@ -66,15 +69,39 @@ public class DownloadSampleActor extends RecordActorPrototype {

                 Files.deleteIfExists(Path.of(tarFileName));

-                try (var is = new BufferedInputStream(new URI(downloadURI).toURL().openStream());
-                     var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
-                    is.transferTo(os);
+                HttpURLConnection urlConnection = (HttpURLConnection) new URI(downloadURI).toURL().openConnection();
+
+                try (var hb = heartbeat.createServiceAdHocTaskHeartbeat("Downloading sample")) {
+                    long size = urlConnection.getContentLengthLong();
+                    byte[] buffer = new byte[8192];
+
+                    try (var is = new BufferedInputStream(urlConnection.getInputStream());
+                         var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
+                        long copiedSize = 0;
+
+                        while (copiedSize < size) {
+                            int read = is.read(buffer);
+
+                            if (read < 0) // We've been promised a file of length 'size'
+                                throw new IOException("Unexpected end of stream");
+
+                            os.write(buffer, 0, read);
+                            copiedSize += read;
+
+                            // Update progress bar
+                            hb.progress(String.format("%d MB", copiedSize / 1024 / 1024), (int) (copiedSize / 1024), (int) (size / 1024));
+                        }
+                    }
+
                 }
                 catch (Exception ex) {
                     eventLog.logEvent(DownloadSampleActor.class, "Error downloading sample");
                     logger.error("Error downloading sample", ex);
                     yield new Error();
                 }
+                finally {
+                    urlConnection.disconnect();
+                }

                 eventLog.logEvent(DownloadSampleActor.class, "Download complete");
                 yield new Extract(fileStorageId, tarFileName);
@@ -170,11 +197,12 @@ public class DownloadSampleActor extends RecordActorPrototype {
     @Inject
     public DownloadSampleActor(Gson gson,
                                FileStorageService storageService,
-                               ServiceEventLog eventLog)
+                               ServiceEventLog eventLog, ServiceHeartbeat heartbeat)
     {
         super(gson);
         this.storageService = storageService;
         this.eventLog = eventLog;
+        this.heartbeat = heartbeat;
     }

 }
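The replacement for InputStream.transferTo is a plain copy loop whose only extra duty is the progress callback; note the counts are passed to the heartbeat in kilobytes so the int parameters don't overflow on multi-gigabyte archives. The same pattern in isolation, with hypothetical names:

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.function.LongConsumer;

final class ProgressCopy {
    // Copy exactly 'size' bytes from is to os, reporting bytes copied so far
    // after each buffer. Throws if the stream ends before 'size' bytes arrive.
    static void copy(InputStream is, OutputStream os, long size, LongConsumer progress)
            throws IOException {
        byte[] buffer = new byte[8192];
        long copied = 0;
        while (copied < size) {
            int read = is.read(buffer);
            if (read < 0)
                throw new IOException("Unexpected end of stream");
            os.write(buffer, 0, read);
            copied += read;
            progress.accept(copied);
        }
    }
}

One caveat the actor inherits from HttpURLConnection: if the server sends no Content-Length, getContentLengthLong() returns -1 and the while condition is never true, so the pattern only works for responses with a known length.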
@@ -26,32 +26,32 @@ public class ExportSampleDataActor extends RecordActorPrototype {
     private final MqOutbox exportTasksOutbox;
     private final Logger logger = LoggerFactory.getLogger(getClass());

-    public record Export(FileStorageId crawlId, int size, String name) implements ActorStep {}
-    public record Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) implements ActorStep {
-        public Run(FileStorageId crawlId, FileStorageId destId, int size, String name) {
-            this(crawlId, destId, size, name, -1);
+    public record Export(FileStorageId crawlId, int size, String ctFilter, String name) implements ActorStep {}
+    public record Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name, long msgId) implements ActorStep {
+        public Run(FileStorageId crawlId, FileStorageId destId, int size, String name, String ctFilter) {
+            this(crawlId, destId, size, name, ctFilter,-1);
         }
     }

     @Override
     public ActorStep transition(ActorStep self) throws Exception {
         return switch(self) {
-            case Export(FileStorageId crawlId, int size, String name) -> {
+            case Export(FileStorageId crawlId, int size, String ctFilter, String name) -> {
                 var storage = storageService.allocateStorage(FileStorageType.EXPORT,
                         "crawl-sample-export",
                         "Crawl Data Sample " + name + "/" + size + " " + LocalDateTime.now()
                 );

                 if (storage == null) yield new Error("Bad storage id");
-                yield new Run(crawlId, storage.id(), size, name);
+                yield new Run(crawlId, storage.id(), size, ctFilter, name);
             }
-            case Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) when msgId < 0 -> {
+            case Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name, long msgId) when msgId < 0 -> {
                 storageService.setFileStorageState(destId, FileStorageState.NEW);

-                long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, size, name));
-                yield new Run(crawlId, destId, size, name, newMsgId);
+                long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, ctFilter, size, name));
+                yield new Run(crawlId, destId, size, ctFilter, name, newMsgId);
             }
-            case Run(_, FileStorageId destId, _, _, long msgId) -> {
+            case Run(_, FileStorageId destId, _, _, _, long msgId) -> {
                 var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);

                 if (rsp.state() != MqMessageState.OK) {
@@ -70,7 +70,7 @@ public class ExportSampleDataActor extends RecordActorPrototype {

     @Override
     public String describe() {
-        return "Export RSS/Atom feeds from crawl data";
+        return "Export sample crawl data";
     }

     @Inject
@@ -49,6 +49,7 @@ public class ExecutorExportGrpcService
                         new ExportSampleDataActor.Export(
                                 FileStorageId.of(request.getFileStorageId()),
                                 request.getSize(),
+                                request.getCtFilter(),
                                 request.getName()
                         )
                 );
@@ -229,13 +229,15 @@ public class FeedFetcherService {
                             .timeout(Duration.ofSeconds(15))
                     ;

-            if (ifModifiedSinceDate != null) {
+            // Set the If-Modified-Since or If-None-Match headers if we have them
+            // though since there are certain idiosyncrasies in server implementations,
+            // we avoid setting both at the same time as that may turn a 304 into a 200.
+            if (ifNoneMatchTag != null) {
+                requestBuilder.header("If-None-Match", ifNoneMatchTag);
+            } else if (ifModifiedSinceDate != null) {
                 requestBuilder.header("If-Modified-Since", ifModifiedSinceDate);
             }

-            if (ifNoneMatchTag != null) {
-                requestBuilder.header("If-None-Match", ifNoneMatchTag);
-            }
-
             HttpRequest getRequest = requestBuilder.build();
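The feed fetcher now sends at most one conditional-request validator: If-None-Match wins when an ETag is known, otherwise If-Modified-Since is used, since some servers answer 200 where a 304 was possible when both are present. A minimal java.net.http sketch of the same policy (the method and variable names are hypothetical):

import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

class ConditionalFetch {
    static HttpResponse<byte[]> fetch(HttpClient client, String url,
                                      String etag, String lastModified)
            throws IOException, InterruptedException {
        HttpRequest.Builder b = HttpRequest.newBuilder(URI.create(url));

        // Prefer the ETag validator; never send both headers at once.
        if (etag != null) {
            b.header("If-None-Match", etag);
        } else if (lastModified != null) {
            b.header("If-Modified-Since", lastModified);
        }

        // A 304 status on the response means the cached copy is still valid.
        return client.send(b.build(), HttpResponse.BodyHandlers.ofByteArray());
    }
}

The same ETag-first ordering appears again below in ContentTags.paint, so both the feed fetcher and the crawler follow one policy.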
@@ -20,7 +20,6 @@ import nu.marginalia.model.crawldata.CrawledDocument;
 import nu.marginalia.model.crawldata.CrawledDomain;
 import nu.marginalia.model.crawldata.SerializableCrawlData;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
-import org.apache.hc.client5.http.cookie.BasicCookieStore;
 import org.junit.jupiter.api.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -247,7 +246,7 @@ public class CrawlingThenConvertingIntegrationTest {
     private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws Exception {
         List<SerializableCrawlData> data = new ArrayList<>();

-        try (var recorder = new WarcRecorder(fileName, new BasicCookieStore());
+        try (var recorder = new WarcRecorder(fileName);
             var db = new DomainStateDb(dbTempFile))
        {
            new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, db, recorder).crawlDomain();
@@ -67,8 +67,6 @@ dependencies {
     testImplementation libs.mockito
     testImplementation libs.wiremock

-
-
     testImplementation project(':code:processes:test-data')
 }
@@ -43,6 +43,7 @@ import java.nio.file.StandardCopyOption;
 import java.security.Security;
 import java.util.*;
 import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;

@@ -66,6 +67,8 @@ public class CrawlerMain extends ProcessMainClass {

     private final Map<String, CrawlTask> pendingCrawlTasks = new ConcurrentHashMap<>();

+    private final LinkedBlockingQueue<CrawlTask> retryQueue = new LinkedBlockingQueue<>();
+
     private final AtomicInteger tasksDone = new AtomicInteger(0);
     private final HttpFetcherImpl fetcher;

@@ -261,28 +264,44 @@ public class CrawlerMain extends ProcessMainClass {
                     if (workLog.isJobFinished(crawlSpec.domain))
                         continue;

-                    var task = new CrawlTask(
-                            crawlSpec,
-                            anchorTagsSource,
-                            outputDir,
-                            warcArchiver,
-                            domainStateDb,
-                            workLog);
+                    var task = new CrawlTask(crawlSpec, anchorTagsSource, outputDir, warcArchiver, domainStateDb, workLog);

                     // Try to run immediately, to avoid unnecessarily keeping the entire work set in RAM
                     if (!trySubmitDeferredTask(task)) {
-                        // Otherwise add to the taskList for deferred execution
+
+                        // Drain the retry queue to the taskList, and try to submit any tasks that are in the retry queue
+                        retryQueue.drainTo(taskList);
+                        taskList.removeIf(this::trySubmitDeferredTask);
+
+                        // Then add this new task to the retry queue
                         taskList.add(task);
                     }
                 }

                 // Schedule viable tasks for execution until list is empty
-                while (!taskList.isEmpty()) {
-                    taskList.removeIf(this::trySubmitDeferredTask);
-
-                    // Add a small pause here to avoid busy looping toward the end of the execution cycle when
-                    // we might have no new viable tasks to run for hours on end
-                    TimeUnit.MILLISECONDS.sleep(50);
+                for (int emptyRuns = 0;emptyRuns < 300;) {
+                    boolean hasTasks = !taskList.isEmpty();
+
+                    // The order of these checks very important to avoid a race condition
+                    // where we miss a task that is put into the retry queue
+                    boolean hasRunningTasks = pool.getActiveCount() > 0;
+                    boolean hasRetryTasks = !retryQueue.isEmpty();
+
+                    if (hasTasks || hasRetryTasks || hasRunningTasks) {
+                        retryQueue.drainTo(taskList);
+
+                        // Try to submit any tasks that are in the retry queue (this will block if the pool is full)
+                        taskList.removeIf(this::trySubmitDeferredTask);
+
+                        // Add a small pause here to avoid busy looping toward the end of the execution cycle when
+                        // we might have no new viable tasks to run for hours on end
+                        TimeUnit.MILLISECONDS.sleep(5);
+                    } else {
+                        // We have no tasks to run, and no tasks in the retry queue
+                        // but we wait a bit to see if any new tasks come in via the retry queue
+                        emptyRuns++;
+                        TimeUnit.SECONDS.sleep(1);
+                    }
                 }

                 logger.info("Shutting down the pool, waiting for tasks to complete...");
@@ -414,7 +433,7 @@ public class CrawlerMain extends ProcessMainClass {
         /** Best effort indicator whether we could start this now without getting stuck in
          * DomainLocks purgatory */
         public boolean canRun() {
-            return domainLocks.canLock(new EdgeDomain(domain));
+            return domainLocks.isLockableHint(new EdgeDomain(domain));
         }

         @Override
@@ -425,66 +444,76 @@ public class CrawlerMain extends ProcessMainClass {
                 return;
             }

-            Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
-            Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
-            Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
-
-            // Move the WARC file to a temp file if it exists, so we can resume the crawl using the old data
-            // while writing to the same file name as before
-            if (Files.exists(newWarcFile)) {
-                Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING);
-            }
-            else {
-                Files.deleteIfExists(tempFile);
-            }
-
-            try (var warcRecorder = new WarcRecorder(newWarcFile, fetcher); // write to a temp file for now
-                 var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
-                 CrawlDataReference reference = getReference()
-            )
-            {
-                // Resume the crawl if it was aborted
-                if (Files.exists(tempFile)) {
-                    retriever.syncAbortedRun(tempFile);
-                    Files.delete(tempFile);
-                }
-
-                DomainLinks domainLinks = anchorTagsSource.getAnchorTags(domain);
-
-                int size;
-                try (var lock = domainLocks.lockDomain(new EdgeDomain(domain))) {
-                    size = retriever.crawlDomain(domainLinks, reference);
-                }
-
-                // Delete the reference crawl data if it's not the same as the new one
-                // (mostly a case when migrating from legacy->warc)
-                reference.delete();
-
-                // Convert the WARC file to Parquet
-                SlopCrawlDataRecord
-                        .convertWarc(domain, userAgent, newWarcFile, slopFile);
-
-                // Optionally archive the WARC file if full retention is enabled,
-                // otherwise delete it:
-                warcArchiver.consumeWarc(newWarcFile, domain);
-
-                // Mark the domain as finished in the work log
-                workLog.setJobToFinished(domain, slopFile.toString(), size);
-
-                // Update the progress bar
-                heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks);
-
-                logger.info("Fetched {}", domain);
-            } catch (Exception e) {
-                logger.error("Error fetching domain " + domain, e);
-            }
-            finally {
-                // We don't need to double-count these; it's also kept in the workLog
+            Optional<DomainLocks.DomainLock> lock = domainLocks.tryLockDomain(new EdgeDomain(domain));
+            // We don't have a lock, so we can't run this task
+            // we return to avoid blocking the pool for too long
+            if (lock.isEmpty()) {
                 pendingCrawlTasks.remove(domain);
                 Thread.currentThread().setName("[idle]");
+                retryQueue.put(this);
+                return;
+            }
+            DomainLocks.DomainLock domainLock = lock.get();

-                Files.deleteIfExists(newWarcFile);
-                Files.deleteIfExists(tempFile);
+            try (domainLock) {
+                Thread.currentThread().setName("crawling:" + domain);
+
+                Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
+                Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
+                Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
+
+                // Move the WARC file to a temp file if it exists, so we can resume the crawl using the old data
+                // while writing to the same file name as before
+                if (Files.exists(newWarcFile)) {
+                    Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING);
+                }
+                else {
+                    Files.deleteIfExists(tempFile);
+                }
+
+                try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
+                     var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
+                     CrawlDataReference reference = getReference())
+                {
+                    // Resume the crawl if it was aborted
+                    if (Files.exists(tempFile)) {
+                        retriever.syncAbortedRun(tempFile);
+                        Files.delete(tempFile);
+                    }
+
+                    DomainLinks domainLinks = anchorTagsSource.getAnchorTags(domain);
+
+                    int size = retriever.crawlDomain(domainLinks, reference);
+
+                    // Delete the reference crawl data if it's not the same as the new one
+                    // (mostly a case when migrating from legacy->warc)
+                    reference.delete();
+
+                    // Convert the WARC file to Slop
+                    SlopCrawlDataRecord
+                            .convertWarc(domain, userAgent, newWarcFile, slopFile);
+
+                    // Optionally archive the WARC file if full retention is enabled,
+                    // otherwise delete it:
+                    warcArchiver.consumeWarc(newWarcFile, domain);
+
+                    // Mark the domain as finished in the work log
+                    workLog.setJobToFinished(domain, slopFile.toString(), size);
+
+                    // Update the progress bar
+                    heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks);
+
+                    logger.info("Fetched {}", domain);
+                } catch (Exception e) {
+                    logger.error("Error fetching domain " + domain, e);
+                }
+                finally {
+                    // We don't need to double-count these; it's also kept in the workLog
+                    pendingCrawlTasks.remove(domain);
+                    Thread.currentThread().setName("[idle]");
+
+                    Files.deleteIfExists(newWarcFile);
+                    Files.deleteIfExists(tempFile);
+                }
             }
         }
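The crawler no longer parks a pool thread on lockDomain: a task that cannot take its domain lock immediately re-queues itself via retryQueue, and the scheduler loop drains that queue back into the task list. The core of the pattern, reduced to a sketch with a plain Semaphore standing in for Marginalia's DomainLocks (the class and method names here are hypothetical):

import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;

class DomainLockDemo {
    final Semaphore lock = new Semaphore(1);
    final LinkedBlockingQueue<Runnable> retryQueue = new LinkedBlockingQueue<>();

    // Non-blocking acquire: either the task gets the permit now, or it
    // goes onto the retry queue for the scheduler to resubmit later,
    // freeing the worker thread for a domain that is actually lockable.
    void runOrRetry(Runnable task) throws InterruptedException {
        if (!lock.tryAcquire()) {
            retryQueue.put(task);
            return;
        }
        try {
            task.run();
        } finally {
            lock.release();
        }
    }
}

This also explains the comment about check ordering in the scheduler loop above: the running-task count must be read before the retry queue is checked, or a task that re-queues itself between the two reads could be missed and the loop could exit early.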
@@ -1,6 +1,6 @@
 package nu.marginalia.crawl.fetcher;

-import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
+import org.apache.hc.client5.http.classic.methods.HttpGet;

 /** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */
 public record ContentTags(String etag, String lastMod) {
@@ -17,14 +17,16 @@ public record ContentTags(String etag, String lastMod) {
     }

     /** Paints the tags onto the request builder. */
-    public void paint(ClassicRequestBuilder getBuilder) {
+    public void paint(HttpGet request) {
+
+        // Paint the ETag header if present,
+        // otherwise paint the Last-Modified header
+        // (but not both at the same time due to some servers not liking it)

         if (etag != null) {
-            getBuilder.addHeader("If-None-Match", etag);
-        }
-
-        if (lastMod != null) {
-            getBuilder.addHeader("If-Modified-Since", lastMod);
+            request.addHeader("If-None-Match", etag);
+        } else if (lastMod != null) {
+            request.addHeader("If-Modified-Since", lastMod);
         }
     }
 }
@@ -1,34 +0,0 @@
-package nu.marginalia.crawl.fetcher;
-
-import java.io.IOException;
-import java.net.CookieHandler;
-import java.net.URI;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.ConcurrentHashMap;
-
-public class Cookies extends CookieHandler {
-    final ThreadLocal<ConcurrentHashMap<String, List<String>>> cookieJar = ThreadLocal.withInitial(ConcurrentHashMap::new);
-
-    public void clear() {
-        cookieJar.get().clear();
-    }
-
-    public boolean hasCookies() {
-        return !cookieJar.get().isEmpty();
-    }
-
-    public List<String> getCookies() {
-        return cookieJar.get().values().stream().flatMap(List::stream).toList();
-    }
-
-    @Override
-    public Map<String, List<String>> get(URI uri, Map<String, List<String>> requestHeaders) throws IOException {
-        return cookieJar.get();
-    }
-
-    @Override
-    public void put(URI uri, Map<String, List<String>> responseHeaders) throws IOException {
-        cookieJar.get().putAll(responseHeaders);
-    }
-}
@@ -0,0 +1,56 @@
+package nu.marginalia.crawl.fetcher;
+
+import org.apache.hc.client5.http.classic.methods.HttpUriRequestBase;
+import org.apache.hc.core5.http.ClassicHttpRequest;
+import org.apache.hc.core5.http.HttpResponse;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.StringJoiner;
+
+public class DomainCookies {
+    private final Map<String, String> cookies = new HashMap<>();
+
+    public boolean hasCookies() {
+        return !cookies.isEmpty();
+    }
+
+    public void updateCookieStore(HttpResponse response) {
+        for (var header : response.getHeaders()) {
+            if (header.getName().equalsIgnoreCase("Set-Cookie")) {
+                parseCookieHeader(header.getValue());
+            }
+        }
+    }
+
+    private void parseCookieHeader(String value) {
+        // Parse the Set-Cookie header value and extract the cookies
+
+        String[] parts = value.split(";");
+        String cookie = parts[0].trim();
+
+        if (cookie.contains("=")) {
+            String[] cookieParts = cookie.split("=");
+            String name = cookieParts[0].trim();
+            String val = cookieParts[1].trim();
+            cookies.put(name, val);
+        }
+    }
+
+    public void paintRequest(HttpUriRequestBase request) {
+        request.addHeader("Cookie", createCookieHeader());
+    }
+
+    public void paintRequest(ClassicHttpRequest request) {
+        request.addHeader("Cookie", createCookieHeader());
+    }
+
+    private String createCookieHeader() {
+        StringJoiner sj = new StringJoiner("; ");
+        for (var cookie : cookies.entrySet()) {
+            sj.add(cookie.getKey() + "=" + cookie.getValue());
+        }
+        return sj.toString();
+    }
+
+}
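DomainCookies is deliberately minimal: only the name=value pair before the first ';' of each Set-Cookie header is kept, so attributes such as Path, Expires, and Secure are ignored, and the whole store is scoped to a single domain's crawl. A usage sketch against the class above, assuming Apache HttpClient 5 response and request objects (hypothetical helper, not from the repository):

import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.core5.http.HttpResponse;

class DomainCookiesUsage {
    // Carry cookies from one response over to the next request on the same domain.
    static HttpGet nextRequest(DomainCookies cookies, HttpResponse previous, String url) {
        cookies.updateCookieStore(previous);   // remember any Set-Cookie headers

        HttpGet request = new HttpGet(url);
        if (cookies.hasCookies()) {
            cookies.paintRequest(request);     // emit a single combined Cookie header
        }
        return request;
    }
}

Compared with the deleted ThreadLocal-based Cookies class, one instance per domain makes the lifetime explicit and avoids sharing a mutable jar across the crawler's worker threads.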
@@ -23,6 +23,7 @@ public interface HttpFetcher extends AutoCloseable {

     HttpFetchResult fetchContent(EdgeUrl url,
                                  WarcRecorder recorder,
+                                 DomainCookies cookies,
                                  CrawlDelayTimer timer,
                                  ContentTags tags,
                                  ProbeType probeType);
@@ -17,6 +17,7 @@ import nu.marginalia.model.crawldata.CrawlerDomainStatus;
|
||||
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
|
||||
import org.apache.hc.client5.http.HttpRequestRetryStrategy;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||
import org.apache.hc.client5.http.config.ConnectionConfig;
|
||||
import org.apache.hc.client5.http.config.RequestConfig;
|
||||
import org.apache.hc.client5.http.cookie.BasicCookieStore;
|
||||
@@ -34,6 +35,7 @@ import org.apache.hc.core5.http.io.entity.EntityUtils;
|
||||
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
|
||||
import org.apache.hc.core5.http.message.MessageSupport;
|
||||
import org.apache.hc.core5.http.protocol.HttpContext;
|
||||
import org.apache.hc.core5.pool.PoolStats;
|
||||
import org.apache.hc.core5.util.TimeValue;
|
||||
import org.apache.hc.core5.util.Timeout;
|
||||
import org.jsoup.Jsoup;
|
||||
@@ -45,11 +47,14 @@ import org.slf4j.Marker;
|
||||
import org.slf4j.MarkerFactory;
|
||||
|
||||
import javax.net.ssl.SSLContext;
|
||||
import javax.net.ssl.SSLException;
|
||||
import java.io.IOException;
|
||||
import java.net.SocketTimeoutException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.UnknownHostException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.Semaphore;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
@@ -76,14 +81,20 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
|
||||
}
|
||||
|
||||
private final CloseableHttpClient client;
|
||||
private PoolingHttpClientConnectionManager connectionManager;
|
||||
|
||||
public PoolStats getPoolStats() {
|
||||
return connectionManager.getTotalStats();
|
||||
}
|
||||
|
||||
private CloseableHttpClient createClient() throws NoSuchAlgorithmException {
|
||||
final ConnectionConfig connectionConfig = ConnectionConfig.custom()
|
||||
.setSocketTimeout(10, TimeUnit.SECONDS)
|
||||
.setConnectTimeout(30, TimeUnit.SECONDS)
|
||||
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
|
||||
.build();
|
||||
|
||||
final PoolingHttpClientConnectionManager connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
|
||||
connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
|
||||
.setMaxConnPerRoute(2)
|
||||
.setMaxConnTotal(5000)
|
||||
.setDefaultConnectionConfig(connectionConfig)
|
||||
@@ -91,15 +102,27 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
|
||||
.build();
|
||||
|
||||
connectionManager.setDefaultSocketConfig(SocketConfig.custom()
|
||||
.setSoLinger(TimeValue.ofSeconds(15))
|
||||
.setSoLinger(TimeValue.ofSeconds(-1))
|
||||
.setSoTimeout(Timeout.ofSeconds(10))
|
||||
.build()
|
||||
);
|
||||
|
||||
Thread.ofPlatform().daemon(true).start(() -> {
|
||||
try {
|
||||
for (;;) {
|
||||
TimeUnit.SECONDS.sleep(15);
|
||||
logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
|
||||
}
|
||||
}
|
||||
catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
});
|
||||
|
||||
final RequestConfig defaultRequestConfig = RequestConfig.custom()
|
||||
.setCookieSpec(StandardCookieSpec.RELAXED)
|
||||
.setResponseTimeout(10, TimeUnit.SECONDS)
|
||||
.setConnectionRequestTimeout(8, TimeUnit.SECONDS)
|
||||
.setConnectionRequestTimeout(5, TimeUnit.MINUTES)
|
||||
.build();
|
||||
|
||||
return HttpClients.custom()
|
||||
@@ -287,6 +310,7 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
|
||||
* recorded in the WARC file on failure.
|
||||
*/
|
||||
public ContentTypeProbeResult probeContentType(EdgeUrl url,
|
||||
DomainCookies cookies,
|
||||
CrawlDelayTimer timer,
|
||||
ContentTags tags) {
|
||||
if (!tags.isEmpty() || !contentTypeLogic.isUrlLikeBinary(url)) {
|
||||
@@ -299,9 +323,11 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
|
||||
.addHeader("Accept-Encoding", "gzip")
|
||||
.build();
|
||||
|
||||
var result = SendLock.wrapSend(client, head, (rsp) -> {
|
||||
EntityUtils.consume(rsp.getEntity());
|
||||
cookies.paintRequest(head);
|
||||
|
||||
return SendLock.wrapSend(client, head, (rsp) -> {
|
||||
cookies.updateCookieStore(rsp);
|
||||
EntityUtils.consume(rsp.getEntity());
|
||||
int statusCode = rsp.getCode();
|
||||
|
||||
// Handle redirects
|
||||
@@ -339,8 +365,6 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
|
||||
return new ContentTypeProbeResult.BadContentType(contentType, statusCode);
|
||||
}
|
||||
});
|
||||
|
||||
return result;
|
||||
}
|
||||
catch (SocketTimeoutException ex) {
|
||||
|
||||
@@ -362,6 +386,7 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
|
||||
@Override
|
||||
public HttpFetchResult fetchContent(EdgeUrl url,
|
||||
WarcRecorder warcRecorder,
|
||||
DomainCookies cookies,
|
||||
CrawlDelayTimer timer,
|
||||
ContentTags contentTags,
|
||||
ProbeType probeType)
|
||||
@@ -369,26 +394,32 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
|
||||
try {
|
||||
if (probeType == HttpFetcher.ProbeType.FULL) {
|
||||
try {
|
||||
var probeResult = probeContentType(url, timer, contentTags);
|
||||
logger.info(crawlerAuditMarker, "Probe result {} for {}", probeResult.getClass().getSimpleName(), url);
|
||||
var probeResult = probeContentType(url, cookies, timer, contentTags);
|
||||
|
||||
switch (probeResult) {
|
||||
case HttpFetcher.ContentTypeProbeResult.NoOp():
|
||||
break; //
|
||||
case HttpFetcher.ContentTypeProbeResult.Ok(EdgeUrl resolvedUrl):
|
||||
logger.info(crawlerAuditMarker, "Probe result OK for {}", url);
|
||||
url = resolvedUrl; // If we were redirected while probing, use the final URL for fetching
|
||||
break;
|
||||
case ContentTypeProbeResult.BadContentType badContentType:
|
||||
warcRecorder.flagAsFailedContentTypeProbe(url, badContentType.contentType(), badContentType.statusCode());
|
||||
logger.info(crawlerAuditMarker, "Probe result Bad ContenType ({}) for {}", badContentType.contentType(), url);
|
||||
return new HttpFetchResult.ResultNone();
|
||||
case ContentTypeProbeResult.BadContentType.Timeout(Exception ex):
|
||||
logger.info(crawlerAuditMarker, "Probe result Timeout for {}", url);
|
||||
warcRecorder.flagAsTimeout(url);
|
||||
return new HttpFetchResult.ResultException(ex);
|
||||
case ContentTypeProbeResult.Exception(Exception ex):
|
||||
logger.info(crawlerAuditMarker, "Probe result Exception({}) for {}", ex.getClass().getSimpleName(), url);
|
||||
warcRecorder.flagAsError(url, ex);
|
||||
return new HttpFetchResult.ResultException(ex);
|
||||
case ContentTypeProbeResult.HttpError httpError:
|
||||
logger.info(crawlerAuditMarker, "Probe result HTTP Error ({}) for {}", httpError.statusCode(), url);
|
||||
return new HttpFetchResult.ResultException(new HttpException("HTTP status code " + httpError.statusCode() + ": " + httpError.message()));
|
||||
case ContentTypeProbeResult.Redirect redirect:
|
||||
logger.info(crawlerAuditMarker, "Probe result redirect for {} -> {}", url, redirect.location());
|
||||
return new HttpFetchResult.ResultRedirect(redirect.location());
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
@@ -398,36 +429,41 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
|
||||
|
||||
}
|
||||
|
||||
ClassicRequestBuilder getBuilder = ClassicRequestBuilder.get(url.asURI())
|
||||
.addHeader("User-Agent", userAgentString)
|
||||
.addHeader("Accept-Encoding", "gzip")
|
||||
.addHeader("Accept-Language", "en,*;q=0.5")
|
||||
.addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8");
|
||||
HttpGet request = new HttpGet(url.asURI());
|
||||
request.addHeader("User-Agent", userAgentString);
|
||||
request.addHeader("Accept-Encoding", "gzip");
|
||||
request.addHeader("Accept-Language", "en,*;q=0.5");
|
||||
request.addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8");
|
||||
|
||||
contentTags.paint(getBuilder);
|
||||
contentTags.paint(request);
|
||||
|
||||
try (var sl = new SendLock()) {
|
||||
HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build());
|
||||
Instant start = Instant.now();
|
||||
HttpFetchResult result = warcRecorder.fetch(client, cookies, request);
|
||||
|
||||
Duration fetchDuration = Duration.between(start, Instant.now());
|
||||
|
||||
if (result instanceof HttpFetchResult.ResultOk ok) {
|
||||
if (ok.statusCode() == 304) {
|
||||
return new HttpFetchResult.Result304Raw();
|
||||
result = new HttpFetchResult.Result304Raw();
|
||||
}
|
||||
}
|
||||
|
||||
switch (result) {
|
||||
case HttpFetchResult.ResultOk ok -> logger.info(crawlerAuditMarker, "Fetch result OK {} for {}", ok.statusCode(), url);
|
||||
case HttpFetchResult.ResultOk ok -> logger.info(crawlerAuditMarker, "Fetch result OK {} for {} ({} ms)", ok.statusCode(), url, fetchDuration.toMillis());
|
||||
case HttpFetchResult.ResultRedirect redirect -> logger.info(crawlerAuditMarker, "Fetch result redirect: {} for {}", redirect.url(), url);
|
||||
case HttpFetchResult.ResultNone none -> logger.info(crawlerAuditMarker, "Fetch result none for {}", url);
|
||||
case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception: {} for {}", ex.getClass().getSimpleName(), url);
|
||||
case HttpFetchResult.ResultNone none -> logger.info(crawlerAuditMarker, "Fetch result none for {}", url);
|
||||
case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception for {}", url, ex.ex());
|
||||
case HttpFetchResult.Result304Raw raw -> logger.info(crawlerAuditMarker, "Fetch result: 304 Raw for {}", url);
|
||||
case HttpFetchResult.Result304ReplacedWithReference ref -> logger.info(crawlerAuditMarker, "Fetch result: 304 With reference for {}", url);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
ex.printStackTrace();
|
||||
logger.error(crawlerAuditMarker, "Fetch result exception for {}", url, ex);
|
||||
|
||||
return new HttpFetchResult.ResultException(ex);
|
||||
}
|
||||
|
||||
@@ -494,56 +530,61 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
}


private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
ClassicHttpRequest getRequest = ClassicRequestBuilder.get(sitemapUrl.asURI())
.addHeader("User-Agent", userAgentString)
.addHeader("Accept-Encoding", "gzip")
.addHeader("Accept", "text/*, */*;q=0.9")
.addHeader("User-Agent", userAgentString)
.build();
private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException {
HttpGet getRequest = new HttpGet(sitemapUrl.asURI());

getRequest.addHeader("User-Agent", userAgentString);
getRequest.addHeader("Accept-Encoding", "gzip");
getRequest.addHeader("Accept", "text/*, */*;q=0.9");
getRequest.addHeader("User-Agent", userAgentString);

try (var sl = new SendLock()) {
return client.execute(getRequest, response -> {
if (response.getCode() != 200) {
return new SitemapResult.SitemapError();
try {
if (response.getCode() != 200) {
return new SitemapResult.SitemapError();
}

Document parsedSitemap = Jsoup.parse(
EntityUtils.toString(response.getEntity()),
sitemapUrl.toString(),
Parser.xmlParser()
);

if (parsedSitemap.childrenSize() == 0) {
return new SitemapResult.SitemapError();
}

String rootTagName = parsedSitemap.child(0).tagName();

return switch (rootTagName.toLowerCase()) {
case "sitemapindex" -> {
List<String> references = new ArrayList<>();
for (var locTag : parsedSitemap.getElementsByTag("loc")) {
references.add(locTag.text().trim());
}
yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
}
case "urlset" -> {
List<String> urls = new ArrayList<>();
for (var locTag : parsedSitemap.select("url > loc")) {
urls.add(locTag.text().trim());
}
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
}
case "rss", "atom" -> {
List<String> urls = new ArrayList<>();
for (var locTag : parsedSitemap.select("link, url")) {
urls.add(locTag.text().trim());
}
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
}
default -> new SitemapResult.SitemapError();
};
}

Document parsedSitemap = Jsoup.parse(
EntityUtils.toString(response.getEntity()),
sitemapUrl.toString(),
Parser.xmlParser()
);

if (parsedSitemap.childrenSize() == 0) {
return new SitemapResult.SitemapError();
finally {
EntityUtils.consume(response.getEntity());
}

String rootTagName = parsedSitemap.child(0).tagName();

return switch (rootTagName.toLowerCase()) {
case "sitemapindex" -> {
List<String> references = new ArrayList<>();
for (var locTag : parsedSitemap.getElementsByTag("loc")) {
references.add(locTag.text().trim());
}
yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
}
case "urlset" -> {
List<String> urls = new ArrayList<>();
for (var locTag : parsedSitemap.select("url > loc")) {
urls.add(locTag.text().trim());
}
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
}
case "rss", "atom" -> {
List<String> urls = new ArrayList<>();
for (var locTag : parsedSitemap.select("link, url")) {
urls.add(locTag.text().trim());
}
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
}
default -> new SitemapResult.SitemapError();
};
});
}
catch (Exception ex) {
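The sitemap dispatch above keys off the XML root element (sitemapindex, urlset, rss/atom). A standalone sketch of the same approach with Jsoup's XML parser, using an inline sample document rather than a fetched one:

    import org.jsoup.Jsoup;
    import org.jsoup.parser.Parser;

    class SitemapSketch {
        public static void main(String[] args) {
            String xml = """
                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                      <url><loc>https://example.com/a</loc></url>
                      <url><loc>https://example.com/b</loc></url>
                    </urlset>
                    """;
            // Parser.xmlParser() keeps the document as-is rather than coercing
            // it into an HTML <html><body> skeleton
            var doc = Jsoup.parse(xml, "https://example.com/sitemap.xml", Parser.xmlParser());
            switch (doc.child(0).tagName().toLowerCase()) {
                case "sitemapindex" -> doc.getElementsByTag("loc")
                        .forEach(loc -> System.out.println("nested sitemap: " + loc.text().trim()));
                case "urlset" -> doc.select("url > loc")
                        .forEach(loc -> System.out.println("url: " + loc.text().trim()));
                default -> System.out.println("unrecognized sitemap root");
            }
        }
    }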
@@ -574,13 +615,12 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
private Optional<SimpleRobotRules> fetchAndParseRobotsTxt(EdgeUrl url, WarcRecorder recorder) {
try (var sl = new SendLock()) {

ClassicHttpRequest request = ClassicRequestBuilder.get(url.asURI())
.addHeader("User-Agent", userAgentString)
.addHeader("Accept-Encoding", "gzip")
.addHeader("Accept", "text/*, */*;q=0.9")
.build();
HttpGet request = new HttpGet(url.asURI());
request.addHeader("User-Agent", userAgentString);
request.addHeader("Accept-Encoding", "gzip");
request.addHeader("Accept", "text/*, */*;q=0.9");

HttpFetchResult result = recorder.fetch(client, request);
HttpFetchResult result = recorder.fetch(client, new DomainCookies(), request);

return DocumentBodyExtractor.asBytes(result).mapOpt((contentType, body) ->
robotsParser.parseContent(url.toString(),
@@ -596,18 +636,19 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {

@Override
public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
if (exception instanceof SocketTimeoutException ex) {
return false;
}

return executionCount < 3;
return switch (exception) {
case SocketTimeoutException ste -> false;
case SSLException ssle -> false;
case UnknownHostException uhe -> false;
default -> executionCount <= 3;
};
}

@Override
public boolean retryRequest(HttpResponse response, int executionCount, HttpContext context) {
return switch (response.getCode()) {
case 500, 503 -> executionCount < 2;
case 429 -> executionCount < 3;
case 500, 503 -> executionCount <= 2;
case 429 -> executionCount <= 3;
default -> false;
};
}

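HttpFetcherImpl implements Apache HttpClient 5's HttpRequestRetryStrategy interface directly. For reference, a sketch of how such a strategy is plugged into a client; the retry limits here are illustrative, not the project's configuration:

    import java.io.IOException;
    import java.net.SocketTimeoutException;

    import org.apache.hc.client5.http.HttpRequestRetryStrategy;
    import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
    import org.apache.hc.client5.http.impl.classic.HttpClients;
    import org.apache.hc.core5.http.HttpRequest;
    import org.apache.hc.core5.http.HttpResponse;
    import org.apache.hc.core5.http.protocol.HttpContext;
    import org.apache.hc.core5.util.TimeValue;

    class RetrySketch {
        public static void main(String[] args) {
            HttpRequestRetryStrategy strategy = new HttpRequestRetryStrategy() {
                @Override
                public boolean retryRequest(HttpRequest request, IOException exception, int execCount, HttpContext context) {
                    // A timed-out peer is unlikely to answer a prompt retry; other I/O errors get a few attempts
                    return !(exception instanceof SocketTimeoutException) && execCount <= 3;
                }

                @Override
                public boolean retryRequest(HttpResponse response, int execCount, HttpContext context) {
                    return response.getCode() == 429 && execCount <= 3;
                }

                @Override
                public TimeValue getRetryInterval(HttpResponse response, int execCount, HttpContext context) {
                    return TimeValue.ofSeconds(1);
                }
            };

            // The strategy is consulted automatically for every request made by this client
            CloseableHttpClient client = HttpClients.custom()
                    .setRetryStrategy(strategy)
                    .build();
        }
    }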
@@ -2,6 +2,7 @@ package nu.marginalia.crawl.fetcher.warc;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.BOMInputStream;
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.core5.http.ClassicHttpResponse;
import org.apache.hc.core5.http.Header;
import org.netpreserve.jwarc.WarcTruncationReason;
@@ -43,7 +44,9 @@ public abstract class WarcInputBuffer implements AutoCloseable {
* and suppressed from the headers.
* If an error occurs, a buffer will be created with no content and an error status.
*/
static WarcInputBuffer forResponse(ClassicHttpResponse response, Duration timeLimit) throws IOException {
static WarcInputBuffer forResponse(ClassicHttpResponse response,
HttpGet request,
Duration timeLimit) throws IOException {
if (response == null)
return new ErrorBuffer();

@@ -54,16 +57,47 @@ public abstract class WarcInputBuffer implements AutoCloseable {
return new ErrorBuffer();
}

InputStream is = entity.getContent();
long length = entity.getContentLength();
Instant start = Instant.now();
InputStream is = null;
try {
is = entity.getContent();
long length = entity.getContentLength();

try (response) {
if (length > 0 && length < 8192) {
// If the content is small and not compressed, we can just read it into memory
return new MemoryBuffer(response.getHeaders(), timeLimit, is, (int) length);
return new MemoryBuffer(response.getHeaders(), request, timeLimit, is, (int) length);
} else {
// Otherwise, we unpack it into a file and read it from there
return new FileBuffer(response.getHeaders(), timeLimit, is);
return new FileBuffer(response.getHeaders(), request, timeLimit, is);
}
}
finally {
// We're required to consume the stream to avoid leaking connections,
// but we also don't want to get stuck on slow or malicious connections
// forever, so we set a time limit on this phase and call abort() if it's exceeded.
try {
while (is != null) {
// Consume some data
if (is.skip(65536) == 0) {
// Note that skip may return 0 if the stream is empty
// or for other unspecified reasons, so we need to check
// with read() as well to determine if the stream is done
if (is.read() == -1)
is = null;
}
// Check if the time limit has been exceeded
else if (Duration.between(start, Instant.now()).compareTo(timeLimit) > 0) {
request.abort();
is = null;
}
}
}
catch (IOException e) {
// Ignore the exception
}
finally {
// Close the input stream
IOUtils.closeQuietly(is);
}
}

@@ -71,7 +105,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
}

/** Copy an input stream to an output stream, with a maximum size and time limit */
protected void copy(InputStream is, OutputStream os, Duration timeLimit) {
protected void copy(InputStream is, HttpGet request, OutputStream os, Duration timeLimit) {
Instant start = Instant.now();
Instant timeout = start.plus(timeLimit);
long size = 0;
@@ -86,6 +120,11 @@ public abstract class WarcInputBuffer implements AutoCloseable {
Duration remaining = Duration.between(Instant.now(), timeout);
if (remaining.isNegative()) {
truncationReason = WarcTruncationReason.TIME;
// Abort the request if the time limit is exceeded
// so we don't keep the connection open forever or are forced to consume
// the stream to the end

request.abort();
break;
}

@@ -104,6 +143,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
}
else if (truncationReason != WarcTruncationReason.LENGTH) {
truncationReason = WarcTruncationReason.LENGTH;
break;
}

} catch (IOException e) {
@@ -111,13 +151,6 @@ public abstract class WarcInputBuffer implements AutoCloseable {
}
}

// Try to close the connection as long as we haven't timed out.
// As per Apache HttpClient's semantics, this will reset the connection
// and close the stream if we have timed out.

if (truncationReason != WarcTruncationReason.TIME) {
IOUtils.closeQuietly(is);
}
}

/** Takes a Content-Range header and checks if it is complete.
@@ -218,7 +251,7 @@ class ErrorBuffer extends WarcInputBuffer {
/** Buffer for when we have the response in memory */
class MemoryBuffer extends WarcInputBuffer {
byte[] data;
public MemoryBuffer(Header[] headers, Duration timeLimit, InputStream responseStream, int size) {
public MemoryBuffer(Header[] headers, HttpGet request, Duration timeLimit, InputStream responseStream, int size) {
super(suppressContentEncoding(headers));

if (!isRangeComplete(headers)) {
@@ -229,7 +262,7 @@ class MemoryBuffer extends WarcInputBuffer {

var outputStream = new ByteArrayOutputStream(size);

copy(responseStream, outputStream, timeLimit);
copy(responseStream, request, outputStream, timeLimit);

data = outputStream.toByteArray();
}
@@ -253,7 +286,7 @@ class MemoryBuffer extends WarcInputBuffer {
class FileBuffer extends WarcInputBuffer {
private final Path tempFile;

public FileBuffer(Header[] headers, Duration timeLimit, InputStream responseStream) throws IOException {
public FileBuffer(Header[] headers, HttpGet request, Duration timeLimit, InputStream responseStream) throws IOException {
super(suppressContentEncoding(headers));

if (!isRangeComplete(headers)) {
@@ -265,7 +298,7 @@ class FileBuffer extends WarcInputBuffer {
this.tempFile = Files.createTempFile("rsp", ".html");

try (var out = Files.newOutputStream(tempFile)) {
copy(responseStream, out, timeLimit);
copy(responseStream, request, out, timeLimit);
}
catch (Exception ex) {
truncationReason = WarcTruncationReason.UNSPECIFIED;

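The consume-with-deadline logic above guards against two separate hazards: connection leaks (a pooled connection can only be reused once its stream is drained) and hostile or glacial servers (so draining must be bounded). The same idea condensed into a standalone helper:

    import java.io.IOException;
    import java.io.InputStream;
    import java.time.Duration;
    import java.time.Instant;

    class DrainSketch {
        /** Consumes the remainder of a stream, giving up once the deadline passes.
         *  Returns true if EOF was reached, false if the caller should abort the request. */
        static boolean drain(InputStream is, Duration timeLimit) throws IOException {
            Instant deadline = Instant.now().plus(timeLimit);
            while (Instant.now().isBefore(deadline)) {
                if (is.skip(65536) == 0) {
                    // skip() returning 0 is not proof of EOF; only read() settles it
                    if (is.read() == -1)
                        return true;
                }
            }
            return false;
        }
    }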
@@ -1,6 +1,7 @@
package nu.marginalia.crawl.fetcher.warc;

import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.DomainCookies;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.link_parser.LinkParser;
@@ -8,9 +9,7 @@ import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult;
import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.cookie.BasicCookieStore;
import org.apache.hc.client5.http.cookie.CookieStore;
import org.apache.hc.core5.http.ClassicHttpRequest;
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.core5.http.NameValuePair;
import org.jetbrains.annotations.Nullable;
import org.netpreserve.jwarc.*;
@@ -42,7 +41,7 @@ public class WarcRecorder implements AutoCloseable {
static final int MAX_TIME = 30_000;

/** Maximum (decompressed) size we'll save */
static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 10 * 1024 * 1024);
static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 32 * 1024 * 1024);

private final WarcWriter writer;
private final Path warcFile;
@@ -53,23 +52,15 @@ public class WarcRecorder implements AutoCloseable {
// Affix a version string in case we need to change the format in the future
// in some way
private final String warcRecorderVersion = "1.0";
private final CookieStore cookies;
private final LinkParser linkParser = new LinkParser();
/**
* Create a new WarcRecorder that will write to the given file
*
* @param warcFile The file to write to
*/
public WarcRecorder(Path warcFile, HttpFetcherImpl fetcher) throws IOException {
public WarcRecorder(Path warcFile) throws IOException {
this.warcFile = warcFile;
this.writer = new WarcWriter(warcFile);
this.cookies = fetcher.getCookies();
}

public WarcRecorder(Path warcFile, CookieStore cookies) throws IOException {
this.warcFile = warcFile;
this.writer = new WarcWriter(warcFile);
this.cookies = cookies;
}

/**
@@ -79,24 +70,21 @@ public class WarcRecorder implements AutoCloseable {
public WarcRecorder() throws IOException {
this.warcFile = Files.createTempFile("warc", ".warc.gz");
this.writer = new WarcWriter(this.warcFile);
this.cookies = new BasicCookieStore();

temporaryFile = true;
}

private boolean hasCookies() {
return !cookies.getCookies().isEmpty();
}

public HttpFetchResult fetch(HttpClient client,
ClassicHttpRequest request)
DomainCookies cookies,
HttpGet request)
throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
{
return fetch(client, request, Duration.ofMillis(MAX_TIME));
return fetch(client, cookies, request, Duration.ofMillis(MAX_TIME));
}

public HttpFetchResult fetch(HttpClient client,
ClassicHttpRequest request,
DomainCookies cookies,
HttpGet request,
Duration timeout)
throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
{
@@ -113,13 +101,15 @@ public class WarcRecorder implements AutoCloseable {
// Inject a range header to attempt to limit the size of the response
// to the maximum size we want to store, if the server supports it.
request.addHeader("Range", "bytes=0-"+MAX_SIZE);

cookies.paintRequest(request);
try {
return client.execute(request, response -> {
return client.execute(request,response -> {

try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, timeout);
try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, request, timeout);
InputStream inputStream = inputBuffer.read()) {

cookies.updateCookieStore(response);

// Build and write the request

WarcDigestBuilder requestDigestBuilder = new WarcDigestBuilder();
@@ -143,8 +133,9 @@ public class WarcRecorder implements AutoCloseable {
warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
writer.write(warcRequest);

if (hasCookies()) {
extraHeaders.put("X-Has-Cookies", List.of("1"));

if (cookies.hasCookies()) {
response.addHeader("X-Has-Cookies", 1);
}

byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
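The injected Range header asks the server for no more than the first MAX_SIZE bytes. Servers that honor it answer 206 Partial Content, which is why 206 is accepted alongside 200 elsewhere in this change set, while servers that ignore it simply return 200 with the full body. The request shape, sketched:

    import org.apache.hc.client5.http.classic.methods.HttpGet;

    class RangeSketch {
        public static void main(String[] args) {
            int maxSize = 32 * 1024 * 1024;
            HttpGet request = new HttpGet("https://example.com/large-file");
            // bytes=0-N requests positions 0 through N inclusive; the response
            // is capped near maxSize without the client having to disconnect
            request.addHeader("Range", "bytes=0-" + maxSize);
        }
    }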
@@ -259,7 +250,7 @@ public class WarcRecorder implements AutoCloseable {
writer.write(item);
}

private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags contentTags) {
private void saveOldResponse(EdgeUrl url, DomainCookies domainCookies, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags contentTags) {
try {
WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
@@ -320,7 +311,7 @@ public class WarcRecorder implements AutoCloseable {
.date(Instant.now())
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());

if (hasCookies()) {
if (domainCookies.hasCookies() || (headers != null && headers.contains("Set-Cookie:"))) {
builder.addHeader("X-Has-Cookies", "1");
}

@@ -340,8 +331,8 @@ public class WarcRecorder implements AutoCloseable {
* an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this
* scenario we want to record the data as it was in the previous crawl, but not re-fetch it.
*/
public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags ctags) {
saveOldResponse(url, contentType, statusCode, documentBody, headers, ctags);
public void writeReferenceCopy(EdgeUrl url, DomainCookies cookies, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags ctags) {
saveOldResponse(url, cookies, contentType, statusCode, documentBody, headers, ctags);
}

public void writeWarcinfoHeader(String ip, EdgeDomain domain, HttpFetcherImpl.DomainProbeResult result) throws IOException {

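writeReferenceCopy pairs with conditional revalidation: the crawler replays a stored ETag and Last-Modified as If-None-Match and If-Modified-Since, and a 304 Not Modified means the previously stored body can be written to the WARC again without re-downloading it. A minimal sketch of the request side, with illustrative header values:

    import org.apache.hc.client5.http.classic.methods.HttpGet;

    class ConditionalGetSketch {
        public static void main(String[] args) {
            HttpGet request = new HttpGet("https://example.com/page.html");
            // Both values would come from the previous crawl of this document
            request.addHeader("If-None-Match", "\"686897696a7c876b7e\"");
            request.addHeader("If-Modified-Since", "Wed, 21 Oct 2024 07:28:00 GMT");
            // A 304 response carries no body; the stored copy is reused instead
        }
    }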
@@ -3,6 +3,7 @@ package nu.marginalia.crawl.logic;
import nu.marginalia.model.EdgeDomain;

import java.util.Map;
import java.util.Optional;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Semaphore;

@@ -19,8 +20,22 @@ public class DomainLocks {
* and may be held by another thread. The caller is responsible for locking and releasing the lock.
*/
public DomainLock lockDomain(EdgeDomain domain) throws InterruptedException {
return new DomainLock(domain.toString(),
locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits));
var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);

sem.acquire();

return new DomainLock(sem);
}

public Optional<DomainLock> tryLockDomain(EdgeDomain domain) {
var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
if (sem.tryAcquire(1)) {
return Optional.of(new DomainLock(sem));
}
else {
// We don't have a lock, so we return an empty optional
return Optional.empty();
}
}

private Semaphore defaultPermits(String topDomain) {
@@ -28,23 +43,27 @@ public class DomainLocks {
return new Semaphore(16);
if (topDomain.equals("blogspot.com"))
return new Semaphore(8);

if (topDomain.equals("tumblr.com"))
return new Semaphore(8);
if (topDomain.equals("neocities.org"))
return new Semaphore(4);
return new Semaphore(8);
if (topDomain.equals("github.io"))
return new Semaphore(4);
return new Semaphore(8);

// Substack really dislikes broad-scale crawlers, so we need to be careful
// to not get blocked.
if (topDomain.equals("substack.com")) {
return new Semaphore(1);
}
if (topDomain.endsWith(".edu")) {
return new Semaphore(1);
}

return new Semaphore(2);
}

public boolean canLock(EdgeDomain domain) {
/** Returns true if the domain is lockable, i.e. if it is not already locked by another thread.
* (this is just a hint, and does not guarantee that the domain is actually lockable any time
* after this method returns true)
*/
public boolean isLockableHint(EdgeDomain domain) {
Semaphore sem = locks.get(domain.topDomain.toLowerCase());
if (null == sem)
return true;
@@ -53,22 +72,16 @@ public class DomainLocks {
}

public static class DomainLock implements AutoCloseable {
private final String domainName;
private final Semaphore semaphore;

DomainLock(String domainName, Semaphore semaphore) throws InterruptedException {
this.domainName = domainName;
DomainLock(Semaphore semaphore) {
this.semaphore = semaphore;

Thread.currentThread().setName("crawling:" + domainName + " [await domain lock]");
semaphore.acquire();
Thread.currentThread().setName("crawling:" + domainName);
}

@Override
public void close() throws Exception {
semaphore.release();
Thread.currentThread().setName("crawling:" + domainName + " [wrapping up]");
Thread.currentThread().setName("[idle]");
}
}
}

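With acquisition moved out of the constructor, DomainLock is now a plain AutoCloseable wrapper around a held permit. The permit-per-domain pattern in miniature, as a standalone sketch rather than the project's exact code:

    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.Semaphore;

    class DomainLockSketch {
        private final Map<String, Semaphore> locks = new ConcurrentHashMap<>();

        /** AutoCloseable so try-with-resources always releases the permit */
        record Lock(Semaphore sem) implements AutoCloseable {
            public void close() { sem.release(); }
        }

        Lock lock(String domain) throws InterruptedException {
            var sem = locks.computeIfAbsent(domain, d -> new Semaphore(2));
            sem.acquire(); // acquire before constructing, so a failed acquire leaks nothing
            return new Lock(sem);
        }

        public static void main(String[] args) throws InterruptedException {
            var sketch = new DomainLockSketch();
            try (var lock = sketch.lock("example.com")) {
                System.out.println("holding a permit for example.com");
            } // permit released here, even if the crawl body throws
        }
    }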
@@ -6,6 +6,7 @@ import nu.marginalia.contenttype.ContentType;
import nu.marginalia.crawl.CrawlerMain;
import nu.marginalia.crawl.DomainStateDb;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.DomainCookies;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.logic.LinkFilterSelector;
@@ -51,9 +52,10 @@ public class CrawlerRetreiver implements AutoCloseable {
private final DomainStateDb domainStateDb;
private final WarcRecorder warcRecorder;
private final CrawlerRevisitor crawlerRevisitor;
private final DomainCookies cookies = new DomainCookies();

private static final CrawlerConnectionThrottle connectionThrottle = new CrawlerConnectionThrottle(
Duration.ofSeconds(1) // pace the connections to avoid network congestion by waiting 1 second between establishing them
Duration.ofSeconds(1) // pace the connections to avoid network congestion at startup
);

int errorCount = 0;
@@ -124,7 +126,7 @@ public class CrawlerRetreiver implements AutoCloseable {
}

Instant recrawlStart = Instant.now();
CrawlerRevisitor.RecrawlMetadata recrawlMetadata = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
CrawlerRevisitor.RecrawlMetadata recrawlMetadata = crawlerRevisitor.recrawl(oldCrawlData, cookies, robotsRules, delayTimer);
Duration recrawlTime = Duration.between(recrawlStart, Instant.now());

// Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
@@ -274,7 +276,7 @@ public class CrawlerRetreiver implements AutoCloseable {
try {
var url = rootUrl.withPathAndParam("/", null);

HttpFetchResult result = fetcher.fetchContent(url, warcRecorder, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
HttpFetchResult result = fetcher.fetchContent(url, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
timer.waitFetchDelay(0);

if (result instanceof HttpFetchResult.ResultRedirect(EdgeUrl location)) {
@@ -337,7 +339,7 @@ public class CrawlerRetreiver implements AutoCloseable {

// Grab the favicon if it exists

if (fetcher.fetchContent(faviconUrl, warcRecorder, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED) instanceof HttpFetchResult.ResultOk iconResult) {
if (fetcher.fetchContent(faviconUrl, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED) instanceof HttpFetchResult.ResultOk iconResult) {
String contentType = iconResult.header("Content-Type");
byte[] iconData = iconResult.getBodyBytes();

@@ -407,7 +409,7 @@ public class CrawlerRetreiver implements AutoCloseable {
if (parsedOpt.isEmpty())
return false;

HttpFetchResult result = fetcher.fetchContent(parsedOpt.get(), warcRecorder, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
HttpFetchResult result = fetcher.fetchContent(parsedOpt.get(), warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
timer.waitFetchDelay(0);

if (!(result instanceof HttpFetchResult.ResultOk ok)) {
@@ -435,7 +437,7 @@ public class CrawlerRetreiver implements AutoCloseable {
{
var contentTags = reference.getContentTags();

HttpFetchResult fetchedDoc = fetcher.fetchContent(top, warcRecorder, timer, contentTags, HttpFetcher.ProbeType.FULL);
HttpFetchResult fetchedDoc = fetcher.fetchContent(top, warcRecorder, cookies, timer, contentTags, HttpFetcher.ProbeType.FULL);
timer.waitFetchDelay();

if (Thread.interrupted()) {
@@ -461,7 +463,7 @@ public class CrawlerRetreiver implements AutoCloseable {
{
var doc = reference.doc();

warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBodyBytes, doc.headers, contentTags);
warcRecorder.writeReferenceCopy(top, cookies, doc.contentType, doc.httpStatus, doc.documentBodyBytes, doc.headers, contentTags);

fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
new ContentType(doc.contentType, "UTF-8"),

@@ -2,6 +2,7 @@ package nu.marginalia.crawl.retreival.revisit;

import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.DomainCookies;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
@@ -37,6 +38,7 @@ public class CrawlerRevisitor {

/** Performs a re-crawl of old documents, comparing etags and last-modified */
public RecrawlMetadata recrawl(CrawlDataReference oldCrawlData,
DomainCookies cookies,
SimpleRobotRules robotsRules,
CrawlDelayTimer delayTimer)
throws InterruptedException {
@@ -72,7 +74,7 @@ public class CrawlerRevisitor {

// If the reference document is empty or the HTTP status is not 200, we'll skip it since it's
// unlikely to produce anything meaningful for us.
if (doc.httpStatus != 200)
if (doc.httpStatus != 200 && doc.httpStatus != 206)
continue;
if (!doc.hasBody())
continue;
@@ -132,6 +134,7 @@ public class CrawlerRevisitor {
}
// Add a WARC record so we don't repeat this
warcRecorder.writeReferenceCopy(url,
cookies,
doc.contentType,
doc.httpStatus,
doc.documentBodyBytes,

@@ -58,7 +58,7 @@ public record DocumentWithReference(
if (null == doc)
return ContentTags.empty();

if (doc.documentBodyBytes.length == 0 || doc.httpStatus != 200)
if (doc.documentBodyBytes.length == 0 || (doc.httpStatus != 200 && doc.httpStatus != 206))
return ContentTags.empty();

String lastmod = doc.getLastModified();

@@ -1,22 +1,30 @@
package nu.marginalia;

import org.apache.commons.lang3.StringUtils;

import java.util.Set;

public class ContentTypes {
public static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
"application/xhtml",
"text/html",
"application/pdf",
"image/x-icon",
"text/plain");

public static boolean isAccepted(String contentTypeHeader) {
String lcHeader = contentTypeHeader.toLowerCase();
String lcHeader = StringUtils.substringBefore(contentTypeHeader.toLowerCase(), ';');
for (var type : acceptedContentTypes) {
if (lcHeader.startsWith(type)) {
if (lcHeader.equals(type)) {
return true;
}
}
return false;
}

public static boolean isBinary(String contentTypeHeader) {
String lcHeader = StringUtils.substringBefore(contentTypeHeader.toLowerCase(), ';');
return lcHeader.startsWith("application/pdf");
}

}

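The switch from startsWith to an exact match against the parameter-stripped type means a header like text/html; charset=utf-8 still matches, while a merely prefix-similar type such as text/html-sandboxed no longer does. The normalization step, demonstrated with Commons Lang:

    import org.apache.commons.lang3.StringUtils;

    class ContentTypeSketch {
        public static void main(String[] args) {
            String header = "Text/HTML; charset=UTF-8";
            // Lowercase, then drop the parameters (everything from the first ';')
            String normalized = StringUtils.substringBefore(header.toLowerCase(), ';');
            System.out.println(normalized);                      // text/html
            System.out.println(normalized.equals("text/html"));  // true
        }
    }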
@@ -37,8 +37,12 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
public boolean filter(String url, int status, String contentType) {
String ctLc = contentType.toLowerCase();

// Permit all plain text content types
if (ctLc.startsWith("text/"))
return true;
// PDF
else if (ctLc.startsWith("application/pdf"))
return true;
else if (ctLc.startsWith("x-marginalia/"))
return true;

@@ -10,7 +10,7 @@ import java.util.regex.Pattern;

public class ContentTypeLogic {

private static final Predicate<String> probableHtmlPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md)$").asMatchPredicate();
private static final Predicate<String> probableGoodPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md|pdf)$").asMatchPredicate();
private static final Predicate<String> probableBinaryPattern = Pattern.compile("^.*\\.[a-z]+$").asMatchPredicate();
private static final Set<String> blockedContentTypes = Set.of("text/css", "text/javascript");
private static final List<String> acceptedContentTypePrefixes = List.of(
@@ -22,6 +22,7 @@ public class ContentTypeLogic {
"application/rss+xml",
"application/x-rss+xml",
"application/rdf+xml",
"application/pdf",
"x-rss+xml"
);
private boolean allowAllContentTypes = false;
@@ -34,7 +35,7 @@ public class ContentTypeLogic {
public boolean isUrlLikeBinary(EdgeUrl url) {
String pathLowerCase = url.path.toLowerCase();

if (probableHtmlPattern.test(pathLowerCase))
if (probableGoodPattern.test(pathLowerCase))
return false;

return probableBinaryPattern.test(pathLowerCase);

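probableGoodPattern extends the old whitelist with .pdf, so PDF URLs are no longer misclassified as likely-binary. Pattern.asMatchPredicate() compiles the regex once and exposes it as a Predicate<String>:

    import java.util.function.Predicate;
    import java.util.regex.Pattern;

    class UrlPatternSketch {
        public static void main(String[] args) {
            Predicate<String> probableGood =
                    Pattern.compile("^.*\\.(htm|html|php|txt|md|pdf)$").asMatchPredicate();

            System.out.println(probableGood.test("/docs/paper.pdf"));    // true
            System.out.println(probableGood.test("/downloads/app.exe")); // false
        }
    }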
@@ -216,6 +216,11 @@ public record SlopCrawlDataRecord(String domain,
return false;
}

// If the format is binary, we don't want to translate it if the response is truncated
if (response.truncated() != WarcTruncationReason.NOT_TRUNCATED && ContentTypes.isBinary(contentType)) {
return false;
}

return true;
}

@@ -272,7 +277,8 @@ public record SlopCrawlDataRecord(String domain,
try (var table = new SlopTable(path)) {
ShortColumn.Reader statusReader = statusColumn.open(table);
while (statusReader.hasRemaining()) {
if (statusReader.get() == 200) {
int status = statusReader.get();
if (status == 200 || status == 206) {
cnt++;
}
}

@@ -11,6 +11,8 @@ import org.junit.jupiter.api.*;
import java.io.IOException;
import java.net.URISyntaxException;

import static org.junit.jupiter.api.Assertions.assertEquals;

@Tag("slow")
class HttpFetcherImplContentTypeProbeTest {

@@ -85,55 +87,59 @@ class HttpFetcherImplContentTypeProbeTest {

@AfterEach
public void tearDown() throws IOException {
var stats = fetcher.getPoolStats();
assertEquals(0, stats.getLeased());
assertEquals(0, stats.getPending());

fetcher.close();
}

@Test
public void testProbeContentTypeHtmlShortcircuitPath() throws URISyntaxException {
var result = fetcher.probeContentType(new EdgeUrl("https://localhost/test.html"), new CrawlDelayTimer(50), ContentTags.empty());
Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Ok.class, result);
var result = fetcher.probeContentType(new EdgeUrl("https://localhost/test.html"), new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.NoOp.class, result);
}


@Test
public void testProbeContentTypeHtmlShortcircuitTags() {
var result = fetcher.probeContentType(contentTypeBinaryUrl, new CrawlDelayTimer(50), new ContentTags("a", "b"));
Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Ok.class, result);
var result = fetcher.probeContentType(contentTypeBinaryUrl, new DomainCookies(), new CrawlDelayTimer(50), new ContentTags("a", "b"));
Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.NoOp.class, result);
}

@Test
public void testProbeContentTypeHtml() {
var result = fetcher.probeContentType(contentTypeHtmlUrl, new CrawlDelayTimer(50), ContentTags.empty());
var result = fetcher.probeContentType(contentTypeHtmlUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Ok(contentTypeHtmlUrl), result);
}

@Test
public void testProbeContentTypeBinary() {
var result = fetcher.probeContentType(contentTypeBinaryUrl, new CrawlDelayTimer(50), ContentTags.empty());
var result = fetcher.probeContentType(contentTypeBinaryUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.BadContentType("application/octet-stream", 200), result);
}

@Test
public void testProbeContentTypeRedirect() {
var result = fetcher.probeContentType(redirectUrl, new CrawlDelayTimer(50), ContentTags.empty());
var result = fetcher.probeContentType(redirectUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Redirect(contentTypeHtmlUrl), result);
}

@Test
public void testProbeContentTypeBadHttpStatus() {
var result = fetcher.probeContentType(badHttpStatusUrl, new CrawlDelayTimer(50), ContentTags.empty());
var result = fetcher.probeContentType(badHttpStatusUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.HttpError(500, "Bad status code"), result);
}

@Test
public void testOnlyGetAllowed() {
var result = fetcher.probeContentType(onlyGetAllowedUrl, new CrawlDelayTimer(50), ContentTags.empty());
var result = fetcher.probeContentType(onlyGetAllowedUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Ok(onlyGetAllowedUrl), result);
}

@Test
public void testTimeout() {
var result = fetcher.probeContentType(timeoutUrl, new CrawlDelayTimer(50), ContentTags.empty());
var result = fetcher.probeContentType(timeoutUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Timeout.class, result);
}

@@ -12,6 +12,8 @@ import org.junit.jupiter.api.*;
import java.io.IOException;
import java.net.URISyntaxException;

import static org.junit.jupiter.api.Assertions.assertEquals;

@Tag("slow")
class HttpFetcherImplDomainProbeTest {

@@ -47,6 +49,10 @@ class HttpFetcherImplDomainProbeTest {

@AfterEach
public void tearDown() throws IOException {
var stats = fetcher.getPoolStats();
assertEquals(0, stats.getLeased());
assertEquals(0, stats.getPending());

fetcher.close();
}

@@ -31,6 +31,7 @@ class HttpFetcherImplFetchTest {
private static String lastModified = "Wed, 21 Oct 2024 07:28:00 GMT";

private static EdgeUrl okUrl;
private static EdgeUrl okUrlSetsCookie;
private static EdgeUrl okRangeResponseUrl;
private static EdgeUrl okUrlWith304;

@@ -39,6 +40,8 @@ class HttpFetcherImplFetchTest {
private static EdgeUrl badHttpStatusUrl;
private static EdgeUrl keepAliveUrl;

private static EdgeUrl pdfUrl;

@BeforeAll
public static void setupAll() throws URISyntaxException {
wireMockServer =
@@ -88,6 +91,19 @@ class HttpFetcherImplFetchTest {
.withStatus(200)
.withBody("Hello World")));

okUrlSetsCookie = new EdgeUrl("http://localhost:18089/okSetCookie.bin");
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(okUrlSetsCookie.path))
.willReturn(WireMock.aResponse()
.withHeader("Content-Type", "text/html")
.withHeader("Set-Cookie", "test=1")
.withStatus(200)));
wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(okUrlSetsCookie.path))
.willReturn(WireMock.aResponse()
.withHeader("Content-Type", "text/html")
.withHeader("Set-Cookie", "test=1")
.withStatus(200)
.withBody("Hello World")));

okUrlWith304 = new EdgeUrl("http://localhost:18089/ok304.bin");
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(okUrlWith304.path))
.willReturn(WireMock.aResponse()
@@ -117,6 +133,15 @@ class HttpFetcherImplFetchTest {
.withHeader("Keep-Alive", "max=4, timeout=30")
.withBody("Hello")
));


pdfUrl = new EdgeUrl("http://localhost:18089/test.pdf");
wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(pdfUrl.path))
.willReturn(WireMock.aResponse()
.withHeader("Content-Type", "application/pdf")
.withStatus(200)
.withBody("Hello World")));

wireMockServer.start();

}
@@ -134,20 +159,31 @@ class HttpFetcherImplFetchTest {
public void setUp() throws IOException {
fetcher = new HttpFetcherImpl(new UserAgent("test.marginalia.nu", "test.marginalia.nu"));
warcFile = Files.createTempFile(getClass().getSimpleName(), ".warc");
warcRecorder = new WarcRecorder(warcFile, fetcher);
warcRecorder = new WarcRecorder(warcFile);
}

@AfterEach
public void tearDown() throws IOException {
var stats = fetcher.getPoolStats();
assertEquals(0, stats.getLeased());
assertEquals(0, stats.getPending());

System.out.println(stats);

fetcher.close();
warcRecorder.close();
Files.deleteIfExists(warcFile);
}


@Test
public void testFoo() {
fetcher.fetchSitemapUrls("https://www.marginalia.nu/sitemap.xml", new CrawlDelayTimer(100));
}

@Test
public void testOk_NoProbe() throws IOException {
var result = fetcher.fetchContent(okUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
var result = fetcher.fetchContent(okUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);

Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
Assertions.assertTrue(result.isOk());
@@ -158,12 +194,29 @@ class HttpFetcherImplFetchTest {
Assertions.assertInstanceOf(WarcResponse.class, warcRecords.get(1));

WarcResponse response = (WarcResponse) warcRecords.get(1);
assertEquals("0", response.headers().first("X-Has-Cookies").orElse("0"));
assertEquals("0", response.http().headers().first("X-Has-Cookies").orElse("0"));
}

@Test
public void testOkSetsCookie() throws IOException {
var cookies = new DomainCookies();
var result = fetcher.fetchContent(okUrlSetsCookie, warcRecorder, cookies, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);

Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
Assertions.assertTrue(result.isOk());

List<WarcRecord> warcRecords = getWarcRecords();
assertEquals(2, warcRecords.size());
Assertions.assertInstanceOf(WarcRequest.class, warcRecords.get(0));
Assertions.assertInstanceOf(WarcResponse.class, warcRecords.get(1));

WarcResponse response = (WarcResponse) warcRecords.get(1);
assertEquals("1", response.http().headers().first("X-Has-Cookies").orElse("0"));
}

@Test
public void testOk_FullProbe() {
var result = fetcher.fetchContent(okUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
var result = fetcher.fetchContent(okUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);

Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
Assertions.assertTrue(result.isOk());
@@ -171,7 +224,7 @@ class HttpFetcherImplFetchTest {

@Test
public void testOk304_NoProbe() {
var result = fetcher.fetchContent(okUrlWith304, warcRecorder, new CrawlDelayTimer(1000), new ContentTags(etag, lastModified), HttpFetcher.ProbeType.DISABLED);
var result = fetcher.fetchContent(okUrlWith304, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), new ContentTags(etag, lastModified), HttpFetcher.ProbeType.DISABLED);

Assertions.assertInstanceOf(HttpFetchResult.Result304Raw.class, result);
System.out.println(result);
@@ -180,7 +233,7 @@ class HttpFetcherImplFetchTest {

@Test
public void testOk304_FullProbe() {
var result = fetcher.fetchContent(okUrlWith304, warcRecorder, new CrawlDelayTimer(1000), new ContentTags(etag, lastModified), HttpFetcher.ProbeType.FULL);
var result = fetcher.fetchContent(okUrlWith304, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), new ContentTags(etag, lastModified), HttpFetcher.ProbeType.FULL);

Assertions.assertInstanceOf(HttpFetchResult.Result304Raw.class, result);
System.out.println(result);
@@ -188,7 +241,7 @@ class HttpFetcherImplFetchTest {

@Test
public void testBadStatus_NoProbe() throws IOException {
var result = fetcher.fetchContent(badHttpStatusUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
var result = fetcher.fetchContent(badHttpStatusUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);

Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
Assertions.assertFalse(result.isOk());
@@ -202,7 +255,7 @@ class HttpFetcherImplFetchTest {

@Test
public void testBadStatus_FullProbe() {
var result = fetcher.fetchContent(badHttpStatusUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
var result = fetcher.fetchContent(badHttpStatusUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);

Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
Assertions.assertFalse(result.isOk());
@@ -212,7 +265,7 @@ class HttpFetcherImplFetchTest {

@Test
public void testRedirect_NoProbe() throws URISyntaxException, IOException {
var result = fetcher.fetchContent(redirectUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
var result = fetcher.fetchContent(redirectUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);

Assertions.assertInstanceOf(HttpFetchResult.ResultRedirect.class, result);
assertEquals(new EdgeUrl("http://localhost:18089/test.html.bin"), ((HttpFetchResult.ResultRedirect) result).url());
@@ -225,7 +278,7 @@ class HttpFetcherImplFetchTest {

@Test
public void testRedirect_FullProbe() throws URISyntaxException {
var result = fetcher.fetchContent(redirectUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
var result = fetcher.fetchContent(redirectUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);

Assertions.assertInstanceOf(HttpFetchResult.ResultRedirect.class, result);
assertEquals(new EdgeUrl("http://localhost:18089/test.html.bin"), ((HttpFetchResult.ResultRedirect) result).url());
@@ -238,7 +291,7 @@ class HttpFetcherImplFetchTest {
public void testFetchTimeout_NoProbe() throws IOException, URISyntaxException {
Instant requestStart = Instant.now();

var result = fetcher.fetchContent(timeoutUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
var result = fetcher.fetchContent(timeoutUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);

Assertions.assertInstanceOf(HttpFetchResult.ResultException.class, result);

@@ -262,7 +315,7 @@ class HttpFetcherImplFetchTest {

@Test
public void testRangeResponse() throws IOException {
var result = fetcher.fetchContent(okRangeResponseUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
var result = fetcher.fetchContent(okRangeResponseUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);

Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
Assertions.assertTrue(result.isOk());
@@ -279,7 +332,7 @@ class HttpFetcherImplFetchTest {
@Test
public void testFetchTimeout_Probe() throws IOException, URISyntaxException {
Instant requestStart = Instant.now();
var result = fetcher.fetchContent(timeoutUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
var result = fetcher.fetchContent(timeoutUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
Instant requestEnd = Instant.now();

Assertions.assertInstanceOf(HttpFetchResult.ResultException.class, result);
@@ -302,7 +355,15 @@ class HttpFetcherImplFetchTest {
@Test
public void testKeepaliveUrl() {
// mostly for smoke testing and debugger utility
var result = fetcher.fetchContent(keepAliveUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
var result = fetcher.fetchContent(keepAliveUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);

Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
Assertions.assertTrue(result.isOk());
}

@Test
public void testPdf() {
var result = fetcher.fetchContent(pdfUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);

Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
Assertions.assertTrue(result.isOk());
@@ -319,6 +380,13 @@ class HttpFetcherImplFetchTest {
WarcXEntityRefused.register(reader);

for (var record : reader) {
// Load the body, we need to do this before we close the reader to have access to the content.
if (record instanceof WarcRequest req) {
req.http();
} else if (record instanceof WarcResponse rsp) {
rsp.http();
}

records.add(record);
}
}

@@ -1,12 +1,12 @@
package nu.marginalia.crawl.retreival;

import nu.marginalia.crawl.fetcher.DomainCookies;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.cookie.BasicCookieStore;
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@@ -45,7 +45,7 @@ class CrawlerWarcResynchronizerTest {

@Test
void run() throws IOException, URISyntaxException {
try (var oldRecorder = new WarcRecorder(fileName, new BasicCookieStore())) {
try (var oldRecorder = new WarcRecorder(fileName)) {
fetchUrl(oldRecorder, "https://www.marginalia.nu/");
fetchUrl(oldRecorder, "https://www.marginalia.nu/log/");
fetchUrl(oldRecorder, "https://www.marginalia.nu/feed/");
@@ -55,7 +55,7 @@ class CrawlerWarcResynchronizerTest {

var crawlFrontier = new DomainCrawlFrontier(new EdgeDomain("www.marginalia.nu"), List.of(), 100);

try (var newRecorder = new WarcRecorder(outputFile, new BasicCookieStore())) {
try (var newRecorder = new WarcRecorder(outputFile)) {
new CrawlerWarcResynchronizer(crawlFrontier, newRecorder).run(fileName);
}

@@ -78,10 +78,10 @@ class CrawlerWarcResynchronizerTest {
}

void fetchUrl(WarcRecorder recorder, String url) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
var req = ClassicRequestBuilder.get(new java.net.URI(url))
.addHeader("User-agent", "test.marginalia.nu")
.addHeader("Accept-Encoding", "gzip")
.build();
recorder.fetch(httpClient, req);
HttpGet request = new HttpGet(url);
request.addHeader("User-agent", "test.marginalia.nu");
request.addHeader("Accept-Encoding", "gzip");

recorder.fetch(httpClient, new DomainCookies(), request);
}
}

@@ -2,6 +2,7 @@ package nu.marginalia.crawl.retreival.fetcher;

import com.sun.net.httpserver.HttpServer;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.DomainCookies;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
@@ -88,7 +89,7 @@ class ContentTypeProberTest {

@Test
void probeContentTypeOk() throws Exception {
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlEndpoint, new CrawlDelayTimer(50), ContentTags.empty());
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());

System.out.println(result);

@@ -97,7 +98,7 @@ class ContentTypeProberTest {

@Test
void probeContentTypeRedir() throws Exception {
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlRedirEndpoint, new CrawlDelayTimer(50), ContentTags.empty());
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlRedirEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());

System.out.println(result);

@@ -106,7 +107,7 @@ class ContentTypeProberTest {

@Test
void probeContentTypeBad() throws Exception {
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(binaryEndpoint, new CrawlDelayTimer(50), ContentTags.empty());
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(binaryEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());

System.out.println(result);

@@ -115,7 +116,7 @@ class ContentTypeProberTest {

@Test
void probeContentTypeTimeout() throws Exception {
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(timeoutEndpoint, new CrawlDelayTimer(50), ContentTags.empty());
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(timeoutEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());

System.out.println(result);

@@ -1,11 +1,11 @@
|
||||
package nu.marginalia.crawl.retreival.fetcher;
|
||||
|
||||
import com.sun.net.httpserver.HttpServer;
|
||||
import nu.marginalia.crawl.fetcher.DomainCookies;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.client5.http.cookie.BasicCookieStore;
|
||||
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
||||
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.netpreserve.jwarc.WarcReader;
|
||||
import org.netpreserve.jwarc.WarcRequest;
|
||||
@@ -51,14 +51,14 @@ class WarcRecorderFakeServerTest {
|
||||
os.write("<html><body>hello</body></html>".getBytes());
|
||||
os.flush();
|
||||
try {
|
||||
TimeUnit.SECONDS.sleep(1);
|
||||
TimeUnit.SECONDS.sleep(2);
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
os.write(":".getBytes());
|
||||
os.flush();
|
||||
try {
|
||||
TimeUnit.SECONDS.sleep(1);
|
||||
TimeUnit.SECONDS.sleep(2);
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
@@ -89,24 +89,22 @@ class WarcRecorderFakeServerTest {
|
||||
fileNameWarc = Files.createTempFile("test", ".warc");
|
||||
fileNameParquet = Files.createTempFile("test", ".parquet");
|
||||
|
||||
client = new WarcRecorder(fileNameWarc, new BasicCookieStore());
|
||||
client = new WarcRecorder(fileNameWarc);
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
public void tearDown() throws Exception {
|
||||
|
||||
client.close();
|
||||
Files.delete(fileNameWarc);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void fetchFast() throws Exception {
|
||||
client.fetch(httpClient,
|
||||
ClassicRequestBuilder
|
||||
.get(new java.net.URI("http://localhost:14510/fast"))
|
||||
.addHeader("User-agent", "test.marginalia.nu")
|
||||
.addHeader("Accept-Encoding", "gzip")
|
||||
.build()
|
||||
);
|
||||
HttpGet request = new HttpGet("http://localhost:14510/fast");
|
||||
request.addHeader("User-agent", "test.marginalia.nu");
|
||||
request.addHeader("Accept-Encoding", "gzip");
|
||||
client.fetch(httpClient, new DomainCookies(), request);
|
||||
|
||||
Map<String, String> sampleData = new HashMap<>();
|
||||
try (var warcReader = new WarcReader(fileNameWarc)) {
|
||||
@@ -127,11 +125,13 @@ class WarcRecorderFakeServerTest {
|
||||
public void fetchSlow() throws Exception {
|
||||
Instant start = Instant.now();
|
||||
|
||||
HttpGet request = new HttpGet("http://localhost:14510/slow");
|
||||
request.addHeader("User-agent", "test.marginalia.nu");
|
||||
request.addHeader("Accept-Encoding", "gzip");
|
||||
|
||||
client.fetch(httpClient,
|
||||
ClassicRequestBuilder.get(new java.net.URI("http://localhost:14510/slow"))
|
||||
.addHeader("User-agent", "test.marginalia.nu")
|
||||
.addHeader("Accept-Encoding", "gzip")
|
||||
.build(),
|
||||
new DomainCookies(),
|
||||
request,
|
||||
Duration.ofSeconds(1)
|
||||
);
|
||||
Instant end = Instant.now();
|
||||
@@ -149,6 +149,8 @@ class WarcRecorderFakeServerTest {
|
||||
});
|
||||
}
|
||||
|
||||
System.out.println(
|
||||
Files.readString(fileNameWarc));
|
||||
System.out.println(sampleData);
|
||||
|
||||
// Timeout is set to 1 second, but the server will take 5 seconds to respond,
|
||||
|
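Aside: the common thread in the test diffs above and below is that cookie state moves out of the recorder's constructor (previously a shared Apache BasicCookieStore) into each fetch call as a DomainCookies argument, with requests built as plain HttpGet objects. A minimal sketch of the new call shape, using the names from the diff and a made-up target URL:

    WarcRecorder recorder = new WarcRecorder(fileNameWarc);    // no shared cookie store anymore
    HttpGet request = new HttpGet("https://www.example.com/"); // hypothetical target
    request.addHeader("User-agent", "test.marginalia.nu");
    recorder.fetch(httpClient, new DomainCookies(), request);  // cookies are now passed per call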
@@ -2,14 +2,14 @@ package nu.marginalia.crawl.retreival.fetcher;

 import nu.marginalia.UserAgent;
 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
+import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
-import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
+import nu.marginalia.slop.SlopCrawlDataRecord;
 import org.apache.hc.client5.http.classic.HttpClient;
-import org.apache.hc.client5.http.cookie.BasicCookieStore;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.impl.classic.HttpClients;
-import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -24,13 +24,14 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.security.NoSuchAlgorithmException;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;

 import static org.junit.jupiter.api.Assertions.assertEquals;

 class WarcRecorderTest {
     Path fileNameWarc;
-    Path fileNameParquet;
+    Path fileNameSlop;
     WarcRecorder client;

     HttpClient httpClient;
@@ -39,9 +40,9 @@ class WarcRecorderTest {
         httpClient = HttpClients.createDefault();

         fileNameWarc = Files.createTempFile("test", ".warc");
-        fileNameParquet = Files.createTempFile("test", ".parquet");
+        fileNameSlop = Files.createTempFile("test", ".slop.zip");

-        client = new WarcRecorder(fileNameWarc, new BasicCookieStore());
+        client = new WarcRecorder(fileNameWarc);
     }

     @AfterEach
@@ -52,12 +53,12 @@ class WarcRecorderTest {

     @Test
     void fetch() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
-        client.fetch(httpClient,
-                ClassicRequestBuilder.get(new java.net.URI("https://www.marginalia.nu/"))
-                        .addHeader("User-agent", "test.marginalia.nu")
-                        .addHeader("Accept-Encoding", "gzip")
-                        .build()
-        );
+        HttpGet request = new HttpGet("https://www.marginalia.nu/");
+        request.addHeader("User-agent", "test.marginalia.nu");
+        request.addHeader("Accept-Encoding", "gzip");
+
+        client.fetch(httpClient, new DomainCookies(), request);

         Map<String, String> sampleData = new HashMap<>();
         try (var warcReader = new WarcReader(fileNameWarc)) {
@@ -78,8 +79,9 @@ class WarcRecorderTest {
     @Test
     public void flagAsSkipped() throws IOException, URISyntaxException {

-        try (var recorder = new WarcRecorder(fileNameWarc, new BasicCookieStore())) {
+        try (var recorder = new WarcRecorder(fileNameWarc)) {
             recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
+                    new DomainCookies(),
                     "text/html",
                     200,
                     "<?doctype html><html><body>test</body></html>".getBytes(),
@@ -102,8 +104,9 @@ class WarcRecorderTest {
     @Test
     public void flagAsSkippedNullBody() throws IOException, URISyntaxException {

-        try (var recorder = new WarcRecorder(fileNameWarc, new BasicCookieStore())) {
+        try (var recorder = new WarcRecorder(fileNameWarc)) {
             recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
+                    new DomainCookies(),
                     "text/html",
                     200,
                     null,
@@ -114,8 +117,9 @@ class WarcRecorderTest {

     @Test
     public void testSaveImport() throws URISyntaxException, IOException {
-        try (var recorder = new WarcRecorder(fileNameWarc, new BasicCookieStore())) {
+        try (var recorder = new WarcRecorder(fileNameWarc)) {
             recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
+                    new DomainCookies(),
                     "text/html",
                     200,
                     "<?doctype html><html><body>test</body></html>".getBytes(),
@@ -138,35 +142,46 @@ class WarcRecorderTest {

     @Test
     public void testConvertToParquet() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
-        client.fetch(httpClient, ClassicRequestBuilder
-                .get(new java.net.URI("https://www.marginalia.nu/"))
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .build());
+        HttpGet request1 = new HttpGet("https://www.marginalia.nu/");
+        request1.addHeader("User-agent", "test.marginalia.nu");
+        request1.addHeader("Accept-Encoding", "gzip");

-        client.fetch(httpClient, ClassicRequestBuilder
-                .get(new java.net.URI("https://www.marginalia.nu/log/"))
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .build());
+        client.fetch(httpClient, new DomainCookies(), request1);

-        client.fetch(httpClient, ClassicRequestBuilder
-                .get(new java.net.URI("https://www.marginalia.nu/sanic.png"))
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .build());
+        HttpGet request2 = new HttpGet("https://www.marginalia.nu/log/");
+        request2.addHeader("User-agent", "test.marginalia.nu");
+        request2.addHeader("Accept-Encoding", "gzip");

-        CrawledDocumentParquetRecordFileWriter.convertWarc(
+        client.fetch(httpClient, new DomainCookies(), request2);
+
+        HttpGet request3 = new HttpGet("https://www.marginalia.nu/sanic.png");
+        request3.addHeader("User-agent", "test.marginalia.nu");
+        request3.addHeader("Accept-Encoding", "gzip");
+
+        client.fetch(httpClient, new DomainCookies(), request3);
+
+        HttpGet request4 = new HttpGet("https://downloads.marginalia.nu/test.pdf");
+        request4.addHeader("User-agent", "test.marginalia.nu");
+        request4.addHeader("Accept-Encoding", "gzip");
+
+        client.fetch(httpClient, new DomainCookies(), request4);
+
+        SlopCrawlDataRecord.convertWarc(
                 "www.marginalia.nu",
                 new UserAgent("test", "test"),
                 fileNameWarc,
-                fileNameParquet);
+                fileNameSlop);

-        var urls = CrawledDocumentParquetRecordFileReader.stream(fileNameParquet).map(doc -> doc.url).toList();
-        assertEquals(2, urls.size());
+        List<String> urls;
+        try (var stream = SerializableCrawlDataStream.openDataStream(fileNameSlop)) {
+            urls = stream.docsAsList().stream().map(doc -> doc.url.toString()).toList();
+        }
+
+        assertEquals(3, urls.size());
         assertEquals("https://www.marginalia.nu/", urls.get(0));
         assertEquals("https://www.marginalia.nu/log/", urls.get(1));
+        // sanic.png gets filtered out for its bad mime type
+        assertEquals("https://downloads.marginalia.nu/test.pdf", urls.get(2));
     }
@@ -1,6 +1,7 @@
 package nu.marginalia.crawling;

 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
@@ -31,7 +32,7 @@ class HttpFetcherTest {
     void fetchUTF8() throws Exception {
         var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
         try (var recorder = new WarcRecorder()) {
-            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, new DomainCookies(), new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
             if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
                 System.out.println(bodyOk.contentType());
             }
@@ -49,7 +50,7 @@ class HttpFetcherTest {
         var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");

         try (var recorder = new WarcRecorder()) {
-            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, new DomainCookies(), new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
             if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
                 System.out.println(bodyOk.contentType());
             }
@@ -3,10 +3,7 @@ package nu.marginalia.crawling.retreival;
 import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.crawl.DomainStateDb;
-import nu.marginalia.crawl.fetcher.ContentTags;
-import nu.marginalia.crawl.fetcher.HttpFetcher;
-import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
-import nu.marginalia.crawl.fetcher.SitemapRetriever;
+import nu.marginalia.crawl.fetcher.*;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.CrawlDelayTimer;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
@@ -137,7 +134,7 @@ public class CrawlerMockFetcherTest {
     }

     @Override
-    public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, CrawlDelayTimer timer, ContentTags tags, ProbeType probeType) {
+    public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, DomainCookies cookies, CrawlDelayTimer timer, ContentTags tags, ProbeType probeType) {
         logger.info("Fetching {}", url);
         if (mockData.containsKey(url)) {
             byte[] bodyBytes = mockData.get(url).documentBodyBytes;
@@ -16,7 +16,6 @@ import nu.marginalia.model.crawldata.CrawledDocument;
 import nu.marginalia.model.crawldata.CrawledDomain;
 import nu.marginalia.model.crawldata.SerializableCrawlData;
 import nu.marginalia.slop.SlopCrawlDataRecord;
-import org.apache.hc.client5.http.cookie.BasicCookieStore;
 import org.jetbrains.annotations.NotNull;
 import org.junit.jupiter.api.*;
 import org.netpreserve.jwarc.*;
@@ -118,6 +117,86 @@ class CrawlerRetreiverTest {
         }
     }

+    @Test
+    public void testWarcOutputPDF() throws IOException {
+        var specs = CrawlerMain.CrawlSpecRecord
+                .builder()
+                .crawlDepth(5)
+                .domain("www.marginalia.nu")
+                .urls(List.of("https://www.marginalia.nu/junk/test.pdf"))
+                .build();
+        Path tempFile = null;
+        Path slopFile = null;
+        try {
+            tempFile = Files.createTempFile("crawling-process", "warc");
+            slopFile = Files.createTempFile("crawling-process", ".slop.zip");
+
+            doCrawl(tempFile, specs);
+
+            Set<String> requests = new HashSet<>();
+            Set<String> responses = new HashSet<>();
+
+            // Inspect the WARC file
+            try (var reader = new WarcReader(tempFile)) {
+                reader.forEach(record -> {
+                    if (record instanceof WarcRequest req) {
+                        requests.add(req.target());
+                        System.out.println(req.type() + ":" + req.target());
+                    }
+                    else if (record instanceof WarcResponse rsp) {
+                        responses.add(rsp.target());
+                        System.out.println(rsp.type() + ":" + rsp.target());
+                    }
+                    else {
+                        System.out.println(record.type());
+                    }
+                });
+            }
+
+            assertTrue(requests.contains("https://www.marginalia.nu/junk/test.pdf"));
+            assertEquals(requests, responses);
+
+            // Convert the WARC file to a Slop file
+            SlopCrawlDataRecord
+                    .convertWarc("www.marginalia.nu", new UserAgent("test.marginalia.nu", "test.marginalia.nu"), tempFile, slopFile);
+
+            CrawledDomain domain = null;
+            Map<String, CrawledDocument> documents = new HashMap<>();
+
+            // Extract the contents of the Slop file
+            try (var stream = SerializableCrawlDataStream.openDataStream(slopFile)) {
+                while (stream.hasNext()) {
+                    var doc = stream.next();
+                    if (doc instanceof CrawledDomain dr) {
+                        assertNull(domain);
+                        domain = dr;
+                    }
+                    else if (doc instanceof CrawledDocument dc) {
+                        System.out.println(dc.url + "\t" + dc.crawlerStatus + "\t" + dc.httpStatus);
+                        documents.put(dc.url, dc);
+                    }
+                }
+            } catch (Exception e) {
+                throw new RuntimeException(e);
+            }
+
+            // Verify we have a PDF in the Slop file
+            assertNotNull(domain);
+            var pdfDoc = documents.get("https://www.marginalia.nu/junk/test.pdf");
+            assertNotNull(pdfDoc);
+            assertEquals("https://www.marginalia.nu/junk/test.pdf", pdfDoc.url);
+            assertEquals(206, pdfDoc.httpStatus);
+            assertTrue(pdfDoc.documentBodyBytes.length > 100);
+        }
+        finally {
+            if (tempFile != null)
+                Files.deleteIfExists(tempFile);
+            if (slopFile != null)
+                Files.deleteIfExists(slopFile);
+        }
+    }
+
     @Test
     public void testWarcOutputNoKnownUrls() throws IOException {
         var specs = CrawlerMain.CrawlSpecRecord
@@ -180,7 +259,7 @@ class CrawlerRetreiverTest {
                 new EdgeDomain("www.marginalia.nu"),
                 List.of(), 100);
         var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
-                new WarcRecorder(tempFileWarc2, new BasicCookieStore())
+                new WarcRecorder(tempFileWarc2)
         );

         // truncate the size of the file to simulate a crash
@@ -456,7 +535,7 @@ class CrawlerRetreiverTest {
                 List.of(), 100);

         var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
-                new WarcRecorder(tempFileWarc3, new BasicCookieStore())
+                new WarcRecorder(tempFileWarc3)
         );

         // truncate the size of the file to simulate a crash
@@ -507,7 +586,7 @@ class CrawlerRetreiverTest {
     }

     private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, CrawlDataReference reference) {
-        try (var recorder = new WarcRecorder(tempFileWarc2, new BasicCookieStore());
+        try (var recorder = new WarcRecorder(tempFileWarc2);
             var db = new DomainStateDb(tempFileDb)
        ) {
            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(), reference);
@@ -519,7 +598,7 @@ class CrawlerRetreiverTest {

     @NotNull
     private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlerMain.CrawlSpecRecord specs) {
-        try (var recorder = new WarcRecorder(tempFileWarc1, new BasicCookieStore());
+        try (var recorder = new WarcRecorder(tempFileWarc1);
             var db = new DomainStateDb(tempFileDb)
        ) {
            var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder);

@@ -53,6 +53,8 @@ dependencies {
     implementation libs.commons.compress
     implementation libs.commons.codec
+    implementation libs.jsoup
+    implementation libs.slop
     implementation libs.jwarc
@@ -1,13 +1,18 @@
 package nu.marginalia.extractor;

 import com.google.inject.Inject;
+import nu.marginalia.process.control.ProcessHeartbeat;
 import nu.marginalia.process.log.WorkLog;
 import nu.marginalia.process.log.WorkLogEntry;
+import nu.marginalia.slop.SlopCrawlDataRecord;
+import nu.marginalia.slop.SlopTablePacker;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorage;
 import nu.marginalia.storage.model.FileStorageId;
 import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
 import org.apache.commons.compress.utils.IOUtils;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang3.StringUtils;

 import java.io.IOException;
 import java.nio.file.Files;
@@ -16,18 +21,19 @@ import java.nio.file.StandardCopyOption;
 import java.nio.file.StandardOpenOption;
 import java.nio.file.attribute.PosixFilePermissions;
 import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
+import java.util.*;

 public class SampleDataExporter {
     private final FileStorageService storageService;
+    private final ProcessHeartbeat processHeartbeat;

     @Inject
-    public SampleDataExporter(FileStorageService storageService) {
+    public SampleDataExporter(FileStorageService storageService, ProcessHeartbeat processHeartbeat) {
         this.storageService = storageService;
+        this.processHeartbeat = processHeartbeat;
     }
-    public void export(FileStorageId crawlId, FileStorageId destId, int size, String name) throws SQLException, IOException {
+
+    public void export(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name) throws SQLException, IOException {
         FileStorage destStorage = storageService.getStorage(destId);
         Path inputDir = storageService.getStorage(crawlId).asPath();

@@ -54,11 +60,6 @@ public class SampleDataExporter {

         Path newCrawlerLogFile = Files.createTempFile(destStorage.asPath(), "crawler", ".log",
                 PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
-        try (var bw = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
-            for (var item : entriesAll) {
-                bw.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
-            }
-        }

         Path newManifestJsonFile = Files.createTempFile(destStorage.asPath(), "manifest", ".json",
                 PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
@@ -67,14 +68,38 @@ public class SampleDataExporter {
         var tmpTarFile = Files.createTempFile(destStorage.asPath(), "data", ".tar",
                 PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));

-        try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))) {
-            for (var item : entriesAll) {
+        try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING));
+             var logWriter = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
+             var hb = processHeartbeat.createAdHocTaskHeartbeat("Generating Sample")
+        ) {
+            for (var item : hb.wrap("Scanning", entriesAll)) {
                 Path crawlDataPath = inputDir.resolve(item.relPath());
                 if (!Files.exists(crawlDataPath)) continue;

-                addFileToTar(stream, crawlDataPath, item.relPath());
+                if (StringUtils.isBlank(ctFilter)) {
+                    addFileToTar(stream, crawlDataPath, item.relPath());
+                    logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
+                }
+                else /* filter != null */ {
+                    Path filteredData = null;
+                    try {
+                        filteredData = filterEntries(crawlDataPath, ctFilter);
+                        addFileToTar(stream, filteredData, item.relPath());
+                        logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
+                    }
+                    catch (NoSuchElementException ex) {
+                        // Ignore
+                    }
+                    finally {
+                        if (filteredData != null) {
+                            Files.deleteIfExists(filteredData);
+                        }
+                    }
+                }
             }

+            logWriter.flush();
+
             addFileToTar(stream, newCrawlerLogFile, "crawler.log");
             addFileToTar(stream, newManifestJsonFile, "marginalia-manifest.json");
         }
@@ -86,6 +111,48 @@ public class SampleDataExporter {
         Files.move(tmpTarFile, destStorage.asPath().resolve("crawl-data.tar"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
     }

+    /** Filters the entries in the crawl data file based on the content type. */
+    private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException, NoSuchElementException {
+        Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered");
+        Path tempFile = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered.slop.zip");
+
+        // We may have debris from a previous run, so let's clean it up
+        if (Files.isDirectory(tempDir)) {
+            FileUtils.deleteDirectory(tempDir.toFile());
+        }
+        Files.createDirectory(tempDir);
+
+        try (var writer = new SlopCrawlDataRecord.Writer(tempDir);
+             var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) {
+                 @Override
+                 public boolean filter(String url, int status, String contentType) {
+                     return Objects.equals(StringUtils.substringBefore(contentType, ';'), contentTypeFilter)
+                             || contentType.startsWith("x-marginalia/"); // metadata records
+                 }
+             }
+        ) {
+            boolean wroteEntry = false;
+            while (reader.hasRemaining()) {
+                var entry = reader.get();
+                writer.write(entry);
+
+                wroteEntry = wroteEntry || Objects.equals(StringUtils.substringBefore(entry.contentType(), ';'), contentTypeFilter);
+            }
+
+            if (!wroteEntry) {
+                throw new NoSuchElementException("No relevant entries");
+            }
+
+            SlopTablePacker.packToSlopZip(tempDir, tempFile);
+        }
+        finally {
+            FileUtils.deleteDirectory(tempDir.toFile());
+        }
+
+        return tempFile;
+    }
+
     private void addFileToTar(TarArchiveOutputStream outputStream, Path file, String fileName) throws IOException {
         var entry = outputStream.createArchiveEntry(file.toFile(), fileName);
         entry.setSize(Files.size(file));
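A note on the filtering above: the match is made on the bare media type, with any parameters after the ';' stripped, so a charset suffix does not defeat the filter, and x-marginalia/ metadata records are always retained. A standalone sketch of that comparison, using the same commons-lang3 helper as the diff (header value invented for illustration):

    String contentType = "text/html; charset=UTF-8";  // hypothetical header value
    String ctFilter = "text/html";

    // substringBefore trims everything from the ';' on, leaving just "text/html"
    boolean keep = Objects.equals(StringUtils.substringBefore(contentType, ';'), ctFilter)
            || contentType.startsWith("x-marginalia/"); // metadata records always pass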
@@ -92,7 +92,7 @@ public class ExportTasksMain extends ProcessMainClass {
                 termFrequencyExporter.export(request.crawlId, request.destId);
                 break;
             case SAMPLE_DATA:
-                sampleDataExporter.export(request.crawlId, request.destId, request.size, request.name);
+                sampleDataExporter.export(request.crawlId, request.destId, request.size, request.ctFilter, request.name);
                 break;
             case ADJACENCIES:
                 websiteAdjacenciesCalculator.export();

@@ -16,6 +16,7 @@ public class ExportTaskRequest {
     public FileStorageId destId;
     public int size;
     public String name;
+    public String ctFilter;

     public ExportTaskRequest(Task task) {
         this.task = task;
@@ -42,12 +43,13 @@ public class ExportTaskRequest {
         return request;
     }

-    public static ExportTaskRequest sampleData(FileStorageId crawlId, FileStorageId destId, int size, String name) {
+    public static ExportTaskRequest sampleData(FileStorageId crawlId, FileStorageId destId, String ctFilter, int size, String name) {
         ExportTaskRequest request = new ExportTaskRequest(Task.SAMPLE_DATA);
         request.crawlId = crawlId;
         request.destId = destId;
         request.size = size;
         request.name = name;
+        request.ctFilter = ctFilter;
         return request;
     }
@@ -3,7 +3,7 @@ plugins {

     id 'application'
     id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.4'
+    id 'com.google.cloud.tools.jib' version '3.4.5'
 }

 java {

@@ -3,7 +3,7 @@ plugins {

     id 'application'
     id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.4'
+    id 'com.google.cloud.tools.jib' version '3.4.5'
 }

 application {

@@ -3,7 +3,7 @@ plugins {

     id 'application'
     id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.4'
+    id 'com.google.cloud.tools.jib' version '3.4.5'
 }

 application {

@@ -5,7 +5,7 @@ plugins {
     id 'application'
     id 'jvm-test-suite'

-    id 'com.google.cloud.tools.jib' version '3.4.4'
+    id 'com.google.cloud.tools.jib' version '3.4.5'
 }

 application {

@@ -3,7 +3,7 @@ plugins {
     id 'application'
     id 'jvm-test-suite'
     id 'gg.jte.gradle' version '3.1.15'
-    id 'com.google.cloud.tools.jib' version '3.4.4'
+    id 'com.google.cloud.tools.jib' version '3.4.5'
 }

 application {

@@ -180,7 +180,7 @@ public class UrlDetails implements Comparable<UrlDetails> {
      * semantically meaningful codepoints into entity codes */
     public String displayUrl() {
         StringBuilder sb = new StringBuilder();
-        String urlStr = url.toString();
+        String urlStr = url.toDisplayString();
         for (int i = 0; i < urlStr.length(); i++) {
             char c = urlStr.charAt(i);
@@ -26,4 +26,10 @@

 <link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Marginalia">

 </head>
+<noscript>
+    <h1>Users of text-based browsers</h1>
+    <p>Consider using the old interface at <a href="https://old-search.marginalia.nu/">https://old-search.marginalia.nu/</a>,
+        as it uses fewer modern CSS tricks, and should work better than the new UI. It's functionally nearly identical; it just renders using a different layout.</p>
+    <hr>
+</noscript>

@@ -1,9 +1,16 @@
 This is a bit of a hack!

 This class exists to let tailwind know we're using these classes even though they aren't visible in the code,
-as we sometimes generate classes from Java code!
+as we sometimes generate classes from Java code or javascript!

 <i class="text-blue-800 bg-blue-50 dark:text-blue-200 dark:bg-blue-950"></i>
 <i class="text-green-800 bg-green-50 dark:text-green-200 dark:bg-green-950"></i>
 <i class="text-purple-800 bg-purple-50 dark:text-purple-200 dark:bg-purple-950"></i>
 <i class="text-blue-950 bg-gray-100 dark:text-blue-50 dark:bg-gray-900"></i>
+<span class="hover:bg-gray-300 "></span>
+
+<label class="suggestion group block relative">
+    <input type="radio" name="suggestion" class="peer hidden" checked>
+    <div class="px-4 py-2 cursor-pointer dark:peer-checked:bg-gray-700 dark:hover:bg-gray-700 peer-checked:bg-gray-300 hover:bg-gray-300 w-full">
+    </div>
+</label>
@@ -26,7 +26,7 @@

 <!-- Main content -->
 <main class="flex-1 p-4 max-w-2xl space-y-4">
-    <div class="border dark:border-gray-600 rounded bg-white text-black dark:bg-gray-800 dark:text-white text-m p-4">
+    <div class="border border-gray-300 dark:border-gray-600 rounded bg-white text-black dark:bg-gray-800 dark:text-white text-m p-4">
         <div class="flex space-x-3 place-items-baseline">
             <i class="fa fa-circle-exclamation text-red-800"></i>
             <div class="grow">${model.errorTitle()}</div>

@@ -80,10 +80,6 @@
 <tr><td>rank>50</td><td>The ranking of the website is at least 50 in a span of 1 - 255</td></tr>
 <tr><td>rank<50</td><td>The ranking of the website is at most 50 in a span of 1 - 255</td></tr>

-<tr><td>count>10</td><td> The search term must appear in at least 10 results form the domain</td></tr>
-<tr><td>count<10</td><td> The search term must appear in at most 10 results from the domain</td></tr>
-
 <tr><td>format:html5</td><td>Filter documents using the HTML5 standard. This is typically modern websites.</td></tr>
 <tr><td>format:xhtml</td><td>Filter documents using the XHTML standard</td></tr>
 <tr><td>format:html123</td><td>Filter documents using the HTML standards 1, 2, and 3. This is typically very old websites. </td></tr>

@@ -7,13 +7,13 @@

 <form class="flex-1 max-w-2xl" action="/search">
     <div class="flex">
-        @if (query.isBlank())
+        @if (query != null && query.isBlank())
             <%-- Add autofocus if the query is blank --%>
             <input type="text"
                    class="shadow-inner flex-1 dark:bg-black dark:text-gray-100 bg-gray-50 border dark:border-gray-600 border-gray-300 text-gray-900 text-sm rounded-sm block w-full p-2.5"
                    value="${query}"
                    autofocus
-                   placeholder="Search..."
+                   placeholder="Search the web!"
                    autocomplete="off"
                    name="query"
                    id="searchInput" />
@@ -21,13 +21,13 @@
             <input type="text"
                    class="shadow-inner flex-1 dark:bg-black dark:text-gray-100 bg-gray-50 border dark:border-gray-600 border-gray-300 text-gray-900 text-sm rounded-sm block w-full p-2.5"
                    value="${query}"
-                   placeholder="Search..."
+                   placeholder="Search the web!"
                    autocomplete="off"
                    name="query"
                    id="searchInput" />
         @endif

-        <div id="searchSuggestions" class="text-sm absolute top-2 mt-10 w-96 bg-white dark:bg-black border dark:border-gray-600 border-gray-200 rounded-lg shadow-lg hidden"></div>
+        <div aria-hidden="true" id="searchSuggestions" class="text-sm absolute top-3 mt-10 w-96 bg-white dark:bg-black border dark:border-gray-600 border-gray-300 rounded-lg shadow-lg hidden"></div>

         <button class="px-4 py-2 bg-margeblue text-white ml-2 rounded whitespace-nowrap active:text-slate-200">
             <i class="fas fa-search text-sm sm:mr-3"></i>

@@ -43,13 +43,13 @@ function displaySuggestions(suggestions) {
     }

     suggestionsContainer.innerHTML = suggestions.map((suggestion, index) => `
-        <div
-            class="suggestion px-4 py-2 cursor-pointer hover:bg-gray-100 ${index === selectedIndex ? 'bg-blue-50' : ''}"
-            data-index="${index}"
-        >
-            ${suggestion}
-        </div>
-    `).join('');
+        <label class="suggestion group block relative">
+            <input type="radio" name="suggestion" class="peer hidden" ${index === selectedIndex ? 'checked' : ''}>
+            <div class="px-4 py-2 cursor-pointer dark:peer-checked:bg-gray-700 dark:hover:bg-gray-700 peer-checked:bg-gray-300 hover:bg-gray-300 w-full" data-index="${index}">
+                ${suggestion}
+            </div>
+        </label>
+    `).join('');

     suggestionsContainer.classList.remove('hidden');
@@ -2,7 +2,7 @@ plugins {
     id 'java'
     id 'application'
     id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.4'
+    id 'com.google.cloud.tools.jib' version '3.4.5'
 }

 java {

@@ -20,6 +20,6 @@ public class StatusModule extends AbstractModule {
         bind(String.class)
                 .annotatedWith(Names.named("searchEngineTestQuery"))
                 .toInstance(System.getProperty("status-service.public-query",
-                        "https://search.marginalia.nu/search?query=plato&ref=marginalia-automatic-metrics"));
+                        "https://marginalia-search.com/search?query=plato&ref=marginalia-automatic-metrics"));
     }
 }

@@ -3,7 +3,7 @@ plugins {

     id 'application'
     id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.4'
+    id 'com.google.cloud.tools.jib' version '3.4.5'
 }

 application {

@@ -10,7 +10,8 @@ import static com.google.inject.name.Names.named;

 public class AssistantModule extends AbstractModule {
     public void configure() {
-        bind(Path.class).annotatedWith(named("suggestions-file")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions.txt"));
+        bind(Path.class).annotatedWith(named("suggestions-file1")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions2.txt.gz"));
+        bind(Path.class).annotatedWith(named("suggestions-file2")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions3.txt.gz"));

         bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
     }
@@ -0,0 +1,465 @@
package nu.marginalia.assistant.suggest;

import gnu.trove.list.array.TIntArrayList;
import org.jetbrains.annotations.NotNull;

import java.util.*;

/** Unhinged data structure for fast prefix searching.
 */
public class PrefixSearchStructure {
    // Core data structures
    private final HashMap<String, TIntArrayList> prefixIndex;     // Short prefix index (up to 8 chars)
    private final HashMap<String, TIntArrayList> longPrefixIndex; // Long prefix index (9-16 chars)
    private final ArrayList<String> words;                        // All words by ID
    private final TIntArrayList wordScores;                       // Scores for all words

    // Configuration
    private static final int SHORT_PREFIX_LENGTH = 8;
    private static final int MAX_INDEXED_PREFIX_LENGTH = 16;

    public int size() {
        return words.size();
    }

    // For sorting efficiency
    private static class WordScorePair {
        final String word;
        final int score;

        WordScorePair(String word, int score) {
            this.word = word;
            this.score = score;
        }
    }

    /**
     * Creates a new PrefixTrie for typeahead search.
     */
    public PrefixSearchStructure() {
        prefixIndex = new HashMap<>(1024);
        longPrefixIndex = new HashMap<>(1024);
        words = new ArrayList<>(1024);
        wordScores = new TIntArrayList(1024);
    }

    /**
     * Adds a prefix to the index.
     */
    private void indexPrefix(String word, int wordId) {
        // Index short prefixes
        for (int i = 1; i <= Math.min(word.length(), SHORT_PREFIX_LENGTH); i++) {
            String prefix = word.substring(0, i);
            TIntArrayList wordIds = prefixIndex.computeIfAbsent(
                    prefix, k -> new TIntArrayList(16));
            wordIds.add(wordId);
        }

        // Index longer prefixes
        for (int i = SHORT_PREFIX_LENGTH + 1; i <= Math.min(word.length(), MAX_INDEXED_PREFIX_LENGTH); i++) {
            String prefix = word.substring(0, i);
            TIntArrayList wordIds = longPrefixIndex.computeIfAbsent(
                    prefix, k -> new TIntArrayList(8));
            wordIds.add(wordId);
        }

        // If the word contains spaces, also index by each term for multi-word queries
        if (word.contains(" ")) {
            String[] terms = word.split("\\s+");
            for (String term : terms) {
                if (term.length() >= 2) {
                    for (int i = 1; i <= Math.min(term.length(), SHORT_PREFIX_LENGTH); i++) {
                        String termPrefix = "t:" + term.substring(0, i);
                        TIntArrayList wordIds = prefixIndex.computeIfAbsent(
                                termPrefix, k -> new TIntArrayList(16));
                        wordIds.add(wordId);
                    }
                }
            }
        }
    }

    /**
     * Inserts a word with its associated score.
     */
    public void insert(String word, int score) {
        if (word == null || word.isEmpty()) {
            return;
        }

        // Add to the word list and index
        int wordId = words.size();
        words.add(word);
        wordScores.add(score);
        indexPrefix(word, wordId);
    }

    /**
     * Returns the top k completions for a given prefix.
     */
    public List<ScoredSuggestion> getTopCompletions(String prefix, int k) {
        if (prefix == null || prefix.isEmpty()) {
            // Return top k words by score
            return getTopKWords(k);
        }

        // Check if this is a term search (t:) - for searching within multi-word items
        boolean isTermSearch = false;
        if (prefix.startsWith("t:") && prefix.length() > 2) {
            isTermSearch = true;
            prefix = prefix.substring(2);
        }

        // 1. Fast path for short prefixes
        if (prefix.length() <= SHORT_PREFIX_LENGTH) {
            String lookupPrefix = isTermSearch ? "t:" + prefix : prefix;
            TIntArrayList wordIds = prefixIndex.get(lookupPrefix);
            if (wordIds != null) {
                return getTopKFromWordIds(wordIds, k);
            }
        }

        // 2. Fast path for long prefixes (truncate to MAX_INDEXED_PREFIX_LENGTH)
        if (prefix.length() > SHORT_PREFIX_LENGTH) {
            // Try exact match in longPrefixIndex first
            if (prefix.length() <= MAX_INDEXED_PREFIX_LENGTH) {
                TIntArrayList wordIds = longPrefixIndex.get(prefix);
                if (wordIds != null) {
                    return getTopKFromWordIds(wordIds, k);
                }
            }

            // If prefix is longer than MAX_INDEXED_PREFIX_LENGTH, truncate and filter
            if (prefix.length() > MAX_INDEXED_PREFIX_LENGTH) {
                String truncatedPrefix = prefix.substring(0, MAX_INDEXED_PREFIX_LENGTH);
                TIntArrayList candidateIds = longPrefixIndex.get(truncatedPrefix);
                if (candidateIds != null) {
                    // Filter candidates by the full prefix
                    return getFilteredTopKFromWordIds(candidateIds, prefix, k);
                }
            }
        }

        // 3. Optimized fallback for long prefixes - use prefix tree for segments
        List<ScoredSuggestion> results = new ArrayList<>();

        // Handle multi-segment queries by finding candidates from first 8 chars
        if (prefix.length() > SHORT_PREFIX_LENGTH) {
            String shortPrefix = prefix.substring(0, Math.min(prefix.length(), SHORT_PREFIX_LENGTH));
            TIntArrayList candidates = prefixIndex.get(shortPrefix);

            if (candidates != null) {
                return getFilteredTopKFromWordIds(candidates, prefix, k);
            }
        }

        // 4. Last resort - optimized binary search in sorted segments
        return findByBinarySearchPrefix(prefix, k);
    }

    /**
     * Helper to get the top k words by score.
     */
    private List<ScoredSuggestion> getTopKWords(int k) {
        // Create pairs of (score, wordId)
        int[][] pairs = new int[words.size()][2];
        for (int i = 0; i < words.size(); i++) {
            pairs[i][0] = wordScores.get(i);
            pairs[i][1] = i;
        }

        // Sort by score (descending)
        Arrays.sort(pairs, (a, b) -> Integer.compare(b[0], a[0]));

        // Take top k
        List<ScoredSuggestion> results = new ArrayList<>();
        for (int i = 0; i < Math.min(k, pairs.length); i++) {
            String word = words.get(pairs[i][1]);
            int score = pairs[i][0];
            results.add(new ScoredSuggestion(word, score));
        }

        return results;
    }

    /**
     * Helper to get the top k words from a list of word IDs.
     */
    private List<ScoredSuggestion> getTopKFromWordIds(TIntArrayList wordIds, int k) {
        if (wordIds == null || wordIds.isEmpty()) {
            return Collections.emptyList();
        }

        // For small lists, avoid sorting
        if (wordIds.size() <= k) {
            List<ScoredSuggestion> results = new ArrayList<>(wordIds.size());
            int[] ids = wordIds.toArray();
            for (int wordId : ids) {
                if (wordId >= 0 && wordId < words.size()) {
                    results.add(new ScoredSuggestion(words.get(wordId), wordScores.get(wordId)));
                }
            }
            results.sort((a, b) -> Integer.compare(b.getScore(), a.getScore()));
            return results;
        }

        // For larger lists, use an array-based approach for better performance
        // Find top k without full sorting
        int[] topScores = new int[k];
        int[] topWordIds = new int[k];
        int[] ids = wordIds.toArray();

        // Initialize with first k elements
        int filledCount = Math.min(k, ids.length);
        for (int i = 0; i < filledCount; i++) {
            int wordId = ids[i];
            if (wordId >= 0 && wordId < words.size()) {
                topWordIds[i] = wordId;
                topScores[i] = wordScores.get(wordId);
            }
        }

        // Sort initial elements
        for (int i = 0; i < filledCount; i++) {
            for (int j = i + 1; j < filledCount; j++) {
                if (topScores[j] > topScores[i]) {
                    // Swap scores
                    int tempScore = topScores[i];
                    topScores[i] = topScores[j];
                    topScores[j] = tempScore;

                    // Swap word IDs
                    int tempId = topWordIds[i];
                    topWordIds[i] = topWordIds[j];
                    topWordIds[j] = tempId;
                }
            }
        }

        // Process remaining elements
        int minScore = filledCount > 0 ? topScores[filledCount - 1] : Integer.MIN_VALUE;

        for (int i = k; i < ids.length; i++) {
            int wordId = ids[i];
            if (wordId >= 0 && wordId < words.size()) {
                int score = wordScores.get(wordId);

                if (score > minScore) {
                    // Replace the lowest element
                    topScores[filledCount - 1] = score;
                    topWordIds[filledCount - 1] = wordId;

                    // Bubble up the new element
                    for (int j = filledCount - 1; j > 0; j--) {
                        if (topScores[j] > topScores[j - 1]) {
                            // Swap scores
                            int tempScore = topScores[j];
                            topScores[j] = topScores[j - 1];
                            topScores[j - 1] = tempScore;

                            // Swap word IDs
                            int tempId = topWordIds[j];
                            topWordIds[j] = topWordIds[j - 1];
                            topWordIds[j - 1] = tempId;
                        } else {
                            break;
                        }
                    }

                    // Update min score
                    minScore = topScores[filledCount - 1];
                }
            }
        }

        // Create result list
        List<ScoredSuggestion> results = new ArrayList<>(filledCount);
        for (int i = 0; i < filledCount; i++) {
            results.add(new ScoredSuggestion(words.get(topWordIds[i]), topScores[i]));
        }

        return results;
    }

    /**
     * Use binary search on sorted word segments to efficiently find matches.
     */
    private List<ScoredSuggestion> findByBinarySearchPrefix(String prefix, int k) {
        // If we have a lot of words, use an optimized segment approach
        if (words.size() > 1000) {
            // Divide words into segments for better locality
            int segmentSize = 1000;
            int numSegments = (words.size() + segmentSize - 1) / segmentSize;

            // Find matches using binary search within each segment
            List<WordScorePair> allMatches = new ArrayList<>();
            for (int segment = 0; segment < numSegments; segment++) {
                int start = segment * segmentSize;
                int end = Math.min(start + segmentSize, words.size());

                // Binary search for first potential match
                int pos = Collections.binarySearch(
                        words.subList(start, end),
                        prefix,
                        (a, b) -> a.compareTo(b)
                );

                if (pos < 0) {
                    pos = -pos - 1;
                }

                // Collect all matches
                for (int i = start + pos; i < end && i < words.size(); i++) {
                    String word = words.get(i);
                    if (word.startsWith(prefix)) {
                        allMatches.add(new WordScorePair(word, wordScores.get(i)));
                    } else if (word.compareTo(prefix) > 0) {
                        break; // Past potential matches
                    }
                }
            }

            // Sort by score and take top k
            allMatches.sort((a, b) -> Integer.compare(b.score, a.score));
            List<ScoredSuggestion> results = new ArrayList<>(Math.min(k, allMatches.size()));
            for (int i = 0; i < Math.min(k, allMatches.size()); i++) {
                WordScorePair pair = allMatches.get(i);
                results.add(new ScoredSuggestion(pair.word, pair.score));
            }
            return results;
        }

        // Fallback for small dictionaries - linear scan but optimized
        return simpleSearchFallback(prefix, k);
    }

    /**
     * Optimized linear scan - only used for small dictionaries.
     */
    private List<ScoredSuggestion> simpleSearchFallback(String prefix, int k) {
        // Use primitive arrays for better cache locality
        int[] matchScores = new int[Math.min(words.size(), 100)]; // Assume we won't find more than 100 matches
        String[] matchWords = new String[matchScores.length];
        int matchCount = 0;

        for (int i = 0; i < words.size() && matchCount < matchScores.length; i++) {
            String word = words.get(i);
            if (word.startsWith(prefix)) {
                matchWords[matchCount] = word;
                matchScores[matchCount] = wordScores.get(i);
                matchCount++;
            }
        }

        // Sort matches by score (in-place for small arrays)
        for (int i = 0; i < matchCount; i++) {
            for (int j = i + 1; j < matchCount; j++) {
                if (matchScores[j] > matchScores[i]) {
                    // Swap scores
                    int tempScore = matchScores[i];
                    matchScores[i] = matchScores[j];
                    matchScores[j] = tempScore;

                    // Swap words
                    String tempWord = matchWords[i];
                    matchWords[i] = matchWords[j];
                    matchWords[j] = tempWord;
                }
            }
        }

        // Create results
        List<ScoredSuggestion> results = new ArrayList<>(Math.min(k, matchCount));
        for (int i = 0; i < Math.min(k, matchCount); i++) {
            results.add(new ScoredSuggestion(matchWords[i], matchScores[i]));
        }

        return results;
    }

    /**
     * Get top k words from candidate IDs, filtering by the full prefix.
     */
    private List<ScoredSuggestion> getFilteredTopKFromWordIds(TIntArrayList wordIds, String fullPrefix, int k) {
        if (wordIds == null || wordIds.isEmpty()) {
            return Collections.emptyList();
        }

        // Make primitive arrays for better performance
        String[] matchWords = new String[Math.min(wordIds.size(), 1000)];
        int[] matchScores = new int[matchWords.length];
        int matchCount = 0;

        int[] ids = wordIds.toArray();
        for (int i = 0; i < ids.length && matchCount < matchWords.length; i++) {
            int wordId = ids[i];
            if (wordId >= 0 && wordId < words.size()) {
                String word = words.get(wordId);
                if (word.startsWith(fullPrefix)) {
                    matchWords[matchCount] = word;
                    matchScores[matchCount] = wordScores.get(wordId);
                    matchCount++;
                }
            }
        }

        // Sort by score (efficient insertion sort for small k)
        for (int i = 0; i < Math.min(matchCount, k); i++) {
            int maxPos = i;
            for (int j = i + 1; j < matchCount; j++) {
                if (matchScores[j] > matchScores[maxPos]) {
                    maxPos = j;
                }
            }
            if (maxPos != i) {
                // Swap
                int tempScore = matchScores[i];
                matchScores[i] = matchScores[maxPos];
                matchScores[maxPos] = tempScore;

                String tempWord = matchWords[i];
                matchWords[i] = matchWords[maxPos];
                matchWords[maxPos] = tempWord;
            }
        }

        // Create result list (only up to k elements)
        List<ScoredSuggestion> results = new ArrayList<>(Math.min(k, matchCount));
        for (int i = 0; i < Math.min(k, matchCount); i++) {
            results.add(new ScoredSuggestion(matchWords[i], matchScores[i]));
        }

        return results;
    }

    /**
     * Class representing a suggested completion.
     */
    public static class ScoredSuggestion implements Comparable<ScoredSuggestion> {
        private final String word;
        private final int score;

        public ScoredSuggestion(String word, int score) {
            this.word = word;
            this.score = score;
        }

        public String getWord() {
            return word;
        }

        public int getScore() {
            return score;
        }

        @Override
        public String toString() {
            return word + " (" + score + ")";
        }

        @Override
        public int compareTo(@NotNull PrefixSearchStructure.ScoredSuggestion o) {
            return Integer.compare(this.score, o.score);
        }
    }
}
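For orientation, a brief usage sketch of the structure above (data invented for illustration): prefixes of up to eight characters are answered directly from the short index, longer ones from the truncated long-prefix index plus filtering, and results come back ordered by score.

    PrefixSearchStructure structure = new PrefixSearchStructure();
    structure.insert("marginalia search", 1200);
    structure.insert("margin of error", 800);
    structure.insert("marginal cost", 450);

    // Returns suggestions ordered by descending score; toString() renders "word (score)"
    for (PrefixSearchStructure.ScoredSuggestion s : structure.getTopCompletions("margin", 10)) {
        System.out.println(s); // e.g. "marginalia search (1200)"
    }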
@@ -2,74 +2,89 @@ package nu.marginalia.assistant.suggest;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.functions.math.dict.SpellChecker;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import org.apache.commons.collections4.trie.PatriciaTrie;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.*;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
public class Suggestions {
|
||||
private PatriciaTrie<String> suggestionsTrie = null;
|
||||
private TermFrequencyDict termFrequencyDict = null;
|
||||
private volatile boolean ready = false;
|
||||
private final SpellChecker spellChecker;
|
||||
List<PrefixSearchStructure> searchStructures = new ArrayList<>();
|
||||
|
||||
private volatile boolean ready = false;
|
||||
|
||||
private static final Pattern suggestionPattern = Pattern.compile("^[a-zA-Z0-9]+( [a-zA-Z0-9]+)*$");
|
||||
private static final Logger logger = LoggerFactory.getLogger(Suggestions.class);
|
||||
|
||||
private static final int MIN_SUGGEST_LENGTH = 3;
|
||||
@Inject
|
||||
public Suggestions(@Named("suggestions-file") Path suggestionsFile,
|
||||
SpellChecker spellChecker,
|
||||
TermFrequencyDict dict
|
||||
public Suggestions(@Named("suggestions-file1") Path suggestionsFile1,
|
||||
@Named("suggestions-file2") Path suggestionsFile2
|
||||
) {
|
||||
this.spellChecker = spellChecker;
|
||||
|
||||
Thread.ofPlatform().start(() -> {
|
||||
suggestionsTrie = loadSuggestions(suggestionsFile);
|
||||
termFrequencyDict = dict;
|
||||
searchStructures.add(loadSuggestions(suggestionsFile1));
|
||||
searchStructures.add(loadSuggestions(suggestionsFile2));
|
||||
ready = true;
|
||||
logger.info("Loaded {} suggestions", suggestionsTrie.size());
|
||||
logger.info("Loaded suggestions");
|
||||
});
|
||||
}
|
||||
|
||||
private static PatriciaTrie<String> loadSuggestions(Path file) {
|
||||
private static PrefixSearchStructure loadSuggestions(Path file) {
|
||||
PrefixSearchStructure ret = new PrefixSearchStructure();
|
||||
|
||||
if (!Files.exists(file)) {
|
||||
logger.error("Suggestions file {} absent, loading empty suggestions db", file);
|
||||
return new PatriciaTrie<>();
|
||||
return ret;
|
||||
}
|
||||
try (var lines = Files.lines(file)) {
|
||||
var ret = new PatriciaTrie<String>();
|
||||
|
||||
lines.filter(suggestionPattern.asPredicate())
|
||||
.filter(line -> line.length()<32)
|
||||
.map(String::toLowerCase)
|
||||
.forEach(w -> ret.put(w, w));
|
||||
try (var scanner = new Scanner(new GZIPInputStream(new BufferedInputStream(Files.newInputStream(file, StandardOpenOption.READ))))) {
|
||||
while (scanner.hasNextLine()) {
|
||||
String line = scanner.nextLine().trim();
|
||||
String[] parts = StringUtils.split(line, " ,", 2);
|
||||
if (parts.length != 2) {
|
||||
logger.warn("Invalid suggestion line: {}", line);
|
||||
continue;
|
||||
}
|
||||
int cnt = Integer.parseInt(parts[0]);
|
||||
if (cnt > 1) {
|
||||
String word = parts[1];
|
||||
|
||||
// Add special keywords to the suggestions
|
||||
for (var feature : HtmlFeature.values()) {
|
||||
String keyword = feature.getKeyword();
|
||||
// Remove quotes and trailing periods if this is a CSV
|
||||
if (word.startsWith("\"") && word.endsWith("\"")) {
|
||||
word = word.substring(1, word.length() - 1);
|
||||
}
|
||||
|
||||
ret.put(keyword, keyword);
|
||||
ret.put("-" + keyword, "-" + keyword);
|
||||
// Remove trailing periods
|
||||
while (word.endsWith(".")) {
|
||||
word = word.substring(0, word.length() - 1);
|
||||
}
|
||||
|
||||
// Remove junk items we may have gotten from link extraction
|
||||
if (word.startsWith("click here"))
|
||||
continue;
|
||||
if (word.contains("new window"))
|
||||
continue;
|
||||
if (word.contains("click to"))
|
||||
continue;
|
||||
if (word.startsWith("share "))
|
||||
continue;
|
||||
|
||||
if (word.length() > 3) {
|
||||
ret.insert(word, cnt);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
catch (IOException ex) {
|
||||
logger.error("Failed to load suggestions file", ex);
|
||||
return new PatriciaTrie<>();
|
||||
return new PrefixSearchStructure();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -83,96 +98,36 @@ public class Suggestions {

         searchWord = StringUtils.stripStart(searchWord.toLowerCase(), " ");

-        return Stream.of(
-                new SuggestionStream("", getSuggestionsForKeyword(count, searchWord)),
-                suggestionsForLastWord(count, searchWord),
-                spellCheckStream(searchWord)
-        )
-                .flatMap(SuggestionsStreamable::stream)
-                .limit(count)
-                .collect(Collectors.toList());
-    }
-
-    private SuggestionsStreamable suggestionsForLastWord(int count, String searchWord) {
-        int sp = searchWord.lastIndexOf(' ');
-
-        if (sp < 0) {
-            return Stream::empty;
-        }
-
-        String prefixString = searchWord.substring(0, sp+1);
-        String suggestString = searchWord.substring(sp+1);
-
-        return new SuggestionStream(prefixString, getSuggestionsForKeyword(count, suggestString));
-    }
-
-    private SuggestionsStreamable spellCheckStream(String word) {
-        int start = word.lastIndexOf(' ');
-        String prefix;
-        String corrWord;
-
-        if (start < 0) {
-            corrWord = word;
-            prefix = "";
-        }
-        else {
-            prefix = word.substring(0, start + 1);
-            corrWord = word.substring(start + 1);
-        }
-
-        if (corrWord.length() >= MIN_SUGGEST_LENGTH) {
-            Supplier<Stream<String>> suggestionsLazyEval = () -> spellChecker.correct(corrWord).stream();
-            return new SuggestionStream(prefix, Stream.of(suggestionsLazyEval).flatMap(Supplier::get));
-        }
-        else {
-            return Stream::empty;
-        }
+        return getSuggestionsForKeyword(count, searchWord);
     }

-    public Stream<String> getSuggestionsForKeyword(int count, String prefix) {
+    public List<String> getSuggestionsForKeyword(int count, String prefix) {
         if (!ready)
-            return Stream.empty();
+            return List.of();

         if (prefix.length() < MIN_SUGGEST_LENGTH) {
-            return Stream.empty();
+            return List.of();
         }

-        var start = suggestionsTrie.select(prefix);
-
-        if (start == null) {
-            return Stream.empty();
-        }
-
-        if (!start.getKey().startsWith(prefix)) {
-            return Stream.empty();
-        }
-
-        SuggestionsValueCalculator sv = new SuggestionsValueCalculator();
-
-        return Stream.iterate(start.getKey(), Objects::nonNull, suggestionsTrie::nextKey)
-                .takeWhile(s -> s.startsWith(prefix))
-                .limit(256)
-                .sorted(Comparator.comparing(sv::get).thenComparing(String::length).thenComparing(Comparator.naturalOrder()))
-                .limit(count);
-    }
-
-    private record SuggestionStream(String prefix, Stream<String> suggestionStream) implements SuggestionsStreamable {
-        public Stream<String> stream() {
-            return suggestionStream.map(s -> prefix + s);
-        }
-    }
-
-    interface SuggestionsStreamable { Stream<String> stream(); }
-
-    private class SuggestionsValueCalculator {
-
-        private final Map<String, Long> hashCache = new HashMap<>(512);
-
-        public int get(String s) {
-            long hash = hashCache.computeIfAbsent(s, TermFrequencyDict::getStringHash);
-            return -termFrequencyDict.getTermFreqHash(hash);
-        }
-    }
+        List<PrefixSearchStructure.ScoredSuggestion> resultsAll = new ArrayList<>();
+
+        for (var searchStructure : searchStructures) {
+            resultsAll.addAll(searchStructure.getTopCompletions(prefix, count));
+        }
+        resultsAll.sort(Comparator.reverseOrder());
+
+        List<String> ret = new ArrayList<>(count);
+
+        Set<String> seen = new HashSet<>();
+        for (var result : resultsAll) {
+            if (seen.add(result.getWord())) {
+                ret.add(result.getWord());
+            }
+            if (ret.size() >= count) {
+                break;
+            }
+        }
+
+        return ret;
+    }
 }
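For orientation: the rewritten lookup gathers top completions from each loaded PrefixSearchStructure (there are now several suggestion files), sorts them best-first by score, and deduplicates up to the requested count. Below is a minimal self-contained sketch of that merge-and-dedup pattern; Scored and topDistinct are illustrative stand-ins, not the project's actual types.

import java.util.*;

// Standalone sketch: collect scored completions from several sources,
// sort highest score first, keep the first `count` distinct words.
// `Scored` stands in for PrefixSearchStructure.ScoredSuggestion.
public class TopCompletionsSketch {
    record Scored(String word, int score) implements Comparable<Scored> {
        public int compareTo(Scored other) { return Integer.compare(score, other.score); }
    }

    static List<String> topDistinct(List<List<Scored>> perStructure, int count) {
        List<Scored> all = new ArrayList<>();
        perStructure.forEach(all::addAll);
        all.sort(Comparator.reverseOrder()); // highest score first

        List<String> ret = new ArrayList<>(count);
        Set<String> seen = new HashSet<>();
        for (Scored s : all) {
            if (seen.add(s.word())) ret.add(s.word());
            if (ret.size() >= count) break;
        }
        return ret;
    }

    public static void main(String[] args) {
        var a = List.of(new Scored("marginalia", 9), new Scored("margin", 7));
        var b = List.of(new Scored("margin", 8), new Scored("marginal", 5));
        System.out.println(topDistinct(List.of(a, b), 3)); // [marginalia, margin, marginal]
    }
}

Returning an eagerly materialized List rather than a lazy Stream also fits the new shape: the results must be fully sorted and deduplicated before the top N can be known.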
@@ -2,7 +2,7 @@ plugins {
     id 'java'
     id 'application'
     id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.4'
+    id 'com.google.cloud.tools.jib' version '3.4.5'
 }

 java {
@@ -59,9 +59,14 @@ public class ControlMain extends MainClass {
             download(adblockFile, new URI("https://downloads.marginalia.nu/data/adblock.txt"));
         }

-        Path suggestionsFile = dataPath.resolve("suggestions.txt");
+        Path suggestionsFile = dataPath.resolve("suggestions2.txt.gz");
         if (!Files.exists(suggestionsFile)) {
-            downloadGzipped(suggestionsFile, new URI("https://downloads.marginalia.nu/data/suggestions.txt.gz"));
+            download(suggestionsFile, new URI("https://downloads.marginalia.nu/data/suggestions2.txt.gz"));
         }

+        Path altSuggestionsFile = dataPath.resolve("suggestions3.txt.gz");
+        if (!Files.exists(altSuggestionsFile)) {
+            download(altSuggestionsFile, new URI("https://downloads.marginalia.nu/data/suggestions3.txt.gz"));
+        }
+
         Path asnRawData = dataPath.resolve("asn-data-raw-table");
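Worth noting in the hunk above: the suggestion files are now fetched with download() rather than downloadGzipped(), i.e. they are stored compressed on disk and presumably decompressed at load time instead of at download time. A hedged sketch of what the two helper shapes likely look like; the real implementations live in ControlMain and may differ.

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.zip.GZIPInputStream;

// Assumed helper shapes, for illustration only.
class DownloadHelpersSketch {
    // download(): store the response byte-for-byte, e.g. keeping the .gz suffix.
    static void download(Path dest, URI source) throws IOException {
        try (InputStream is = source.toURL().openStream()) {
            Files.copy(is, dest, StandardCopyOption.REPLACE_EXISTING);
        }
    }

    // downloadGzipped(): decompress in transit, storing the plain payload.
    static void downloadGzipped(Path dest, URI source) throws IOException {
        try (InputStream is = new GZIPInputStream(source.toURL().openStream())) {
            Files.copy(is, dest, StandardCopyOption.REPLACE_EXISTING);
        }
    }
}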
@@ -321,9 +321,10 @@ public class ControlNodeActionsService {
     private Object exportSampleData(Request req, Response rsp) {
         FileStorageId source = parseSourceFileStorageId(req.queryParams("source"));
         int size = Integer.parseInt(req.queryParams("size"));
+        String ctFilter = req.queryParams("ctFilter");
         String name = req.queryParams("name");

-        exportClient.exportSampleData(Integer.parseInt(req.params("id")), source, size, name);
+        exportClient.exportSampleData(Integer.parseInt(req.params("id")), source, size, ctFilter, name);

         return "";
     }
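The new ctFilter query parameter is forwarded to the executor's sample-export job. Going by the form help text added further down ("includes only documents with the specified content type value"), the filter plausibly acts as an exact, case-insensitive match on a document's content type; the sketch below illustrates that assumed semantics and is not the executor's actual code.

import java.util.function.Predicate;

// Illustrative filter semantics; the real matching happens executor-side.
class ContentTypeFilterSketch {
    static Predicate<String> contentTypeFilter(String ctFilter) {
        if (ctFilter == null || ctFilter.isBlank()) {
            return contentType -> true; // no filter set: keep every document
        }
        return ctFilter::equalsIgnoreCase;
    }

    public static void main(String[] args) {
        var filter = contentTypeFilter("text/html");
        System.out.println(filter.test("text/html")); // true
        System.out.println(filter.test("image/png")); // false
    }
}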
@@ -24,25 +24,25 @@ This is a sample of real crawl data. It is intended for demo, testing and devel
 <tr>
     <td><input id="sample-s" value="sample-s" name="sample" class="form-check-input" type="radio"></td>
     <td><label for="sample-s">Small</label></td>
-    <td>1000 Domains. About 2 GB. </td>
+    <td>1000 Domains. About 1 GB. </td>
 </tr>

 <tr>
     <td><input id="sample-m" value="sample-m" name="sample" class="form-check-input" type="radio"></td>
     <td><label for="sample-m">Medium</label></td>
-    <td>2000 Domains. About 6 GB. Recommended.</td>
+    <td>2000 Domains. About 2 GB. Recommended.</td>
 </tr>

 <tr>
     <td><input id="sample-l" value="sample-l" name="sample" class="form-check-input" type="radio"></td>
     <td><label for="sample-l">Large</label></td>
-    <td>5000 Domains. About 20 GB.</td>
+    <td>5000 Domains. About 7 GB.</td>
 </tr>

 <tr>
     <td><input id="sample-xl" value="sample-xl" name="sample" class="form-check-input" type="radio"></td>
     <td><label for="sample-xl">Huge</label></td>
-    <td>50,000 Domains. Around 180 GB. Primarily intended for pre-production like testing environments.
+    <td>50,000 Domains. Around 80 GB. Primarily intended for pre-production like testing environments.
     Expect hours of processing time. </td>
 </tr>
 </table>
@@ -35,6 +35,11 @@
         <div><input type="text" name="size" id="size" pattern="\d+" /></div>
         <small class="text-muted">How many domains to include in the sample set</small>
     </div>
+    <div class="mb-3">
+        <label for="ctFilter">Content Type Filter</label>
+        <div><input type="text" name="ctFilter" id="ctFilter" /></div>
+        <small class="text-muted">If set, includes only documents with the specified content type value</small>
+    </div>
     <div class="mb-3">
         <label for="name">Name</label>
         <div><input type="text" name="name" id="name" /></div>
@@ -3,7 +3,7 @@ plugins {

     id 'application'
     id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.4'
+    id 'com.google.cloud.tools.jib' version '3.4.5'
 }

 application {
@@ -3,7 +3,7 @@ plugins {

     id 'application'
     id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.4'
+    id 'com.google.cloud.tools.jib' version '3.4.5'
 }

 application {
@@ -3,7 +3,7 @@ plugins {

     id 'application'
     id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.4'
+    id 'com.google.cloud.tools.jib' version '3.4.5'
 }

 application {
@@ -10,6 +10,7 @@ import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.converting.writer.ConverterBatchWriter;
 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.functions.searchquery.QueryFactory;
@@ -43,7 +44,6 @@ import nu.marginalia.process.control.FakeProcessHeartbeat;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.test.IntegrationTestModule;
 import nu.marginalia.test.TestUtil;
-import org.apache.hc.client5.http.cookie.BasicCookieStore;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -121,11 +121,12 @@ public class IntegrationTest {
     public void run() throws Exception {

         /** CREATE WARC */
-        try (WarcRecorder warcRecorder = new WarcRecorder(warcData, new BasicCookieStore())) {
+        try (WarcRecorder warcRecorder = new WarcRecorder(warcData)) {
             warcRecorder.writeWarcinfoHeader("127.0.0.1", new EdgeDomain("www.example.com"),
                     new HttpFetcherImpl.DomainProbeResult.Ok(new EdgeUrl("https://www.example.com/")));

             warcRecorder.writeReferenceCopy(new EdgeUrl("https://www.example.com/"),
+                    new DomainCookies(),
                     "text/html", 200,
                     """
                     <html>
@@ -3,7 +3,7 @@ plugins {

     id 'application'
     id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.4'
+    id 'com.google.cloud.tools.jib' version '3.4.5'
 }

 java {
@@ -1,4 +1,9 @@
-## This is a token file for automatic deployment
+## This is a token file for triggering automatic deployment when no commit is made.
 2025-01-08: Deploy executor.
 2025-01-07: Deploy executor.
 2025-01-07: Deploy executor.
+2025-04-24: Deploy executor.
+2025-04-24: Deploy assistant.
+2025-05-04: Deploy qs, search and api-services.
+2025-05-05: Deploy executor partition 4.
+2025-05-05: Deploy control.
gradle/wrapper/gradle-wrapper.properties (vendored)
@@ -1,5 +1,5 @@
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-8.10-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-8.14-bin.zip
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
@@ -314,6 +314,13 @@ if __name__ == '__main__':
             deploy_tier=0,
             groups={"all", "core"}
         ),
+        'status': ServiceConfig(
+            gradle_target=':code:services-application:status-service:docker',
+            docker_name='status-service',
+            instances=None,
+            deploy_tier=4,
+            groups={"all"}
+        ),
         'query': ServiceConfig(
             gradle_target=':code:services-core:query-service:docker',
             docker_name='query-service',