mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
31 Commits
deploy-014
...
deploy-017
Author | SHA1 | Date | |
---|---|---|---|
|
83967e3305 | ||
|
4db980a291 | ||
|
089b177868 | ||
|
9c8e9a68d5 | ||
|
413d5cc788 | ||
|
58539b92ac | ||
|
fe72f16df1 | ||
|
b49a244a2e | ||
|
3f0b4c010f | ||
|
c6e0cd93f7 | ||
|
80a7ccb080 | ||
|
54dec347c4 | ||
|
d6ee3f0785 | ||
|
8be88afcf3 | ||
|
0e3c00d3e1 | ||
|
4279a7f1aa | ||
|
251006d4f9 | ||
|
c3e99dc12a | ||
|
aaaa2de022 | ||
|
fc1388422a | ||
|
b07080db16 | ||
|
e9d86dca4a | ||
|
1d693f0efa | ||
|
5874a163dc | ||
|
5ec7a1deab | ||
|
7fea2808ed | ||
|
8da74484f0 | ||
|
923d5a7234 | ||
|
58f88749b8 | ||
|
77f727a5ba | ||
|
667cfb53dc |
@@ -5,7 +5,7 @@ plugins {
|
||||
|
||||
// This is a workaround for a bug in the Jib plugin that causes it to stall randomly
|
||||
// https://github.com/GoogleContainerTools/jib/issues/3347
|
||||
id 'com.google.cloud.tools.jib' version '3.4.4' apply(false)
|
||||
id 'com.google.cloud.tools.jib' version '3.4.5' apply(false)
|
||||
}
|
||||
|
||||
group 'marginalia'
|
||||
@@ -47,7 +47,7 @@ ext {
|
||||
dockerImageBase='container-registry.oracle.com/graalvm/jdk:24'
|
||||
dockerImageTag='latest'
|
||||
dockerImageRegistry='marginalia'
|
||||
jibVersion = '3.4.4'
|
||||
jibVersion = '3.4.5'
|
||||
}
|
||||
|
||||
idea {
|
||||
|
@@ -1,16 +1,14 @@
|
||||
package nu.marginalia.model;
|
||||
|
||||
import nu.marginalia.util.QueryParams;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.io.Serializable;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URI;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.URL;
|
||||
import java.net.*;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class EdgeUrl implements Serializable {
|
||||
public final String proto;
|
||||
@@ -33,7 +31,7 @@ public class EdgeUrl implements Serializable {
|
||||
|
||||
private static URI parseURI(String url) throws URISyntaxException {
|
||||
try {
|
||||
return new URI(urlencodeFixer(url));
|
||||
return EdgeUriFactory.parseURILenient(url);
|
||||
} catch (URISyntaxException ex) {
|
||||
throw new URISyntaxException("Failed to parse URI '" + url + "'", ex.getMessage());
|
||||
}
|
||||
@@ -51,58 +49,6 @@ public class EdgeUrl implements Serializable {
|
||||
}
|
||||
}
|
||||
|
||||
private static Pattern badCharPattern = Pattern.compile("[ \t\n\"<>\\[\\]()',|]");
|
||||
|
||||
/* Java's URI parser is a bit too strict in throwing exceptions when there's an error.
|
||||
|
||||
Here on the Internet, standards are like the picture on the box of the frozen pizza,
|
||||
and what you get is more like what's on the inside, we try to patch things instead,
|
||||
just give it a best-effort attempt at cleaning out broken or unnecessary constructions
|
||||
like bad or missing URLEncoding
|
||||
*/
|
||||
public static String urlencodeFixer(String url) throws URISyntaxException {
|
||||
var s = new StringBuilder();
|
||||
String goodChars = "&.?:/-;+$#";
|
||||
String hexChars = "0123456789abcdefABCDEF";
|
||||
|
||||
int pathIdx = findPathIdx(url);
|
||||
if (pathIdx < 0) { // url looks like http://marginalia.nu
|
||||
return url + "/";
|
||||
}
|
||||
s.append(url, 0, pathIdx);
|
||||
|
||||
// We don't want the fragment, and multiple fragments breaks the Java URIParser for some reason
|
||||
int end = url.indexOf("#");
|
||||
if (end < 0) end = url.length();
|
||||
|
||||
for (int i = pathIdx; i < end; i++) {
|
||||
int c = url.charAt(i);
|
||||
|
||||
if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
|
||||
s.appendCodePoint(c);
|
||||
} else if (c == '%' && i + 2 < end) {
|
||||
int cn = url.charAt(i + 1);
|
||||
int cnn = url.charAt(i + 2);
|
||||
if (hexChars.indexOf(cn) >= 0 && hexChars.indexOf(cnn) >= 0) {
|
||||
s.appendCodePoint(c);
|
||||
} else {
|
||||
s.append("%25");
|
||||
}
|
||||
} else {
|
||||
s.append(String.format("%%%02X", c));
|
||||
}
|
||||
}
|
||||
|
||||
return s.toString();
|
||||
}
|
||||
|
||||
private static int findPathIdx(String url) throws URISyntaxException {
|
||||
int colonIdx = url.indexOf(':');
|
||||
if (colonIdx < 0 || colonIdx + 2 >= url.length()) {
|
||||
throw new URISyntaxException(url, "Lacking protocol");
|
||||
}
|
||||
return url.indexOf('/', colonIdx + 2);
|
||||
}
|
||||
|
||||
public EdgeUrl(URI URI) {
|
||||
try {
|
||||
@@ -166,11 +112,32 @@ public class EdgeUrl implements Serializable {
|
||||
sb.append(port);
|
||||
}
|
||||
|
||||
EdgeUriFactory.urlencodePath(sb, path);
|
||||
|
||||
if (param != null) {
|
||||
EdgeUriFactory.urlencodeQuery(sb, param);
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
||||
public String toDisplayString() {
|
||||
StringBuilder sb = new StringBuilder(256);
|
||||
|
||||
sb.append(proto);
|
||||
sb.append("://");
|
||||
sb.append(domain);
|
||||
|
||||
if (port != null) {
|
||||
sb.append(':');
|
||||
sb.append(port);
|
||||
}
|
||||
|
||||
sb.append(path);
|
||||
|
||||
if (param != null) {
|
||||
sb.append('?');
|
||||
sb.append(param);
|
||||
sb.append('?').append(param);
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
@@ -247,3 +214,244 @@ public class EdgeUrl implements Serializable {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/** Lenient URI parsing and URL-encoding helpers.
 * <p>
 * Java's URI parser rejects many URLs that occur in practice, so the entry point
 * {@link #parseURILenient(String)} first tries a cheap validity check and otherwise
 * rebuilds the URL with its path and query URL-encoded and its fragment removed.
 */
class EdgeUriFactory {

    /** Parse a URL into a URI, repairing missing URL-encoding where necessary.
     * <p>
     * The fragment is always dropped, and a URL without a path is given the path "/".
     *
     * @param url the URL string to parse
     * @return the parsed URI
     * @throws URISyntaxException if the URL lacks a scheme separator, or if the repaired
     *                            URL is still rejected by {@link URI}
     */
    public static URI parseURILenient(String url) throws URISyntaxException {

        // Fast path: the URL is already in the exact form the repair below would produce
        if (shouldOmitUrlencodeRepair(url)) {
            try {
                return new URI(url);
            }
            catch (URISyntaxException ex) {
                // the heuristic was too optimistic; ignore and run the lenient parser
            }
        }

        // Throws if the URL has no usable scheme separator
        int pathIdx = findPathIdx(url);

        // We don't want the fragment, and multiple fragments break the Java URI parser
        int end = url.indexOf('#');
        if (end < 0) end = url.length();

        // A '?' that appears inside the fragment does not start a query
        int queryIdx = url.indexOf('?');
        if (queryIdx < 0 || queryIdx > end) queryIdx = end;

        var s = new StringBuilder(url.length() + 8);

        if (pathIdx < 0 || pathIdx > queryIdx) {
            // There is no path between the authority and the query/fragment/end of the URL,
            // e.g. http://marginalia.nu or http://marginalia.nu?q=1, so we supply "/"
            s.append(url, 0, queryIdx).append('/');
        }
        else {
            s.append(url, 0, pathIdx);
            urlencodePath(s, url.substring(pathIdx, queryIdx));
        }

        if (queryIdx < end) {
            urlencodeQuery(s, url.substring(queryIdx + 1, end));
        }

        return new URI(s.toString());
    }

    /** Break apart the path element of a URI into its '/'-separated components,
     * and append them to the string builder, each preceded by a slash.
     * <p>
     * Empty components (repeated slashes) are dropped. If at least one component
     * needs URL encoding, every component is passed through {@link URLEncoder}
     * (with spaces written as %20); otherwise the components are appended as-is.
     * Note that this means a path mixing already-escaped and unescaped components
     * will have its existing escapes encoded again.
     * <p>
     * A trailing slash is preserved. Nothing is appended for a null or empty path.
     *
     * @param sb   the builder to append to
     * @param path the path to encode, may be null
     */
    public static void urlencodePath(StringBuilder sb, String path) {
        if (path == null || path.isEmpty()) {
            return;
        }

        // Leading and repeated slashes produce empty parts, which are skipped below;
        // a path consisting only of slashes produces no parts at all
        String[] pathParts = path.split("/");
        if (pathParts.length == 0) {
            sb.append('/');
            return;
        }

        boolean shouldUrlEncode = false;
        for (String pathPart : pathParts) {
            if (needsUrlEncode(pathPart)) {
                shouldUrlEncode = true;
                break;
            }
        }

        for (String pathPart : pathParts) {
            if (pathPart.isEmpty()) continue;

            sb.append('/');
            if (shouldUrlEncode) {
                // URLEncoder produces form encoding, where space is '+'; paths want %20
                sb.append(URLEncoder.encode(pathPart, StandardCharsets.UTF_8).replace("+", "%20"));
            } else {
                sb.append(pathPart);
            }
        }

        if (path.endsWith("/")) {
            sb.append('/');
        }
    }

    /** Break apart the query element of a URI into its '&amp;'-separated components,
     * and append them to the string builder, preceded by '?' and joined by '&amp;'.
     * <p>
     * Empty components are dropped. If at least one component needs URL encoding
     * (which includes any component containing '='), every component is encoded:
     * the text before the first '=' and the text after it are passed through
     * {@link URLEncoder} separately and rejoined with '='. Otherwise the components
     * are appended as-is.
     * <p>
     * Nothing is appended for a null or empty query, or one with only empty components.
     *
     * @param sb    the builder to append to
     * @param param the query string without the leading '?', may be null
     */
    public static void urlencodeQuery(StringBuilder sb, String param) {
        if (param == null || param.isEmpty()) {
            return;
        }

        String[] queryParts = param.split("&");

        boolean shouldUrlEncode = false;
        for (String queryPart : queryParts) {
            if (needsUrlEncode(queryPart)) {
                shouldUrlEncode = true;
                break;
            }
        }

        boolean first = true;
        for (String queryPart : queryParts) {
            if (queryPart.isEmpty()) continue;

            sb.append(first ? '?' : '&');
            first = false;

            if (!shouldUrlEncode) {
                sb.append(queryPart);
                continue;
            }

            int idx = queryPart.indexOf('=');
            if (idx < 0) {
                sb.append(URLEncoder.encode(queryPart, StandardCharsets.UTF_8));
            } else {
                sb.append(URLEncoder.encode(queryPart.substring(0, idx), StandardCharsets.UTF_8));
                sb.append('=');
                sb.append(URLEncoder.encode(queryPart.substring(idx + 1), StandardCharsets.UTF_8));
            }
        }
    }

    /** Test if the url element needs URL encoding.
     * <p>
     * Note we may have been given an already encoded path element,
     * so '+' and well-formed %XX escapes are accepted alongside the
     * characters approved by {@link #isUrlSafe(int)}.
     *
     * @param urlElement a single path or query component
     * @return true if the element contains any other character
     */
    static boolean needsUrlEncode(String urlElement) {
        final int len = urlElement.length();

        for (int i = 0; i < len; i++) {
            char c = urlElement.charAt(i);

            if (isUrlSafe(c) || c == '+') continue;

            if (c == '%' && i + 2 < len
                    && isHexDigit(urlElement.charAt(i + 1))
                    && isHexDigit(urlElement.charAt(i + 2)))
            {
                i += 2; // skip the two hex digits of the escape
                continue;
            }

            return true;
        }

        return false;
    }

    /** Test if the character is an ASCII letter, digit, or one of '-', '_', '.', '~'. */
    static boolean isUrlSafe(int c) {
        if (c >= 'a' && c <= 'z') return true;
        if (c >= 'A' && c <= 'Z') return true;
        if (c >= '0' && c <= '9') return true;

        return c == '-' || c == '_' || c == '.' || c == '~';
    }

    /** Test if the URL can be handed to {@link URI} as-is, without running
     * the urlencode repair.
     * <p>
     * This is a simple, conservative test: it returns true only for URLs
     * of the shape {@code scheme://authority/path[?query]} that the repair
     * in {@link #parseURILenient(String)} would reproduce character for character,
     * so that we can skip a bunch of allocations and string operations.
     * It does not guarantee that the URL is valid.
     * <p>
     * In particular it returns false for URLs that have a fragment, lack a path,
     * contain empty path segments or empty query components, or contain
     * anything that the repair would encode.
     *
     * @param url the URL to test
     * @return true if the repair can be skipped
     */
    static boolean shouldOmitUrlencodeRepair(String url) {
        final int len = url.length();
        int idx = 0;

        // Validate the scheme: one or more ASCII letters followed by "://"
        while (idx < len && isAsciiAlphabetic(url.charAt(idx))) {
            idx++;
        }
        if (idx == 0 || !url.startsWith("://", idx)) return false;
        idx += 3;

        // Validate the authority, which must be terminated by the leading slash of the path;
        // if there is no path, the lenient parser needs to add one
        boolean hasPath = false;
        while (idx < len) {
            char c = url.charAt(idx++);
            if (c == '/') {
                hasPath = true;
                break;
            }
            if (c == ':' || c == '@') continue;
            if (!isUrlSafe(c)) return false;
        }
        if (!hasPath) return false;

        // Validate the path; an empty segment would be collapsed by the lenient parser,
        // and a '#' fails the isUrlSafe test since the fragment needs to be removed
        boolean hasQuery = false;
        char prev = '/';
        while (idx < len) {
            char c = url.charAt(idx++);
            if (c == '?') {
                hasQuery = true;
                break;
            }
            if (c == '/') {
                if (prev == '/') return false;
            }
            else if (!isUrlSafe(c)) {
                return false;
            }
            prev = c;
        }
        if (!hasQuery) return true;

        // Validate the query; every component must be non-empty and have at most one '=',
        // and '~' is excluded as URLEncoder would rewrite it to %7E
        int partLength = 0;
        boolean seenEquals = false;
        while (idx < len) {
            char c = url.charAt(idx++);
            if (c == '&') {
                if (partLength == 0) return false;
                partLength = 0;
                seenEquals = false;
            }
            else if (c == '=') {
                if (seenEquals) return false;
                seenEquals = true;
                partLength++;
            }
            else if (c == '~' || !isUrlSafe(c)) {
                return false;
            }
            else {
                partLength++;
            }
        }

        return partLength > 0;
    }

    /** Test if the character is an ASCII letter. */
    private static boolean isAsciiAlphabetic(int c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }

    /** Test if the character is a hexadecimal digit. */
    private static boolean isHexDigit(int c) {
        return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
    }

    /** Find the index of the first slash after the scheme separator.
     * <p>
     * The search starts three characters after the first colon in the URL,
     * that is, after the "://" of a URL with an authority.
     *
     * @param url the URL to search
     * @return the index of the slash, or a negative value if there is none
     * @throws URISyntaxException if the URL has no colon, or nothing follows the
     *                            three characters starting at the colon
     */
    private static int findPathIdx(String url) throws URISyntaxException {
        int colonIdx = url.indexOf(':');
        if (colonIdx < 0 || colonIdx + 3 >= url.length()) {
            throw new URISyntaxException(url, "Lacking scheme");
        }
        return url.indexOf('/', colonIdx + 3);
    }

}
|
@@ -1,6 +1,6 @@
|
||||
package nu.marginalia.model;
|
||||
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
@@ -21,25 +21,70 @@ class EdgeUrlTest {
|
||||
new EdgeUrl("https://memex.marginalia.nu/#here")
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParam() throws URISyntaxException {
|
||||
System.out.println(new EdgeUrl("https://memex.marginalia.nu/index.php?id=1").toString());
|
||||
System.out.println(new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
|
||||
}
|
||||
@Test
|
||||
void urlencodeFixer() throws URISyntaxException {
|
||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/#heredoc"));
|
||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign"));
|
||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%22-sign"));
|
||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/\n \"huh\""));
|
||||
void testUriFromString() throws URISyntaxException {
|
||||
// We test these URLs several times as we perform URLEncode-fixing both when parsing the URL and when
|
||||
// converting it back to a string, we want to ensure there are no changes along the way.
|
||||
|
||||
Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/").getPath());
|
||||
Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/").toString());
|
||||
Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/").toString());
|
||||
|
||||
Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").getPath());
|
||||
Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").toString());
|
||||
Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/#heredoc").toString());
|
||||
|
||||
Assertions.assertEquals("/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").getPath());
|
||||
Assertions.assertEquals("https://www.example.com/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").toString());
|
||||
Assertions.assertEquals("https://www.example.com/trailingslash/", new EdgeUrl("https://www.example.com/trailingslash/").toString());
|
||||
|
||||
Assertions.assertEquals("/%-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").getPath());
|
||||
Assertions.assertEquals("https://www.example.com/%25-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").toString());
|
||||
Assertions.assertEquals("https://www.example.com/%25-sign", new EdgeUrl("https://www.example.com/%-sign").toString());
|
||||
|
||||
Assertions.assertEquals("/%-sign/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com//%-sign/\"-sign").getPath());
|
||||
Assertions.assertEquals("https://www.example.com/%25-sign/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com//%-sign/\"-sign").toString());
|
||||
Assertions.assertEquals("https://www.example.com/%25-sign/%22-sign", new EdgeUrl("https://www.example.com//%-sign/\"-sign").toString());
|
||||
|
||||
Assertions.assertEquals("/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").getPath());
|
||||
Assertions.assertEquals("https://www.example.com/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").toString());
|
||||
Assertions.assertEquals("https://www.example.com/%22-sign", new EdgeUrl("https://www.example.com/%22-sign").toString());
|
||||
|
||||
Assertions.assertEquals("/\n \"huh\"", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").getPath());
|
||||
Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").toString());
|
||||
Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", new EdgeUrl("https://www.example.com/\n \"huh\"").toString());
|
||||
|
||||
Assertions.assertEquals("/wiki/Sámi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").getPath());
|
||||
Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").toString());
|
||||
Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", new EdgeUrl("https://en.wikipedia.org/wiki/Sámi").toString());
|
||||
|
||||
Assertions.assertEquals("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k", new EdgeUrl("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k").toString());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testParms() throws URISyntaxException {
|
||||
System.out.println(new EdgeUrl("https://search.marginalia.nu/?id=123"));
|
||||
System.out.println(new EdgeUrl("https://search.marginalia.nu/?t=123"));
|
||||
System.out.println(new EdgeUrl("https://search.marginalia.nu/?v=123"));
|
||||
System.out.println(new EdgeUrl("https://search.marginalia.nu/?m=123"));
|
||||
System.out.println(new EdgeUrl("https://search.marginalia.nu/?follow=123"));
|
||||
Assertions.assertEquals("id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").param);
|
||||
Assertions.assertEquals("https://search.marginalia.nu/?id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").toString());
|
||||
|
||||
Assertions.assertEquals("t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").param);
|
||||
Assertions.assertEquals("https://search.marginalia.nu/?t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").toString());
|
||||
|
||||
Assertions.assertEquals("v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").param);
|
||||
Assertions.assertEquals("https://search.marginalia.nu/?v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").toString());
|
||||
|
||||
Assertions.assertEquals("id=1", new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").param);
|
||||
Assertions.assertEquals("https://memex.marginalia.nu/showthread.php?id=1",
|
||||
new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
|
||||
|
||||
|
||||
Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").param);
|
||||
Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").toString());
|
||||
|
||||
Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").param);
|
||||
Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").toString());
|
||||
|
||||
Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?m=123").param);
|
||||
Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?follow=123").param);
|
||||
}
|
||||
}
|
@@ -59,17 +59,14 @@ public class ProcessAdHocTaskHeartbeatImpl implements AutoCloseable, ProcessAdHo
|
||||
*/
|
||||
@Override
|
||||
public void progress(String step, int stepProgress, int stepCount) {
|
||||
int lastProgress = this.progress;
|
||||
this.step = step;
|
||||
|
||||
|
||||
// off by one since we calculate the progress based on the number of steps,
|
||||
// and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
|
||||
// final progress being 80% and not 100%)
|
||||
|
||||
this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);
|
||||
|
||||
if (this.progress / 10 != lastProgress / 10) {
|
||||
logger.info("ProcessTask {} progress: {}%", taskBase, progress);
|
||||
}
|
||||
}
|
||||
|
||||
/** Wrap a collection to provide heartbeat progress updates as it's iterated through */
|
||||
@Override
|
||||
|
@@ -57,16 +57,13 @@ public class ServiceAdHocTaskHeartbeatImpl implements AutoCloseable, ServiceAdHo
|
||||
*/
|
||||
@Override
|
||||
public void progress(String step, int stepProgress, int stepCount) {
|
||||
int lastProgress = this.progress;
|
||||
this.step = step;
|
||||
|
||||
|
||||
// off by one since we calculate the progress based on the number of steps,
|
||||
// and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
|
||||
// final progress being 80% and not 100%)
|
||||
|
||||
this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);
|
||||
|
||||
logger.info("ServiceTask {} progress: {}%", taskBase, progress);
|
||||
if (this.progress / 10 != lastProgress / 10) {
|
||||
logger.info("ProcessTask {} progress: {}%", taskBase, progress);
|
||||
}
|
||||
}
|
||||
|
||||
public void shutDown() {
|
||||
|
@@ -122,6 +122,11 @@ public class JoobyService {
|
||||
// single digit percentage difference since HTML already compresses very well with level = 1.
|
||||
options.setCompressionLevel(1);
|
||||
|
||||
// Set a cap on the number of worker threads, as Jooby's default value does not seem to consider
|
||||
// multi-tenant servers with high thread counts, and spins up an exorbitant number of threads in that
|
||||
// scenario
|
||||
options.setWorkerThreads(Math.min(128, options.getWorkerThreads()));
|
||||
|
||||
|
||||
jooby.setServerOptions(options);
|
||||
|
||||
|
@@ -48,12 +48,13 @@ public class ExecutorExportClient {
|
||||
return msgId;
|
||||
}
|
||||
|
||||
public void exportSampleData(int node, FileStorageId fid, int size, String name) {
|
||||
public void exportSampleData(int node, FileStorageId fid, int size, String ctFilter, String name) {
|
||||
channelPool.call(ExecutorExportApiBlockingStub::exportSampleData)
|
||||
.forNode(node)
|
||||
.run(RpcExportSampleData.newBuilder()
|
||||
.setFileStorageId(fid.id())
|
||||
.setSize(size)
|
||||
.setCtFilter(ctFilter)
|
||||
.setName(name)
|
||||
.build());
|
||||
}
|
||||
|
@@ -100,6 +100,7 @@ message RpcExportSampleData {
|
||||
int64 fileStorageId = 1;
|
||||
int32 size = 2;
|
||||
string name = 3;
|
||||
string ctFilter = 4;
|
||||
}
|
||||
message RpcDownloadSampleData {
|
||||
string sampleSet = 1;
|
||||
|
@@ -26,32 +26,32 @@ public class ExportSampleDataActor extends RecordActorPrototype {
|
||||
private final MqOutbox exportTasksOutbox;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
public record Export(FileStorageId crawlId, int size, String name) implements ActorStep {}
|
||||
public record Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) implements ActorStep {
|
||||
public Run(FileStorageId crawlId, FileStorageId destId, int size, String name) {
|
||||
this(crawlId, destId, size, name, -1);
|
||||
public record Export(FileStorageId crawlId, int size, String ctFilter, String name) implements ActorStep {}
|
||||
public record Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name, long msgId) implements ActorStep {
|
||||
public Run(FileStorageId crawlId, FileStorageId destId, int size, String name, String ctFilter) {
|
||||
this(crawlId, destId, size, name, ctFilter,-1);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch(self) {
|
||||
case Export(FileStorageId crawlId, int size, String name) -> {
|
||||
case Export(FileStorageId crawlId, int size, String ctFilter, String name) -> {
|
||||
var storage = storageService.allocateStorage(FileStorageType.EXPORT,
|
||||
"crawl-sample-export",
|
||||
"Crawl Data Sample " + name + "/" + size + " " + LocalDateTime.now()
|
||||
);
|
||||
|
||||
if (storage == null) yield new Error("Bad storage id");
|
||||
yield new Run(crawlId, storage.id(), size, name);
|
||||
yield new Run(crawlId, storage.id(), size, ctFilter, name);
|
||||
}
|
||||
case Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) when msgId < 0 -> {
|
||||
case Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name, long msgId) when msgId < 0 -> {
|
||||
storageService.setFileStorageState(destId, FileStorageState.NEW);
|
||||
|
||||
long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, size, name));
|
||||
yield new Run(crawlId, destId, size, name, newMsgId);
|
||||
long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, ctFilter, size, name));
|
||||
yield new Run(crawlId, destId, size, ctFilter, name, newMsgId);
|
||||
}
|
||||
case Run(_, FileStorageId destId, _, _, long msgId) -> {
|
||||
case Run(_, FileStorageId destId, _, _, _, long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
@@ -70,7 +70,7 @@ public class ExportSampleDataActor extends RecordActorPrototype {
|
||||
|
||||
@Override
|
||||
public String describe() {
|
||||
return "Export RSS/Atom feeds from crawl data";
|
||||
return "Export sample crawl data";
|
||||
}
|
||||
|
||||
@Inject
|
||||
|
@@ -49,6 +49,7 @@ public class ExecutorExportGrpcService
|
||||
new ExportSampleDataActor.Export(
|
||||
FileStorageId.of(request.getFileStorageId()),
|
||||
request.getSize(),
|
||||
request.getCtFilter(),
|
||||
request.getName()
|
||||
)
|
||||
);
|
||||
|
@@ -229,13 +229,15 @@ public class FeedFetcherService {
|
||||
.timeout(Duration.ofSeconds(15))
|
||||
;
|
||||
|
||||
if (ifModifiedSinceDate != null) {
|
||||
// Set the If-Modified-Since or If-None-Match headers if we have them
|
||||
// though since there are certain idiosyncrasies in server implementations,
|
||||
// we avoid setting both at the same time as that may turn a 304 into a 200.
|
||||
if (ifNoneMatchTag != null) {
|
||||
requestBuilder.header("If-None-Match", ifNoneMatchTag);
|
||||
} else if (ifModifiedSinceDate != null) {
|
||||
requestBuilder.header("If-Modified-Since", ifModifiedSinceDate);
|
||||
}
|
||||
|
||||
if (ifNoneMatchTag != null) {
|
||||
requestBuilder.header("If-None-Match", ifNoneMatchTag);
|
||||
}
|
||||
|
||||
HttpRequest getRequest = requestBuilder.build();
|
||||
|
||||
|
@@ -264,17 +264,16 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
if (workLog.isJobFinished(crawlSpec.domain))
|
||||
continue;
|
||||
|
||||
var task = new CrawlTask(
|
||||
crawlSpec,
|
||||
anchorTagsSource,
|
||||
outputDir,
|
||||
warcArchiver,
|
||||
domainStateDb,
|
||||
workLog);
|
||||
var task = new CrawlTask(crawlSpec, anchorTagsSource, outputDir, warcArchiver, domainStateDb, workLog);
|
||||
|
||||
// Try to run immediately, to avoid unnecessarily keeping the entire work set in RAM
|
||||
if (!trySubmitDeferredTask(task)) {
|
||||
// Otherwise add to the taskList for deferred execution
|
||||
|
||||
// Drain the retry queue to the taskList, and try to submit any tasks that are in the retry queue
|
||||
retryQueue.drainTo(taskList);
|
||||
taskList.removeIf(this::trySubmitDeferredTask);
|
||||
|
||||
// Then add this new task to the retry queue
|
||||
taskList.add(task);
|
||||
}
|
||||
}
|
||||
|
@@ -19,11 +19,13 @@ public record ContentTags(String etag, String lastMod) {
|
||||
/** Paints the tags onto the request builder. */
|
||||
public void paint(HttpGet request) {
|
||||
|
||||
// Paint the ETag header if present,
|
||||
// otherwise paint the Last-Modified header
|
||||
// (but not both at the same time due to some servers not liking it)
|
||||
|
||||
if (etag != null) {
|
||||
request.addHeader("If-None-Match", etag);
|
||||
}
|
||||
|
||||
if (lastMod != null) {
|
||||
} else if (lastMod != null) {
|
||||
request.addHeader("If-Modified-Since", lastMod);
|
||||
}
|
||||
}
|
||||
|
@@ -51,6 +51,7 @@ import javax.net.ssl.SSLException;
|
||||
import java.io.IOException;
|
||||
import java.net.SocketTimeoutException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.UnknownHostException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
@@ -635,14 +636,12 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
|
||||
|
||||
@Override
|
||||
public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
|
||||
if (exception instanceof SocketTimeoutException) { // Timeouts are not recoverable
|
||||
return false;
|
||||
}
|
||||
if (exception instanceof SSLException) { // SSL exceptions are unlikely to be recoverable
|
||||
return false;
|
||||
}
|
||||
|
||||
return executionCount <= 3;
|
||||
return switch (exception) {
|
||||
case SocketTimeoutException ste -> false;
|
||||
case SSLException ssle -> false;
|
||||
case UnknownHostException uhe -> false;
|
||||
default -> executionCount <= 3;
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@@ -57,6 +57,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
||||
return new ErrorBuffer();
|
||||
}
|
||||
|
||||
Instant start = Instant.now();
|
||||
InputStream is = null;
|
||||
try {
|
||||
is = entity.getContent();
|
||||
@@ -71,8 +72,25 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
||||
}
|
||||
}
|
||||
finally {
|
||||
// We're required to consume the stream to avoid leaking connections,
|
||||
// but we also don't want to get stuck on slow or malicious connections
|
||||
// forever, so we set a time limit on this phase and call abort() if it's exceeded.
|
||||
try {
|
||||
is.skip(Long.MAX_VALUE);
|
||||
while (is != null) {
|
||||
// Consume some data
|
||||
if (is.skip(65536) == 0) {
|
||||
// Note that skip may return 0 if the stream is empty
|
||||
// or for other unspecified reasons, so we need to check
|
||||
// with read() as well to determine if the stream is done
|
||||
if (is.read() == -1)
|
||||
is = null;
|
||||
}
|
||||
// Check if the time limit has been exceeded
|
||||
else if (Duration.between(start, Instant.now()).compareTo(timeLimit) > 0) {
|
||||
request.abort();
|
||||
is = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (IOException e) {
|
||||
// Ignore the exception
|
||||
|
@@ -53,6 +53,8 @@ dependencies {
|
||||
implementation libs.commons.compress
|
||||
implementation libs.commons.codec
|
||||
implementation libs.jsoup
|
||||
implementation libs.slop
|
||||
implementation libs.jwarc
|
||||
|
||||
|
||||
|
||||
|
@@ -3,11 +3,15 @@ package nu.marginalia.extractor;
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.process.log.WorkLog;
|
||||
import nu.marginalia.process.log.WorkLogEntry;
|
||||
import nu.marginalia.slop.SlopCrawlDataRecord;
|
||||
import nu.marginalia.slop.SlopTablePacker;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorage;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
|
||||
import org.apache.commons.compress.utils.IOUtils;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
@@ -27,7 +31,7 @@ public class SampleDataExporter {
|
||||
public SampleDataExporter(FileStorageService storageService) {
|
||||
this.storageService = storageService;
|
||||
}
|
||||
public void export(FileStorageId crawlId, FileStorageId destId, int size, String name) throws SQLException, IOException {
|
||||
public void export(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name) throws SQLException, IOException {
|
||||
FileStorage destStorage = storageService.getStorage(destId);
|
||||
Path inputDir = storageService.getStorage(crawlId).asPath();
|
||||
|
||||
@@ -54,11 +58,6 @@ public class SampleDataExporter {
|
||||
|
||||
Path newCrawlerLogFile = Files.createTempFile(destStorage.asPath(), "crawler", ".log",
|
||||
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
|
||||
try (var bw = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
|
||||
for (var item : entriesAll) {
|
||||
bw.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
|
||||
}
|
||||
}
|
||||
|
||||
Path newManifestJsonFile = Files.createTempFile(destStorage.asPath(), "manifest", ".json",
|
||||
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
|
||||
@@ -67,12 +66,30 @@ public class SampleDataExporter {
|
||||
var tmpTarFile = Files.createTempFile(destStorage.asPath(), "data", ".tar",
|
||||
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
|
||||
|
||||
try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))) {
|
||||
try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING));
|
||||
var logWriter = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)
|
||||
) {
|
||||
for (var item : entriesAll) {
|
||||
Path crawlDataPath = inputDir.resolve(item.relPath());
|
||||
if (!Files.exists(crawlDataPath)) continue;
|
||||
|
||||
if (StringUtils.isBlank(ctFilter)) {
|
||||
addFileToTar(stream, crawlDataPath, item.relPath());
|
||||
logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
|
||||
}
|
||||
else /* ctFilter is not blank */ {
|
||||
Path filteredData = null;
|
||||
try {
|
||||
filteredData = filterEntries(crawlDataPath, ctFilter);
|
||||
addFileToTar(stream, filteredData, item.relPath());
|
||||
logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
|
||||
}
|
||||
finally {
|
||||
if (filteredData != null) {
|
||||
Files.deleteIfExists(filteredData);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
addFileToTar(stream, newCrawlerLogFile, "crawler.log");
|
||||
@@ -86,6 +103,50 @@ public class SampleDataExporter {
|
||||
Files.move(tmpTarFile, destStorage.asPath().resolve("crawl-data.tar"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
|
||||
}
|
||||
|
||||
/** Filters the entries in the crawl data file based on the content type. */
|
||||
private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException {
|
||||
Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered");
|
||||
Path tempFile = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered.slop.zip");
|
||||
|
||||
Files.createDirectory(tempDir);
|
||||
|
||||
try (var writer = new SlopCrawlDataRecord.Writer(tempDir);
|
||||
var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) {
|
||||
@Override
|
||||
public boolean filter(String url, int status, String contentType) {
|
||||
if (contentTypeFilter.equals(contentType))
|
||||
return true;
|
||||
else if (contentType.startsWith("x-marginalia/"))
|
||||
// This is a metadata entry, typically domain or redirect information
|
||||
// let's keep those to not confuse the consumer of the data, which might
|
||||
// expect at least the domain summary
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
) {
|
||||
boolean wroteEntry = false;
|
||||
while (reader.hasRemaining()) {
|
||||
var entry = reader.get();
|
||||
writer.write(entry);
|
||||
|
||||
wroteEntry = wroteEntry || contentTypeFilter.equals(entry.contentType());
|
||||
}
|
||||
|
||||
if (!wroteEntry) {
|
||||
throw new IOException("No relevant entries found");
|
||||
}
|
||||
|
||||
SlopTablePacker.packToSlopZip(tempDir, tempFile);
|
||||
}
|
||||
finally {
|
||||
FileUtils.deleteDirectory(tempDir.toFile());
|
||||
}
|
||||
|
||||
|
||||
return tempFile;
|
||||
}
|
||||
|
||||
private void addFileToTar(TarArchiveOutputStream outputStream, Path file, String fileName) throws IOException {
|
||||
var entry = outputStream.createArchiveEntry(file.toFile(), fileName);
|
||||
entry.setSize(Files.size(file));
|
||||
|
@@ -92,7 +92,7 @@ public class ExportTasksMain extends ProcessMainClass {
|
||||
termFrequencyExporter.export(request.crawlId, request.destId);
|
||||
break;
|
||||
case SAMPLE_DATA:
|
||||
sampleDataExporter.export(request.crawlId, request.destId, request.size, request.name);
|
||||
sampleDataExporter.export(request.crawlId, request.destId, request.size, request.ctFilter, request.name);
|
||||
break;
|
||||
case ADJACENCIES:
|
||||
websiteAdjacenciesCalculator.export();
|
||||
|
@@ -16,6 +16,7 @@ public class ExportTaskRequest {
|
||||
public FileStorageId destId;
|
||||
public int size;
|
||||
public String name;
|
||||
public String ctFilter;
|
||||
|
||||
public ExportTaskRequest(Task task) {
|
||||
this.task = task;
|
||||
@@ -42,12 +43,13 @@ public class ExportTaskRequest {
|
||||
return request;
|
||||
}
|
||||
|
||||
public static ExportTaskRequest sampleData(FileStorageId crawlId, FileStorageId destId, int size, String name) {
|
||||
public static ExportTaskRequest sampleData(FileStorageId crawlId, FileStorageId destId, String ctFilter, int size, String name) {
|
||||
ExportTaskRequest request = new ExportTaskRequest(Task.SAMPLE_DATA);
|
||||
request.crawlId = crawlId;
|
||||
request.destId = destId;
|
||||
request.size = size;
|
||||
request.name = name;
|
||||
request.ctFilter = ctFilter;
|
||||
return request;
|
||||
}
|
||||
|
||||
|
@@ -3,7 +3,7 @@ plugins {
|
||||
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.4'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.5'
|
||||
}
|
||||
|
||||
java {
|
||||
|
@@ -3,7 +3,7 @@ plugins {
|
||||
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.4'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.5'
|
||||
}
|
||||
|
||||
application {
|
||||
|
@@ -3,7 +3,7 @@ plugins {
|
||||
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.4'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.5'
|
||||
}
|
||||
|
||||
application {
|
||||
|
@@ -5,7 +5,7 @@ plugins {
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
|
||||
id 'com.google.cloud.tools.jib' version '3.4.4'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.5'
|
||||
}
|
||||
|
||||
application {
|
||||
|
@@ -3,7 +3,7 @@ plugins {
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
id 'gg.jte.gradle' version '3.1.15'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.4'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.5'
|
||||
}
|
||||
|
||||
application {
|
||||
|
@@ -180,7 +180,7 @@ public class UrlDetails implements Comparable<UrlDetails> {
|
||||
* semantically meaningful codepoints into entity codes */
|
||||
public String displayUrl() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
String urlStr = url.toString();
|
||||
String urlStr = url.toDisplayString();
|
||||
for (int i = 0; i < urlStr.length(); i++) {
|
||||
char c = urlStr.charAt(i);
|
||||
|
||||
|
@@ -27,3 +27,9 @@
|
||||
<link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Marginalia">
|
||||
|
||||
</head>
|
||||
<noscript>
|
||||
<h1>Users of text-based browsers</h1>
|
||||
<p>Consider using the old interface at <a href="https://old-search.marginalia.nu/">https://old-search.marginalia.nu/</a>,
|
||||
as it uses fewer modern CSS tricks and should work better than the new UI. It is functionally nearly identical, but renders the results using a different layout.</p>
|
||||
<hr>
|
||||
</noscript>
|
@@ -26,7 +26,7 @@
|
||||
|
||||
<!-- Main content -->
|
||||
<main class="flex-1 p-4 max-w-2xl space-y-4">
|
||||
<div class="border dark:border-gray-600 rounded bg-white text-black dark:bg-gray-800 dark:text-white text-m p-4">
|
||||
<div class="border border-gray-300 dark:border-gray-600 rounded bg-white text-black dark:bg-gray-800 dark:text-white text-m p-4">
|
||||
<div class="flex space-x-3 place-items-baseline">
|
||||
<i class="fa fa-circle-exclamation text-red-800"></i>
|
||||
<div class="grow">${model.errorTitle()}</div>
|
||||
|
@@ -80,10 +80,6 @@
|
||||
<tr><td>rank>50</td><td>The ranking of the website is at least 50 in a span of 1 - 255</td></tr>
|
||||
<tr><td>rank<50</td><td>The ranking of the website is at most 50 in a span of 1 - 255</td></tr>
|
||||
|
||||
<tr><td>count>10</td><td> The search term must appear in at least 10 results from the domain</td></tr>
|
||||
<tr><td>count<10</td><td> The search term must appear in at most 10 results from the domain</td></tr>
|
||||
|
||||
|
||||
<tr><td>format:html5</td><td>Filter documents using the HTML5 standard. This is typically modern websites.</td></tr>
|
||||
<tr><td>format:xhtml</td><td>Filter documents using the XHTML standard</td></tr>
|
||||
<tr><td>format:html123</td><td>Filter documents using the HTML standards 1, 2, and 3. This is typically very old websites. </td></tr>
|
||||
|
@@ -7,7 +7,7 @@
|
||||
|
||||
<form class="flex-1 max-w-2xl" action="/search">
|
||||
<div class="flex">
|
||||
@if (query.isBlank())
|
||||
@if (query != null && query.isBlank())
|
||||
<%-- Add autofocus if the query is blank --%>
|
||||
<input type="text"
|
||||
class="shadow-inner flex-1 dark:bg-black dark:text-gray-100 bg-gray-50 border dark:border-gray-600 border-gray-300 text-gray-900 text-sm rounded-sm block w-full p-2.5"
|
||||
|
@@ -2,7 +2,7 @@ plugins {
|
||||
id 'java'
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.4'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.5'
|
||||
}
|
||||
|
||||
java {
|
||||
|
@@ -20,6 +20,6 @@ public class StatusModule extends AbstractModule {
|
||||
bind(String.class)
|
||||
.annotatedWith(Names.named("searchEngineTestQuery"))
|
||||
.toInstance(System.getProperty("status-service.public-query",
|
||||
"https://search.marginalia.nu/search?query=plato&ref=marginalia-automatic-metrics"));
|
||||
"https://marginalia-search.com/search?query=plato&ref=marginalia-automatic-metrics"));
|
||||
}
|
||||
}
|
||||
|
@@ -3,7 +3,7 @@ plugins {
|
||||
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.4'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.5'
|
||||
}
|
||||
|
||||
application {
|
||||
|
@@ -68,6 +68,12 @@ public class Suggestions {
|
||||
// Remove junk items we may have gotten from link extraction
|
||||
if (word.startsWith("click here"))
|
||||
continue;
|
||||
if (word.contains("new window"))
|
||||
continue;
|
||||
if (word.contains("click to"))
|
||||
continue;
|
||||
if (word.startsWith("share "))
|
||||
continue;
|
||||
|
||||
if (word.length() > 3) {
|
||||
ret.insert(word, cnt);
|
||||
|
@@ -2,7 +2,7 @@ plugins {
|
||||
id 'java'
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.4'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.5'
|
||||
}
|
||||
|
||||
java {
|
||||
|
@@ -321,9 +321,10 @@ public class ControlNodeActionsService {
|
||||
private Object exportSampleData(Request req, Response rsp) {
|
||||
FileStorageId source = parseSourceFileStorageId(req.queryParams("source"));
|
||||
int size = Integer.parseInt(req.queryParams("size"));
|
||||
String ctFilter = req.queryParams("ctFilter");
|
||||
String name = req.queryParams("name");
|
||||
|
||||
exportClient.exportSampleData(Integer.parseInt(req.params("id")), source, size, name);
|
||||
exportClient.exportSampleData(Integer.parseInt(req.params("id")), source, size, ctFilter, name);
|
||||
|
||||
return "";
|
||||
}
|
||||
|
@@ -35,6 +35,11 @@
|
||||
<div><input type="text" name="size" id="size" pattern="\d+" /></div>
|
||||
<small class="text-muted">How many domains to include in the sample set</small>
|
||||
</div>
|
||||
<div class="mb-3">
|
||||
<label for="ctFilter">Content Type Filter</label>
|
||||
<div><input type="text" name="ctFilter" id="ctFilter" /></div>
|
||||
<small class="text-muted">If set, includes only documents with the specified content type value</small>
|
||||
</div>
|
||||
<div class="mb-3">
|
||||
<label for="name">Name</label>
|
||||
<div><input type="text" name="name" id="name" /></div>
|
||||
|
@@ -3,7 +3,7 @@ plugins {
|
||||
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.4'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.5'
|
||||
}
|
||||
|
||||
application {
|
||||
|
@@ -3,7 +3,7 @@ plugins {
|
||||
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.4'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.5'
|
||||
}
|
||||
|
||||
application {
|
||||
|
@@ -3,7 +3,7 @@ plugins {
|
||||
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.4'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.5'
|
||||
}
|
||||
|
||||
application {
|
||||
|
@@ -3,7 +3,7 @@ plugins {
|
||||
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.4'
|
||||
id 'com.google.cloud.tools.jib' version '3.4.5'
|
||||
}
|
||||
|
||||
java {
|
||||
|
@@ -1,5 +1,9 @@
|
||||
## This is a token file for automatic deployment
|
||||
## This is a token file for triggering automatic deployment when no commit is made.
|
||||
|
||||
2025-01-08: Deploy executor.
|
||||
2025-01-07: Deploy executor.
|
||||
2025-04-24: Deploy executor.
|
||||
2025-04-24: Deploy assistant.
|
||||
2025-05-04: Deploy qs, search and api-services.
|
||||
2025-05-05: Deploy executor partition 4.
|
||||
2025-05-05: Deploy control.
|
||||
|
2
gradle/wrapper/gradle-wrapper.properties
vendored
2
gradle/wrapper/gradle-wrapper.properties
vendored
@@ -1,5 +1,5 @@
|
||||
distributionBase=GRADLE_USER_HOME
|
||||
distributionPath=wrapper/dists
|
||||
distributionUrl=https\://services.gradle.org/distributions/gradle-8.10-bin.zip
|
||||
distributionUrl=https\://services.gradle.org/distributions/gradle-8.14-bin.zip
|
||||
zipStoreBase=GRADLE_USER_HOME
|
||||
zipStorePath=wrapper/dists
|
||||
|
@@ -314,6 +314,13 @@ if __name__ == '__main__':
|
||||
deploy_tier=0,
|
||||
groups={"all", "core"}
|
||||
),
|
||||
'status': ServiceConfig(
|
||||
gradle_target=':code:services-application:status-service:docker',
|
||||
docker_name='status-service',
|
||||
instances=None,
|
||||
deploy_tier=4,
|
||||
groups={"all"}
|
||||
),
|
||||
'query': ServiceConfig(
|
||||
gradle_target=':code:services-core:query-service:docker',
|
||||
docker_name='query-service',
|
||||
|
Reference in New Issue
Block a user