mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
28 Commits
deploy-015
...
deploy-018
Author | SHA1 | Date | |
---|---|---|---|
|
328fb5d927 | ||
|
5e2b63473e | ||
|
f9590703f1 | ||
|
f12fc11337 | ||
|
c309030184 | ||
|
fd5af01629 | ||
|
d4c43c7a79 | ||
|
18700e1919 | ||
|
120b431998 | ||
|
71dad99326 | ||
|
c1e8afdf86 | ||
|
fa32dddc24 | ||
|
a266fcbf30 | ||
|
6e47e58e0e | ||
|
9dc43d8b4a | ||
|
83967e3305 | ||
|
4db980a291 | ||
|
089b177868 | ||
|
9c8e9a68d5 | ||
|
413d5cc788 | ||
|
58539b92ac | ||
|
fe72f16df1 | ||
|
b49a244a2e | ||
|
3f0b4c010f | ||
|
c6e0cd93f7 | ||
|
80a7ccb080 | ||
|
54dec347c4 | ||
|
d6ee3f0785 |
@@ -31,21 +31,9 @@ public class EdgeUrl implements Serializable {
|
||||
|
||||
private static URI parseURI(String url) throws URISyntaxException {
|
||||
try {
|
||||
return new URI(url);
|
||||
} catch (URISyntaxException _) {
|
||||
try {
|
||||
/* Java's URI parser is a bit too strict in throwing exceptions when there's an error.
|
||||
|
||||
Here on the Internet, standards are like the picture on the box of the frozen pizza,
|
||||
and what you get is more like what's on the inside, we try to patch things instead,
|
||||
just give it a best-effort attempt att cleaning out broken or unnecessary constructions
|
||||
like bad or missing URLEncoding
|
||||
*/
|
||||
return EdgeUriFactory.parseURILenient(url);
|
||||
}
|
||||
catch (URISyntaxException ex2) {
|
||||
throw new URISyntaxException("Failed to parse URI '" + url + "'", ex2.getMessage());
|
||||
}
|
||||
return EdgeUriFactory.parseURILenient(url);
|
||||
} catch (URISyntaxException ex) {
|
||||
throw new URISyntaxException("Failed to parse URI '" + url + "'", ex.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -133,6 +121,28 @@ public class EdgeUrl implements Serializable {
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
||||
public String toDisplayString() {
|
||||
StringBuilder sb = new StringBuilder(256);
|
||||
|
||||
sb.append(proto);
|
||||
sb.append("://");
|
||||
sb.append(domain);
|
||||
|
||||
if (port != null) {
|
||||
sb.append(':');
|
||||
sb.append(port);
|
||||
}
|
||||
|
||||
sb.append(path);
|
||||
|
||||
if (param != null) {
|
||||
sb.append('?').append(param);
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public String dir() {
|
||||
return path.replaceAll("/[^/]+$", "/");
|
||||
}
|
||||
@@ -207,7 +217,17 @@ public class EdgeUrl implements Serializable {
|
||||
|
||||
class EdgeUriFactory {
|
||||
public static URI parseURILenient(String url) throws URISyntaxException {
|
||||
var s = new StringBuilder();
|
||||
|
||||
if (shouldOmitUrlencodeRepair(url)) {
|
||||
try {
|
||||
return new URI(url);
|
||||
}
|
||||
catch (URISyntaxException ex) {
|
||||
// ignore and run the lenient parser
|
||||
}
|
||||
}
|
||||
|
||||
var s = new StringBuilder(url.length()+8);
|
||||
|
||||
int pathIdx = findPathIdx(url);
|
||||
if (pathIdx < 0) { // url looks like http://marginalia.nu
|
||||
@@ -244,10 +264,20 @@ class EdgeUriFactory {
|
||||
return;
|
||||
}
|
||||
|
||||
boolean shouldUrlEncode = false;
|
||||
for (String pathPart : pathParts) {
|
||||
if (pathPart.isEmpty()) continue;
|
||||
|
||||
if (needsUrlEncode(pathPart)) {
|
||||
shouldUrlEncode = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (String pathPart : pathParts) {
|
||||
if (pathPart.isEmpty()) continue;
|
||||
|
||||
if (shouldUrlEncode) {
|
||||
sb.append('/');
|
||||
sb.append(URLEncoder.encode(pathPart, StandardCharsets.UTF_8).replace("+", "%20"));
|
||||
} else {
|
||||
@@ -271,9 +301,20 @@ class EdgeUriFactory {
|
||||
return;
|
||||
}
|
||||
|
||||
String[] pathParts = StringUtils.split(param, '&');
|
||||
String[] queryParts = StringUtils.split(param, '&');
|
||||
|
||||
boolean shouldUrlEncode = false;
|
||||
for (String queryPart : queryParts) {
|
||||
if (queryPart.isEmpty()) continue;
|
||||
|
||||
if (needsUrlEncode(queryPart)) {
|
||||
shouldUrlEncode = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
boolean first = true;
|
||||
for (String queryPart : pathParts) {
|
||||
for (String queryPart : queryParts) {
|
||||
if (queryPart.isEmpty()) continue;
|
||||
|
||||
if (first) {
|
||||
@@ -283,15 +324,21 @@ class EdgeUriFactory {
|
||||
sb.append('&');
|
||||
}
|
||||
|
||||
if (needsUrlEncode(queryPart)) {
|
||||
sb.append(URLEncoder.encode(queryPart, StandardCharsets.UTF_8));
|
||||
if (shouldUrlEncode) {
|
||||
int idx = queryPart.indexOf('=');
|
||||
if (idx < 0) {
|
||||
sb.append(URLEncoder.encode(queryPart, StandardCharsets.UTF_8));
|
||||
} else {
|
||||
sb.append(URLEncoder.encode(queryPart.substring(0, idx), StandardCharsets.UTF_8));
|
||||
sb.append('=');
|
||||
sb.append(URLEncoder.encode(queryPart.substring(idx + 1), StandardCharsets.UTF_8));
|
||||
}
|
||||
} else {
|
||||
sb.append(queryPart);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** Test if the url element needs URL encoding.
|
||||
* <p></p>
|
||||
* Note we may have been given an already encoded path element,
|
||||
@@ -301,10 +348,8 @@ class EdgeUriFactory {
|
||||
for (int i = 0; i < urlElement.length(); i++) {
|
||||
char c = urlElement.charAt(i);
|
||||
|
||||
if (c >= 'a' && c <= 'z') continue;
|
||||
if (c >= 'A' && c <= 'Z') continue;
|
||||
if (c >= '0' && c <= '9') continue;
|
||||
if ("-_.~+?=&".indexOf(c) >= 0) continue;
|
||||
if (isUrlSafe(c)) continue;
|
||||
if ("+".indexOf(c) >= 0) continue;
|
||||
if (c == '%' && i + 2 < urlElement.length()) {
|
||||
char c1 = urlElement.charAt(i + 1);
|
||||
char c2 = urlElement.charAt(i + 2);
|
||||
@@ -320,7 +365,78 @@ class EdgeUriFactory {
|
||||
return false;
|
||||
}
|
||||
|
||||
private static boolean isHexDigit(char c) {
|
||||
|
||||
static boolean isUrlSafe(int c) {
|
||||
if (c >= 'a' && c <= 'z') return true;
|
||||
if (c >= 'A' && c <= 'Z') return true;
|
||||
if (c >= '0' && c <= '9') return true;
|
||||
if (c == '-' || c == '_' || c == '.' || c == '~') return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Test if the URL is a valid URL that does not need to be
|
||||
* urlencoded.
|
||||
* <p></p>
|
||||
* This is a very simple heuristic test that does not guarantee
|
||||
* that the URL is valid, but it will identify cases where we
|
||||
* are fairly certain that the URL does not need encoding,
|
||||
* so we can skip a bunch of allocations and string operations
|
||||
* that would otherwise be needed to fix the URL.
|
||||
*/
|
||||
static boolean shouldOmitUrlencodeRepair(String url) {
|
||||
int idx = 0;
|
||||
final int len = url.length();
|
||||
|
||||
// Validate the scheme
|
||||
while (idx < len - 2) {
|
||||
char c = url.charAt(idx++);
|
||||
if (c == ':') break;
|
||||
if (!isAsciiAlphabetic(c)) return false;
|
||||
}
|
||||
if (url.charAt(idx++) != '/') return false;
|
||||
if (url.charAt(idx++) != '/') return false;
|
||||
|
||||
// Validate the authority
|
||||
while (idx < len) {
|
||||
char c = url.charAt(idx++);
|
||||
if (c == '/') break;
|
||||
if (c == ':') continue;
|
||||
if (c == '@') continue;
|
||||
if (!isUrlSafe(c)) return false;
|
||||
}
|
||||
|
||||
// Validate the path
|
||||
if (idx >= len) return true;
|
||||
|
||||
while (idx < len) {
|
||||
char c = url.charAt(idx++);
|
||||
if (c == '?') break;
|
||||
if (c == '/') continue;
|
||||
if (c == '#') return true;
|
||||
if (!isUrlSafe(c)) return false;
|
||||
}
|
||||
|
||||
if (idx >= len) return true;
|
||||
|
||||
// Validate the query
|
||||
while (idx < len) {
|
||||
char c = url.charAt(idx++);
|
||||
if (c == '&') continue;
|
||||
if (c == '=') continue;
|
||||
if (c == '#') return true;
|
||||
if (!isUrlSafe(c)) return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
private static boolean isAsciiAlphabetic(int c) {
|
||||
return (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
|
||||
}
|
||||
|
||||
private static boolean isHexDigit(int c) {
|
||||
return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
|
||||
}
|
||||
|
||||
|
@@ -43,6 +43,10 @@ class EdgeUrlTest {
|
||||
Assertions.assertEquals("https://www.example.com/%25-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").toString());
|
||||
Assertions.assertEquals("https://www.example.com/%25-sign", new EdgeUrl("https://www.example.com/%-sign").toString());
|
||||
|
||||
Assertions.assertEquals("/%-sign/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com//%-sign/\"-sign").getPath());
|
||||
Assertions.assertEquals("https://www.example.com/%25-sign/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com//%-sign/\"-sign").toString());
|
||||
Assertions.assertEquals("https://www.example.com/%25-sign/%22-sign", new EdgeUrl("https://www.example.com//%-sign/\"-sign").toString());
|
||||
|
||||
Assertions.assertEquals("/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").getPath());
|
||||
Assertions.assertEquals("https://www.example.com/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").toString());
|
||||
Assertions.assertEquals("https://www.example.com/%22-sign", new EdgeUrl("https://www.example.com/%22-sign").toString());
|
||||
@@ -54,16 +58,32 @@ class EdgeUrlTest {
|
||||
Assertions.assertEquals("/wiki/Sámi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").getPath());
|
||||
Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").toString());
|
||||
Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", new EdgeUrl("https://en.wikipedia.org/wiki/Sámi").toString());
|
||||
|
||||
Assertions.assertEquals("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k", new EdgeUrl("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k").toString());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testParms() throws URISyntaxException {
|
||||
Assertions.assertEquals("id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").param);
|
||||
Assertions.assertEquals("https://search.marginalia.nu/?id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").toString());
|
||||
|
||||
Assertions.assertEquals("t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").param);
|
||||
Assertions.assertEquals("https://search.marginalia.nu/?t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").toString());
|
||||
|
||||
Assertions.assertEquals("v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").param);
|
||||
Assertions.assertEquals("https://search.marginalia.nu/?v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").toString());
|
||||
|
||||
Assertions.assertEquals("id=1", new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").param);
|
||||
Assertions.assertEquals("https://memex.marginalia.nu/showthread.php?id=1",
|
||||
new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
|
||||
|
||||
|
||||
Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").param);
|
||||
Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").toString());
|
||||
|
||||
Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").param);
|
||||
Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").toString());
|
||||
|
||||
Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?m=123").param);
|
||||
Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?follow=123").param);
|
||||
}
|
||||
|
@@ -122,6 +122,11 @@ public class JoobyService {
|
||||
// single digit percentage difference since HTML already compresses very well with level = 1.
|
||||
options.setCompressionLevel(1);
|
||||
|
||||
// Set a cap on the number of worker threads, as Jooby's default value does not seem to consider
|
||||
// multi-tenant servers with high thread counts, and spins up an exorbitant number of threads in that
|
||||
// scenario
|
||||
options.setWorkerThreads(Math.min(128, options.getWorkerThreads()));
|
||||
|
||||
|
||||
jooby.setServerOptions(options);
|
||||
|
||||
|
@@ -3,11 +3,18 @@
|
||||
<Console name="Console" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ProcessConsole" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%style{P}{FG_Cyan} %msg%n"/>
|
||||
<Filters>
|
||||
<MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<JSONLayout compact="true" eventEol="true" properties="true" stacktraceAsString="true" includeTimeMillis="true"/>
|
||||
@@ -15,6 +22,7 @@
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
<SizeBasedTriggeringPolicy size="10MB" />
|
||||
</RollingFile>
|
||||
@@ -34,6 +42,7 @@
|
||||
|
||||
<Root level="info">
|
||||
<AppenderRef ref="Console"/>
|
||||
<AppenderRef ref="ProcessConsole"/>
|
||||
<AppenderRef ref="LogToFile"/>
|
||||
</Root>
|
||||
</Loggers>
|
||||
|
@@ -1,13 +1,51 @@
|
||||
<Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
|
||||
<Appenders>
|
||||
<Console name="Console" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
|
||||
<Console name="ConsoleInfo" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="- %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<LevelMatchFilter level="INFO" onMatch="ALLOW" onMismatch="DENY"/>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ConsoleWarn" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="⚠ %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<LevelMatchFilter level="WARN" onMatch="ALLOW" onMismatch="DENY"/>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ConsoleError" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="🔥 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<LevelMatchFilter level="ERROR" onMatch="ALLOW" onMismatch="DENY"/>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ConsoleFatal" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="💀 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<LevelMatchFilter level="FATAL" onMatch="ALLOW" onMismatch="DENY"/>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ProcessConsole" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%style{%msg%n}{FG_Cyan}"/>
|
||||
<Filters>
|
||||
<MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<PatternLayout>
|
||||
@@ -36,7 +74,11 @@
|
||||
<Logger name="org.apache.zookeeper" level="WARN" />
|
||||
|
||||
<Root level="info">
|
||||
<AppenderRef ref="Console"/>
|
||||
<AppenderRef ref="ConsoleInfo"/>
|
||||
<AppenderRef ref="ConsoleWarn"/>
|
||||
<AppenderRef ref="ConsoleError"/>
|
||||
<AppenderRef ref="ConsoleFatal"/>
|
||||
<AppenderRef ref="ProcessConsole"/>
|
||||
<AppenderRef ref="LogToFile"/>
|
||||
</Root>
|
||||
</Loggers>
|
||||
|
@@ -1,15 +1,49 @@
|
||||
<Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
|
||||
<Appenders>
|
||||
<Console name="Console" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
|
||||
<Console name="ConsoleInfo" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="- %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<LevelMatchFilter level="INFO" onMatch="ALLOW" onMismatch="DENY"/>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ConsoleWarn" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="⚠ %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<LevelMatchFilter level="WARN" onMatch="ALLOW" onMismatch="DENY"/>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ConsoleError" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="🔥 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<LevelMatchFilter level="ERROR" onMatch="ALLOW" onMismatch="DENY"/>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ConsoleFatal" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="💀 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<LevelMatchFilter level="FATAL" onMatch="ALLOW" onMismatch="DENY"/>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ProcessConsole" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%style{%msg%n}{FG_Cyan}"/>
|
||||
<Filters>
|
||||
<MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</Console>
|
||||
</Appenders>
|
||||
<Loggers>
|
||||
<Logger name="org.apache.zookeeper" level="WARN" />
|
||||
|
||||
<Root level="info">
|
||||
<AppenderRef ref="Console"/>
|
||||
<AppenderRef ref="LogToFile"/>
|
||||
<AppenderRef ref="ConsoleInfo"/>
|
||||
<AppenderRef ref="ConsoleWarn"/>
|
||||
<AppenderRef ref="ConsoleError"/>
|
||||
<AppenderRef ref="ConsoleFatal"/>
|
||||
<AppenderRef ref="ProcessConsole"/>
|
||||
</Root>
|
||||
</Loggers>
|
||||
</Configuration>
|
@@ -79,9 +79,17 @@ public class SimpleFeedParser {
|
||||
if (!link.isBlank())
|
||||
break;
|
||||
var tag = element.getElementsByTag(attr).first();
|
||||
|
||||
if (tag != null) {
|
||||
link = tag.text();
|
||||
String linkText = tag.text();
|
||||
|
||||
if (linkText.isBlank()) {
|
||||
linkText = tag.attr("href");
|
||||
}
|
||||
|
||||
link = linkText;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
ret.add(new ItemData(title, description, link, pubDate));
|
||||
|
@@ -67,8 +67,6 @@ dependencies {
|
||||
testImplementation libs.mockito
|
||||
testImplementation libs.wiremock
|
||||
|
||||
|
||||
|
||||
testImplementation project(':code:processes:test-data')
|
||||
}
|
||||
|
||||
|
@@ -448,13 +448,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
// We don't have a lock, so we can't run this task
|
||||
// we return to avoid blocking the pool for too long
|
||||
if (lock.isEmpty()) {
|
||||
if (retryQueue.remainingCapacity() > 0) {
|
||||
// Sleep a moment to avoid busy looping via the retry queue
|
||||
// in the case when few tasks remain and almost all are ineligible for
|
||||
// immediate restart
|
||||
Thread.sleep(5);
|
||||
}
|
||||
|
||||
pendingCrawlTasks.remove(domain);
|
||||
retryQueue.put(this);
|
||||
return;
|
||||
}
|
||||
|
@@ -74,7 +74,7 @@ public class CrawlerRevisitor {
|
||||
|
||||
// If the reference document is empty or the HTTP status is not 200, we'll skip it since it's
|
||||
// unlikely to produce anything meaningful for us.
|
||||
if (doc.httpStatus != 200)
|
||||
if (doc.httpStatus != 200 && doc.httpStatus != 206)
|
||||
continue;
|
||||
if (!doc.hasBody())
|
||||
continue;
|
||||
|
@@ -58,7 +58,7 @@ public record DocumentWithReference(
|
||||
if (null == doc)
|
||||
return ContentTags.empty();
|
||||
|
||||
if (doc.documentBodyBytes.length == 0 || doc.httpStatus != 200)
|
||||
if (doc.documentBodyBytes.length == 0 || (doc.httpStatus != 200 && doc.httpStatus != 206))
|
||||
return ContentTags.empty();
|
||||
|
||||
String lastmod = doc.getLastModified();
|
||||
|
@@ -1,19 +1,23 @@
|
||||
package nu.marginalia;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
public class ContentTypes {
|
||||
public static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
|
||||
"application/xhtml",
|
||||
"text/html",
|
||||
"text/markdown",
|
||||
"text/x-markdown",
|
||||
"application/pdf",
|
||||
"image/x-icon",
|
||||
"text/plain");
|
||||
|
||||
public static boolean isAccepted(String contentTypeHeader) {
|
||||
String lcHeader = contentTypeHeader.toLowerCase();
|
||||
String lcHeader = StringUtils.substringBefore(contentTypeHeader.toLowerCase(), ';');
|
||||
for (var type : acceptedContentTypes) {
|
||||
if (lcHeader.startsWith(type)) {
|
||||
if (lcHeader.equals(type)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -21,7 +25,7 @@ public class ContentTypes {
|
||||
}
|
||||
|
||||
public static boolean isBinary(String contentTypeHeader) {
|
||||
String lcHeader = contentTypeHeader.toLowerCase();
|
||||
String lcHeader = StringUtils.substringBefore(contentTypeHeader.toLowerCase(), ';');
|
||||
return lcHeader.startsWith("application/pdf");
|
||||
}
|
||||
|
||||
|
@@ -158,11 +158,12 @@ public record SlopCrawlDataRecord(String domain,
|
||||
// and is used to store old responses from previous crawls; in this part of the logic
|
||||
// we treat them the same as a normal response
|
||||
|
||||
if (!filterResponse(uaString, response)) {
|
||||
var filterStatus = filterResponse(uaString, response);
|
||||
if (filterStatus.isRejected()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
slopWriter.write(domain, response);
|
||||
slopWriter.write(domain, filterStatus, response);
|
||||
} else if (record instanceof WarcXEntityRefused refused) {
|
||||
slopWriter.write(domain, refused);
|
||||
} else if (record instanceof Warcinfo warcinfo) {
|
||||
@@ -187,25 +188,35 @@ public record SlopCrawlDataRecord(String domain,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
sealed interface ResponseFilterResult {
|
||||
default boolean isRejected() { return false; }
|
||||
record Accept() implements ResponseFilterResult {}
|
||||
record AcceptWithContentType(String contentType) implements ResponseFilterResult {}
|
||||
record AcceptIfPlainText(String contentType) implements ResponseFilterResult {}
|
||||
record Reject() implements ResponseFilterResult {
|
||||
@Override
|
||||
public boolean isRejected() { return true; }
|
||||
}
|
||||
}
|
||||
|
||||
/** Return true if the WarcResponse should be excluded from conversion */
|
||||
private static boolean filterResponse(String uaString, WarcResponse response) throws IOException {
|
||||
private static ResponseFilterResult filterResponse(String uaString, WarcResponse response) throws IOException {
|
||||
|
||||
// We don't want to store robots.txt files, as they are not
|
||||
// interesting for the analysis we want to do. This is important
|
||||
// since txt-files in general are interesting, and we don't want to
|
||||
// exclude them as a class.
|
||||
|
||||
if (response.targetURI().getPath().equals("/robots.txt")) {
|
||||
return false;
|
||||
String uriPath = response.targetURI().getPath();
|
||||
if (uriPath.equals("/robots.txt")) {
|
||||
return new ResponseFilterResult.Reject();
|
||||
}
|
||||
|
||||
var headers = response.http().headers();
|
||||
var robotsTags = headers.all("X-Robots-Tag");
|
||||
|
||||
if (!isXRobotsTagsPermitted(robotsTags, uaString)) {
|
||||
return false;
|
||||
return new ResponseFilterResult.Reject();
|
||||
}
|
||||
|
||||
// Strip out responses with content types we aren't interested in
|
||||
@@ -213,15 +224,29 @@ public record SlopCrawlDataRecord(String domain,
|
||||
String contentType = headers.first("Content-Type").orElse("text/plain").toLowerCase();
|
||||
|
||||
if (!ContentTypes.isAccepted(contentType)) {
|
||||
return false;
|
||||
String contentTypeWithoutParams = StringUtils.substringBefore(contentType, ";");
|
||||
|
||||
// Some servers don't understand what a markdown file is
|
||||
if (contentTypeWithoutParams.equals("application/octet-stream")) {
|
||||
if (uriPath.endsWith(".md")) {
|
||||
// This is a markdown file, which we want to keep
|
||||
return new ResponseFilterResult.AcceptIfPlainText("text/markdown");
|
||||
}
|
||||
else if (uriPath.endsWith(".pdf")) {
|
||||
// This is a text file, which we want to keep
|
||||
return new ResponseFilterResult.AcceptWithContentType("application/pdf");
|
||||
}
|
||||
}
|
||||
|
||||
return new ResponseFilterResult.Reject();
|
||||
}
|
||||
|
||||
// If the format is binary, we don't want to translate it if the response is truncated
|
||||
if (response.truncated() != WarcTruncationReason.NOT_TRUNCATED && ContentTypes.isBinary(contentType)) {
|
||||
return false;
|
||||
return new ResponseFilterResult.Reject();
|
||||
}
|
||||
|
||||
return true;
|
||||
return new ResponseFilterResult.Accept();
|
||||
}
|
||||
|
||||
/** Check X-Robots-Tag header tag to see if we are allowed to index this page.
|
||||
@@ -277,7 +302,8 @@ public record SlopCrawlDataRecord(String domain,
|
||||
try (var table = new SlopTable(path)) {
|
||||
ShortColumn.Reader statusReader = statusColumn.open(table);
|
||||
while (statusReader.hasRemaining()) {
|
||||
if (statusReader.get() == 200) {
|
||||
int status = statusReader.get();
|
||||
if (status == 200 || status == 206) {
|
||||
cnt++;
|
||||
}
|
||||
}
|
||||
@@ -323,7 +349,7 @@ public record SlopCrawlDataRecord(String domain,
|
||||
headerColumnWriter.put(record.headers);
|
||||
}
|
||||
|
||||
public void write(String domain, WarcResponse response) throws IOException {
|
||||
public void write(String domain, ResponseFilterResult filterStatus, WarcResponse response) throws IOException {
|
||||
|
||||
HttpFetchResult result = HttpFetchResult.importWarc(response);
|
||||
if (!(result instanceof HttpFetchResult.ResultOk fetchOk)) {
|
||||
@@ -346,6 +372,21 @@ public record SlopCrawlDataRecord(String domain,
|
||||
contentType = "";
|
||||
}
|
||||
|
||||
switch (filterStatus) {
|
||||
case ResponseFilterResult.AcceptWithContentType(String ct) -> contentType = ct;
|
||||
case ResponseFilterResult.AcceptIfPlainText(String ct) -> {
|
||||
try {
|
||||
// Parse the body as UTF-8
|
||||
new String(bodyBytes, StandardCharsets.UTF_8);
|
||||
contentType = ct;
|
||||
}
|
||||
catch (RuntimeException ex) { // UTF-8 decoding failed
|
||||
return;
|
||||
}
|
||||
}
|
||||
default -> {}
|
||||
}
|
||||
|
||||
boolean hasCookies = false;
|
||||
|
||||
String headersStr;
|
||||
|
@@ -117,6 +117,100 @@ class CrawlerRetreiverTest {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void verifyFileFormatSupport() throws IOException {
|
||||
List<String> urls = List.of(
|
||||
"https://www.marginalia.nu/junk/test.pdf",
|
||||
"https://www.marginalia.nu/junk/test.md"
|
||||
);
|
||||
|
||||
var specs = CrawlerMain.CrawlSpecRecord
|
||||
.builder()
|
||||
.crawlDepth(5)
|
||||
.domain("www.marginalia.nu")
|
||||
.urls(urls)
|
||||
.build();
|
||||
Path tempFile = null;
|
||||
Path slopFile = null;
|
||||
try {
|
||||
tempFile = Files.createTempFile("crawling-process", "warc");
|
||||
slopFile = Files.createTempFile("crawling-process", ".slop.zip");
|
||||
|
||||
doCrawl(tempFile, specs);
|
||||
|
||||
Set<String> requests = new HashSet<>();
|
||||
Set<String> responses = new HashSet<>();
|
||||
|
||||
// Inspect the WARC file
|
||||
try (var reader = new WarcReader(tempFile)) {
|
||||
reader.forEach(record -> {
|
||||
if (record instanceof WarcRequest req) {
|
||||
requests.add(req.target());
|
||||
System.out.println(req.type() + ":" + req.target());
|
||||
}
|
||||
else if (record instanceof WarcResponse rsp) {
|
||||
responses.add(rsp.target());
|
||||
try {
|
||||
System.out.println(rsp.type() + ":" + rsp.target() + ":" + rsp.http().contentType());
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
else {
|
||||
System.out.println(record.type());
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
for (var url : urls) {
|
||||
assertTrue(requests.contains(url), "Should have requested " + url);
|
||||
}
|
||||
assertEquals(requests, responses);
|
||||
|
||||
// Convert the WARC file to a Slop file
|
||||
SlopCrawlDataRecord
|
||||
.convertWarc("www.marginalia.nu", new UserAgent("test.marginalia.nu", "test.marginalia.nu"), tempFile, slopFile);
|
||||
|
||||
CrawledDomain domain = null;
|
||||
Map<String, CrawledDocument> documents = new HashMap<>();
|
||||
|
||||
// Extract the contents of the Slop file
|
||||
try (var stream = SerializableCrawlDataStream.openDataStream(slopFile)) {
|
||||
while (stream.hasNext()) {
|
||||
var doc = stream.next();
|
||||
if (doc instanceof CrawledDomain dr) {
|
||||
assertNull(domain);
|
||||
domain = dr;
|
||||
}
|
||||
else if (doc instanceof CrawledDocument dc) {
|
||||
System.out.println(dc.url + "\t" + dc.crawlerStatus + "\t" + dc.httpStatus);
|
||||
documents.put(dc.url, dc);
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
for (var url : urls) {
|
||||
// Verify we have the downloaded files in the Slop file
|
||||
assertNotNull(domain);
|
||||
var fetchedDoc = documents.get(url);
|
||||
assertNotNull(fetchedDoc, "Should have a document for " + url);
|
||||
assertEquals(url, fetchedDoc.url);
|
||||
assertTrue(fetchedDoc.httpStatus == 200 || fetchedDoc.httpStatus == 206, "Should be 200 or 206 for " + url);
|
||||
assertTrue(fetchedDoc.documentBodyBytes.length > 32, "Should have a body for " + url);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
} finally {
|
||||
if (tempFile != null)
|
||||
Files.deleteIfExists(tempFile);
|
||||
if (slopFile != null)
|
||||
Files.deleteIfExists(slopFile);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWarcOutputNoKnownUrls() throws IOException {
|
||||
var specs = CrawlerMain.CrawlSpecRecord
|
||||
|
@@ -1,6 +1,7 @@
|
||||
package nu.marginalia.extractor;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
||||
import nu.marginalia.process.log.WorkLog;
|
||||
import nu.marginalia.process.log.WorkLogEntry;
|
||||
import nu.marginalia.slop.SlopCrawlDataRecord;
|
||||
@@ -20,17 +21,18 @@ import java.nio.file.StandardCopyOption;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.nio.file.attribute.PosixFilePermissions;
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
public class SampleDataExporter {
|
||||
private final FileStorageService storageService;
|
||||
private final ProcessHeartbeat processHeartbeat;
|
||||
|
||||
@Inject
|
||||
public SampleDataExporter(FileStorageService storageService) {
|
||||
public SampleDataExporter(FileStorageService storageService, ProcessHeartbeat processHeartbeat) {
|
||||
this.storageService = storageService;
|
||||
this.processHeartbeat = processHeartbeat;
|
||||
}
|
||||
|
||||
public void export(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name) throws SQLException, IOException {
|
||||
FileStorage destStorage = storageService.getStorage(destId);
|
||||
Path inputDir = storageService.getStorage(crawlId).asPath();
|
||||
@@ -59,12 +61,6 @@ public class SampleDataExporter {
|
||||
Path newCrawlerLogFile = Files.createTempFile(destStorage.asPath(), "crawler", ".log",
|
||||
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
|
||||
|
||||
try (var bw = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
|
||||
for (var item : entriesAll) {
|
||||
bw.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
|
||||
}
|
||||
}
|
||||
|
||||
Path newManifestJsonFile = Files.createTempFile(destStorage.asPath(), "manifest", ".json",
|
||||
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
|
||||
Files.writeString(newManifestJsonFile, " { \"description\": \"" + name.replace("[\"\\]", "_") + "\",\n \"type\": \"CRAWL_DATA\" }\n");
|
||||
@@ -72,29 +68,38 @@ public class SampleDataExporter {
|
||||
var tmpTarFile = Files.createTempFile(destStorage.asPath(), "data", ".tar",
|
||||
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
|
||||
|
||||
try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))) {
|
||||
for (var item : entriesAll) {
|
||||
try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING));
|
||||
var logWriter = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
|
||||
var hb = processHeartbeat.createAdHocTaskHeartbeat("Generating Sample")
|
||||
) {
|
||||
for (var item : hb.wrap("Scanning", entriesAll)) {
|
||||
Path crawlDataPath = inputDir.resolve(item.relPath());
|
||||
if (!Files.exists(crawlDataPath)) continue;
|
||||
|
||||
if (StringUtils.isBlank(ctFilter)) {
|
||||
addFileToTar(stream, crawlDataPath, item.relPath());
|
||||
logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
|
||||
}
|
||||
else /* filter != null */ {
|
||||
boolean didFilterData = false;
|
||||
Path filteredData = null;
|
||||
try {
|
||||
crawlDataPath = filterEntries(crawlDataPath, ctFilter);
|
||||
didFilterData = true;
|
||||
addFileToTar(stream, crawlDataPath, item.relPath());
|
||||
filteredData = filterEntries(crawlDataPath, ctFilter);
|
||||
addFileToTar(stream, filteredData, item.relPath());
|
||||
logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
|
||||
}
|
||||
catch (NoSuchElementException ex) {
|
||||
// Ignore
|
||||
}
|
||||
finally {
|
||||
if (didFilterData) {
|
||||
Files.deleteIfExists(crawlDataPath);
|
||||
if (filteredData != null) {
|
||||
Files.deleteIfExists(filteredData);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
logWriter.flush();
|
||||
|
||||
addFileToTar(stream, newCrawlerLogFile, "crawler.log");
|
||||
addFileToTar(stream, newManifestJsonFile, "marginalia-manifest.json");
|
||||
}
|
||||
@@ -106,34 +111,44 @@ public class SampleDataExporter {
|
||||
Files.move(tmpTarFile, destStorage.asPath().resolve("crawl-data.tar"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
|
||||
}
|
||||
|
||||
/** Filters the entries in the crawl data file based on the content type.
|
||||
* @param crawlDataPath The path to the crawl data file.
|
||||
* @param contentTypeFilter The content type to filter by.
|
||||
* @return The path to the filtered crawl data file, or null if an error occurred.
|
||||
*/
|
||||
private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException {
|
||||
/** Filters the entries in the crawl data file based on the content type. */
|
||||
private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException, NoSuchElementException {
|
||||
Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered");
|
||||
Path tempFile = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered.slop.zip");
|
||||
|
||||
// We may have debris from a previous run, so let's clean it up
|
||||
if (Files.isDirectory(tempDir)) {
|
||||
FileUtils.deleteDirectory(tempDir.toFile());
|
||||
}
|
||||
Files.createDirectory(tempDir);
|
||||
|
||||
boolean wroteEntry = false;
|
||||
|
||||
try (var writer = new SlopCrawlDataRecord.Writer(tempDir);
|
||||
var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) {
|
||||
@Override
|
||||
public boolean filter(String url, int status, String contentType) {
|
||||
if (contentTypeFilter.equals(contentType))
|
||||
return true;
|
||||
else if (contentType.startsWith("x-marginalia/"))
|
||||
// This is a metadata entry, typically domain or redirect information
|
||||
// let's keep those to not confuse the consumer of the data, which might
|
||||
// expect at least the domain summary
|
||||
return true;
|
||||
return false;
|
||||
return Objects.equals(StringUtils.substringBefore(contentType, ';'), contentTypeFilter)
|
||||
|| contentType.startsWith("x-marginalia/"); // metadata records
|
||||
}
|
||||
}
|
||||
) {
|
||||
|
||||
while (reader.hasRemaining()) {
|
||||
writer.write(reader.get());
|
||||
var entry = reader.get();
|
||||
writer.write(entry);
|
||||
|
||||
wroteEntry = wroteEntry || Objects.equals(StringUtils.substringBefore(entry.contentType(), ';'), contentTypeFilter);
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
FileUtils.deleteDirectory(tempDir.toFile());
|
||||
throw ex;
|
||||
}
|
||||
|
||||
try {
|
||||
if (!wroteEntry) {
|
||||
throw new NoSuchElementException("No relevant entries");
|
||||
}
|
||||
|
||||
SlopTablePacker.packToSlopZip(tempDir, tempFile);
|
||||
|
@@ -180,7 +180,7 @@ public class UrlDetails implements Comparable<UrlDetails> {
|
||||
* semantically meaningful codepoints into entity codes */
|
||||
public String displayUrl() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
String urlStr = url.toString();
|
||||
String urlStr = url.toDisplayString();
|
||||
for (int i = 0; i < urlStr.length(); i++) {
|
||||
char c = urlStr.charAt(i);
|
||||
|
||||
|
@@ -20,6 +20,6 @@ public class StatusModule extends AbstractModule {
|
||||
bind(String.class)
|
||||
.annotatedWith(Names.named("searchEngineTestQuery"))
|
||||
.toInstance(System.getProperty("status-service.public-query",
|
||||
"https://search.marginalia.nu/search?query=plato&ref=marginalia-automatic-metrics"));
|
||||
"https://marginalia-search.com/search?query=plato&ref=marginalia-automatic-metrics"));
|
||||
}
|
||||
}
|
||||
|
@@ -3,4 +3,7 @@
|
||||
2025-01-08: Deploy executor.
|
||||
2025-01-07: Deploy executor.
|
||||
2025-04-24: Deploy executor.
|
||||
2025-04-24: Deploy assistant.
|
||||
2025-04-24: Deploy assistant.
|
||||
2025-05-04: Deploy qs, search and api-services.
|
||||
2025-05-05: Deploy executor partition 4.
|
||||
2025-05-05: Deploy control.
|
||||
|
@@ -314,6 +314,13 @@ if __name__ == '__main__':
|
||||
deploy_tier=0,
|
||||
groups={"all", "core"}
|
||||
),
|
||||
'status': ServiceConfig(
|
||||
gradle_target=':code:services-application:status-service:docker',
|
||||
docker_name='status-service',
|
||||
instances=None,
|
||||
deploy_tier=4,
|
||||
groups={"all"}
|
||||
),
|
||||
'query': ServiceConfig(
|
||||
gradle_target=':code:services-core:query-service:docker',
|
||||
docker_name='query-service',
|
||||
|
Reference in New Issue
Block a user