1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 17:32:39 +02:00

Compare commits

...

16 Commits

Author SHA1 Message Date
Viktor Lofgren
9dc43d8b4a (sample-actor) Update the actor export sample actor to not generate empty files when the filter is not applicable. 2025-05-05 12:56:12 +02:00
Viktor Lofgren
83967e3305 (sample-actor) Update the actor export sample actor to not generate empty files when the filter is not applicable. 2025-05-05 12:50:21 +02:00
Viktor Lofgren
4db980a291 (jooby-service) Set an upper limit on the number of worker threads 2025-05-05 12:40:31 +02:00
Viktor Lofgren
089b177868 (deploy) Executor partition 4. 2025-05-05 12:21:27 +02:00
Viktor Lofgren
9c8e9a68d5 (deploy) Executor partition 4. 2025-05-05 12:00:05 +02:00
Viktor Lofgren
413d5cc788 (url, minor) Fix typo in test 2025-05-04 16:28:30 +02:00
Viktor Lofgren
58539b92ac (search) Don't show addresses with URLencoding in the UI 2025-05-04 16:26:39 +02:00
Viktor Lofgren
fe72f16df1 (url) Add additional tests for parameter handling 2025-05-04 16:23:39 +02:00
Viktor Lofgren
b49a244a2e (url) Fix encoding handling of query parameters 2025-05-04 16:18:47 +02:00
Viktor Lofgren
3f0b4c010f (deploy) Fix deploy script to be aware of the status service 2025-05-04 16:14:07 +02:00
Viktor Lofgren
c6e0cd93f7 (status) Fix status service to poll the new domain 2025-05-04 16:11:08 +02:00
Viktor Lofgren
80a7ccb080 Trigger redeploy of qs, search and api 2025-05-04 16:07:28 +02:00
Viktor Lofgren
54dec347c4 (url) Fix urlencoding issues with certain symbols
Optimize the code by adding a simple heuristic for guessing whether we need to repair the URI before we pass it to Java's parser.
2025-05-04 13:39:39 +02:00
Viktor Lofgren
d6ee3f0785 (url) Fix urlencoding issues with certain symbols
The urlencoding logic would consider the need to urlencode on an element basis, which is incorrect.  Even if we urlencode on an element basis, we should either urlencode or not urlencode, never a mix of the two.
2025-05-04 13:08:49 +02:00
Viktor Lofgren
8be88afcf3 (url) Fix urlencoding issues with certain symbols
We also need to apply the fix when performing toString() on the EdgeUrl, the URI class will URLDecode the input.

The change also alters the parseURI method to only run the URLEncode-fixer during parsing if URI doesn't throw an exception.  This bad path is obviously going to be slower, but realistically, most URLs are valid, so it's probably a significant optimization to do it like this.
2025-05-04 12:58:13 +02:00
Viktor Lofgren
0e3c00d3e1 (url) Fix urlencoding issues with certain symbols
Minor fix of issue where url sanitizer would strip some trailing slashes.
2025-05-03 23:58:28 +02:00
8 changed files with 263 additions and 60 deletions

View File

@@ -31,7 +31,7 @@ public class EdgeUrl implements Serializable {
private static URI parseURI(String url) throws URISyntaxException {
try {
return EdgeUriFactory.uriFromString(url);
return EdgeUriFactory.parseURILenient(url);
} catch (URISyntaxException ex) {
throw new URISyntaxException("Failed to parse URI '" + url + "'", ex.getMessage());
}
@@ -112,11 +112,32 @@ public class EdgeUrl implements Serializable {
sb.append(port);
}
EdgeUriFactory.urlencodePath(sb, path);
if (param != null) {
EdgeUriFactory.urlencodeQuery(sb, param);
}
return sb.toString();
}
public String toDisplayString() {
StringBuilder sb = new StringBuilder(256);
sb.append(proto);
sb.append("://");
sb.append(domain);
if (port != null) {
sb.append(':');
sb.append(port);
}
sb.append(path);
if (param != null) {
sb.append('?');
sb.append(param);
sb.append('?').append(param);
}
return sb.toString();
@@ -194,16 +215,19 @@ public class EdgeUrl implements Serializable {
}
/* Java's URI parser is a bit too strict in throwing exceptions when there's an error.
Here on the Internet, standards are like the picture on the box of the frozen pizza,
and what you get is more like what's on the inside, we try to patch things instead,
just give it a best-effort attempt att cleaning out broken or unnecessary constructions
like bad or missing URLEncoding
*/
class EdgeUriFactory {
public static URI uriFromString(String url) throws URISyntaxException {
var s = new StringBuilder();
public static URI parseURILenient(String url) throws URISyntaxException {
if (shouldOmitUrlencodeRepair(url)) {
try {
return new URI(url);
}
catch (URISyntaxException ex) {
// ignore and run the lenient parser
}
}
var s = new StringBuilder(url.length()+8);
int pathIdx = findPathIdx(url);
if (pathIdx < 0) { // url looks like http://marginalia.nu
@@ -218,14 +242,18 @@ class EdgeUriFactory {
int queryIdx = url.indexOf('?');
if (queryIdx < 0) queryIdx = end;
recombinePaths(s, url.substring(pathIdx, queryIdx));
urlencodePath(s, url.substring(pathIdx, queryIdx));
if (queryIdx < end) {
recombineQueryString(s, url.substring(queryIdx + 1, end));
urlencodeQuery(s, url.substring(queryIdx + 1, end));
}
return new URI(s.toString());
}
private static void recombinePaths(StringBuilder sb, String path) {
/** Break apart the path element of an URI into its components, and then
* urlencode any component that needs it, and recombine it into a single
* path element again.
*/
public static void urlencodePath(StringBuilder sb, String path) {
if (path == null || path.isEmpty()) {
return;
}
@@ -236,58 +264,92 @@ class EdgeUriFactory {
return;
}
boolean shouldUrlEncode = false;
for (String pathPart : pathParts) {
if (pathPart.isEmpty()) continue;
if (needsUrlEncode(pathPart)) {
shouldUrlEncode = true;
break;
}
}
for (String pathPart : pathParts) {
if (pathPart.isEmpty()) continue;
if (shouldUrlEncode) {
sb.append('/');
sb.append(URLEncoder.encode(pathPart, StandardCharsets.UTF_8));
sb.append(URLEncoder.encode(pathPart, StandardCharsets.UTF_8).replace("+", "%20"));
} else {
sb.append('/');
sb.append(pathPart);
}
}
if (path.endsWith("/")) {
sb.append('/');
}
}
private static void recombineQueryString(StringBuilder sb, String param) {
/** Break apart the query element of a URI into its components, and then
* urlencode any component that needs it, and recombine it into a single
* query element again.
*/
public static void urlencodeQuery(StringBuilder sb, String param) {
if (param == null || param.isEmpty()) {
return;
}
sb.append('?');
String[] pathParts = StringUtils.split(param, '&');
String[] queryParts = StringUtils.split(param, '&');
boolean shouldUrlEncode = false;
for (String queryPart : queryParts) {
if (queryPart.isEmpty()) continue;
if (needsUrlEncode(queryPart)) {
shouldUrlEncode = true;
break;
}
}
boolean first = true;
for (String pathPart : pathParts) {
if (pathPart.isEmpty()) continue;
for (String queryPart : queryParts) {
if (queryPart.isEmpty()) continue;
if (first) {
sb.append('?');
first = false;
} else {
sb.append('&');
}
if (needsUrlEncode(pathPart)) {
sb.append(URLEncoder.encode(pathPart, StandardCharsets.UTF_8));
if (shouldUrlEncode) {
int idx = queryPart.indexOf('=');
if (idx < 0) {
sb.append(URLEncoder.encode(queryPart, StandardCharsets.UTF_8));
} else {
sb.append(URLEncoder.encode(queryPart.substring(0, idx), StandardCharsets.UTF_8));
sb.append('=');
sb.append(URLEncoder.encode(queryPart.substring(idx + 1), StandardCharsets.UTF_8));
}
} else {
sb.append(pathPart);
sb.append(queryPart);
}
}
}
/** Test if the url element needs URL encoding.
* <p></p>
* Note we may have been given an already encoded path element,
* so we include % and + in the list of good characters
*/
private static boolean needsUrlEncode(String urlElement) {
static boolean needsUrlEncode(String urlElement) {
for (int i = 0; i < urlElement.length(); i++) {
char c = urlElement.charAt(i);
if (c >= 'a' && c <= 'z') continue;
if (c >= 'A' && c <= 'Z') continue;
if (c >= '0' && c <= '9') continue;
if ("-_.~+?=&".indexOf(c) >= 0) continue;
if (isUrlSafe(c)) continue;
if ("+".indexOf(c) >= 0) continue;
if (c == '%' && i + 2 < urlElement.length()) {
char c1 = urlElement.charAt(i + 1);
char c2 = urlElement.charAt(i + 2);
@@ -303,14 +365,90 @@ class EdgeUriFactory {
return false;
}
private static boolean isHexDigit(char c) {
static boolean isUrlSafe(int c) {
if (c >= 'a' && c <= 'z') return true;
if (c >= 'A' && c <= 'Z') return true;
if (c >= '0' && c <= '9') return true;
if (c == '-' || c == '_' || c == '.' || c == '~') return true;
return false;
}
/** Test if the URL is a valid URL that does not need to be
* urlencoded.
* <p></p>
* This is a very simple heuristic test that does not guarantee
* that the URL is valid, but it will identify cases where we
* are fairly certain that the URL does not need encoding,
* so we can skip a bunch of allocations and string operations
* that would otherwise be needed to fix the URL.
*/
static boolean shouldOmitUrlencodeRepair(String url) {
int idx = 0;
final int len = url.length();
// Validate the scheme
while (idx < len - 2) {
char c = url.charAt(idx++);
if (c == ':') break;
if (!isAsciiAlphabetic(c)) return false;
}
if (url.charAt(idx++) != '/') return false;
if (url.charAt(idx++) != '/') return false;
// Validate the authority
while (idx < len) {
char c = url.charAt(idx++);
if (c == '/') break;
if (c == ':') continue;
if (c == '@') continue;
if (!isUrlSafe(c)) return false;
}
// Validate the path
if (idx >= len) return true;
while (idx < len) {
char c = url.charAt(idx++);
if (c == '?') break;
if (c == '/') continue;
if (c == '#') return true;
if (!isUrlSafe(c)) return false;
}
if (idx >= len) return true;
// Validate the query
while (idx < len) {
char c = url.charAt(idx++);
if (c == '&') continue;
if (c == '=') continue;
if (c == '#') return true;
if (!isUrlSafe(c)) return false;
}
return true;
}
private static boolean isAsciiAlphabetic(int c) {
return (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
}
private static boolean isHexDigit(int c) {
return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
}
/** Find the index of the path element in a URL.
* <p></p>
* The path element starts after the scheme and authority part of the URL,
* which is everything up to and including the first slash after the colon.
*/
private static int findPathIdx(String url) throws URISyntaxException {
int colonIdx = url.indexOf(':');
if (colonIdx < 0 || colonIdx + 3 >= url.length()) {
throw new URISyntaxException(url, "Lacking protocol");
throw new URISyntaxException(url, "Lacking scheme");
}
return url.indexOf('/', colonIdx + 3);
}

View File

@@ -24,21 +24,66 @@ class EdgeUrlTest {
@Test
void testUriFromString() throws URISyntaxException {
Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.uriFromString("https://www.example.com/#heredoc").toString());
Assertions.assertEquals("https://www.example.com/%25-sign", EdgeUriFactory.uriFromString("https://www.example.com/%-sign").toString());
Assertions.assertEquals("https://www.example.com/%22-sign", EdgeUriFactory.uriFromString("https://www.example.com/%22-sign").toString());
Assertions.assertEquals("https://www.example.com/%0A+%22huh%22", EdgeUriFactory.uriFromString("https://www.example.com/\n \"huh\"").toString());
Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", EdgeUriFactory.uriFromString("https://en.wikipedia.org/wiki/Sámi").toString());
// We test these URLs several times as we perform URLEncode-fixing both when parsing the URL and when
// converting it back to a string, we want to ensure there is no changes along the way.
Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/").getPath());
Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/").toString());
Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/").toString());
Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").getPath());
Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").toString());
Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/#heredoc").toString());
Assertions.assertEquals("/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").getPath());
Assertions.assertEquals("https://www.example.com/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").toString());
Assertions.assertEquals("https://www.example.com/trailingslash/", new EdgeUrl("https://www.example.com/trailingslash/").toString());
Assertions.assertEquals("/%-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").getPath());
Assertions.assertEquals("https://www.example.com/%25-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").toString());
Assertions.assertEquals("https://www.example.com/%25-sign", new EdgeUrl("https://www.example.com/%-sign").toString());
Assertions.assertEquals("/%-sign/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com//%-sign/\"-sign").getPath());
Assertions.assertEquals("https://www.example.com/%25-sign/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com//%-sign/\"-sign").toString());
Assertions.assertEquals("https://www.example.com/%25-sign/%22-sign", new EdgeUrl("https://www.example.com//%-sign/\"-sign").toString());
Assertions.assertEquals("/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").getPath());
Assertions.assertEquals("https://www.example.com/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").toString());
Assertions.assertEquals("https://www.example.com/%22-sign", new EdgeUrl("https://www.example.com/%22-sign").toString());
Assertions.assertEquals("/\n \"huh\"", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").getPath());
Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").toString());
Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", new EdgeUrl("https://www.example.com/\n \"huh\"").toString());
Assertions.assertEquals("/wiki/Sámi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").getPath());
Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").toString());
Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", new EdgeUrl("https://en.wikipedia.org/wiki/Sámi").toString());
Assertions.assertEquals("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k", new EdgeUrl("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k").toString());
}
@Test
void testParms() throws URISyntaxException {
Assertions.assertEquals("id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").param);
Assertions.assertEquals("https://search.marginalia.nu/?id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").toString());
Assertions.assertEquals("t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").param);
Assertions.assertEquals("https://search.marginalia.nu/?t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").toString());
Assertions.assertEquals("v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").param);
Assertions.assertEquals("https://search.marginalia.nu/?v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").toString());
Assertions.assertEquals("id=1", new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").param);
Assertions.assertEquals("https://memex.marginalia.nu/showthread.php?id=1",
new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").param);
Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").toString());
Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").param);
Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").toString());
Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?m=123").param);
Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?follow=123").param);
}

View File

@@ -122,6 +122,11 @@ public class JoobyService {
// single digit percentage difference since HTML already compresses very well with level = 1.
options.setCompressionLevel(1);
// Set a cap on the number of worker threads, as Jooby's default value does not seem to consider
// multi-tenant servers with high thread counts, and spins up an exorbitant number of threads in that
// scenario
options.setWorkerThreads(Math.min(128, options.getWorkerThreads()));
jooby.setServerOptions(options);

View File

@@ -23,6 +23,7 @@ import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.NoSuchElementException;
public class SampleDataExporter {
private final FileStorageService storageService;
@@ -59,12 +60,6 @@ public class SampleDataExporter {
Path newCrawlerLogFile = Files.createTempFile(destStorage.asPath(), "crawler", ".log",
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
try (var bw = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
for (var item : entriesAll) {
bw.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
}
}
Path newManifestJsonFile = Files.createTempFile(destStorage.asPath(), "manifest", ".json",
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
Files.writeString(newManifestJsonFile, " { \"description\": \"" + name.replace("[\"\\]", "_") + "\",\n \"type\": \"CRAWL_DATA\" }\n");
@@ -72,24 +67,30 @@ public class SampleDataExporter {
var tmpTarFile = Files.createTempFile(destStorage.asPath(), "data", ".tar",
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))) {
try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING));
var logWriter = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)
) {
for (var item : entriesAll) {
Path crawlDataPath = inputDir.resolve(item.relPath());
if (!Files.exists(crawlDataPath)) continue;
if (StringUtils.isBlank(ctFilter)) {
addFileToTar(stream, crawlDataPath, item.relPath());
logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
}
else /* filter != null */ {
boolean didFilterData = false;
Path filteredData = null;
try {
crawlDataPath = filterEntries(crawlDataPath, ctFilter);
didFilterData = true;
addFileToTar(stream, crawlDataPath, item.relPath());
filteredData = filterEntries(crawlDataPath, ctFilter);
addFileToTar(stream, filteredData, item.relPath());
logWriter.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
}
catch (NoSuchElementException ex) {
// Ignore
}
finally {
if (didFilterData) {
Files.deleteIfExists(crawlDataPath);
if (filteredData != null) {
Files.deleteIfExists(filteredData);
}
}
}
@@ -106,12 +107,8 @@ public class SampleDataExporter {
Files.move(tmpTarFile, destStorage.asPath().resolve("crawl-data.tar"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
}
/** Filters the entries in the crawl data file based on the content type.
* @param crawlDataPath The path to the crawl data file.
* @param contentTypeFilter The content type to filter by.
* @return The path to the filtered crawl data file, or null if an error occurred.
*/
private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException {
/** Filters the entries in the crawl data file based on the content type. */
private Path filterEntries(Path crawlDataPath, String contentTypeFilter) throws IOException, NoSuchElementException {
Path tempDir = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered");
Path tempFile = crawlDataPath.resolveSibling(crawlDataPath.getFileName() + ".filtered.slop.zip");
@@ -132,8 +129,16 @@ public class SampleDataExporter {
}
}
) {
boolean wroteEntry = false;
while (reader.hasRemaining()) {
writer.write(reader.get());
var entry = reader.get();
writer.write(entry);
wroteEntry = wroteEntry || contentTypeFilter.equals(entry.contentType());
}
if (!wroteEntry) {
throw new NoSuchElementException("No relevant entries");
}
SlopTablePacker.packToSlopZip(tempDir, tempFile);

View File

@@ -180,7 +180,7 @@ public class UrlDetails implements Comparable<UrlDetails> {
* semantically meaningful codepoints into entity codes */
public String displayUrl() {
StringBuilder sb = new StringBuilder();
String urlStr = url.toString();
String urlStr = url.toDisplayString();
for (int i = 0; i < urlStr.length(); i++) {
char c = urlStr.charAt(i);

View File

@@ -20,6 +20,6 @@ public class StatusModule extends AbstractModule {
bind(String.class)
.annotatedWith(Names.named("searchEngineTestQuery"))
.toInstance(System.getProperty("status-service.public-query",
"https://search.marginalia.nu/search?query=plato&ref=marginalia-automatic-metrics"));
"https://marginalia-search.com/search?query=plato&ref=marginalia-automatic-metrics"));
}
}

View File

@@ -4,3 +4,6 @@
2025-01-07: Deploy executor.
2025-04-24: Deploy executor.
2025-04-24: Deploy assistant.
2025-05-04: Deploy qs, search and api-services.
2025-05-05: Deploy executor partition 4.
2025-05-05: Deploy control.

View File

@@ -314,6 +314,13 @@ if __name__ == '__main__':
deploy_tier=0,
groups={"all", "core"}
),
'status': ServiceConfig(
gradle_target=':code:services-application:status-service:docker',
docker_name='status-service',
instances=None,
deploy_tier=4,
groups={"all"}
),
'query': ServiceConfig(
gradle_target=':code:services-core:query-service:docker',
docker_name='query-service',