mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
1 Commits
deploy-015
...
deploy-015
Author | SHA1 | Date | |
---|---|---|---|
|
8be88afcf3 |
@@ -31,9 +31,21 @@ public class EdgeUrl implements Serializable {
|
|||||||
|
|
||||||
private static URI parseURI(String url) throws URISyntaxException {
|
private static URI parseURI(String url) throws URISyntaxException {
|
||||||
try {
|
try {
|
||||||
return EdgeUriFactory.uriFromString(url);
|
return new URI(url);
|
||||||
} catch (URISyntaxException ex) {
|
} catch (URISyntaxException _) {
|
||||||
throw new URISyntaxException("Failed to parse URI '" + url + "'", ex.getMessage());
|
try {
|
||||||
|
/* Java's URI parser is a bit too strict in throwing exceptions when there's an error.
|
||||||
|
|
||||||
|
Here on the Internet, standards are like the picture on the box of the frozen pizza,
|
||||||
|
and what you get is more like what's on the inside, we try to patch things instead,
|
||||||
|
just give it a best-effort attempt at cleaning out broken or unnecessary constructions
|
||||||
|
like bad or missing URLEncoding
|
||||||
|
*/
|
||||||
|
return EdgeUriFactory.parseURILenient(url);
|
||||||
|
}
|
||||||
|
catch (URISyntaxException ex2) {
|
||||||
|
throw new URISyntaxException("Failed to parse URI '" + url + "'", ex2.getMessage());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -112,11 +124,10 @@ public class EdgeUrl implements Serializable {
|
|||||||
sb.append(port);
|
sb.append(port);
|
||||||
}
|
}
|
||||||
|
|
||||||
sb.append(path);
|
EdgeUriFactory.urlencodePath(sb, path);
|
||||||
|
|
||||||
if (param != null) {
|
if (param != null) {
|
||||||
sb.append('?');
|
EdgeUriFactory.urlencodeQuery(sb, param);
|
||||||
sb.append(param);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return sb.toString();
|
return sb.toString();
|
||||||
@@ -194,15 +205,8 @@ public class EdgeUrl implements Serializable {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Java's URI parser is a bit too strict in throwing exceptions when there's an error.
|
|
||||||
|
|
||||||
Here on the Internet, standards are like the picture on the box of the frozen pizza,
|
|
||||||
and what you get is more like what's on the inside, we try to patch things instead,
|
|
||||||
just give it a best-effort attempt at cleaning out broken or unnecessary constructions
|
|
||||||
like bad or missing URLEncoding
|
|
||||||
*/
|
|
||||||
class EdgeUriFactory {
|
class EdgeUriFactory {
|
||||||
public static URI uriFromString(String url) throws URISyntaxException {
|
public static URI parseURILenient(String url) throws URISyntaxException {
|
||||||
var s = new StringBuilder();
|
var s = new StringBuilder();
|
||||||
|
|
||||||
int pathIdx = findPathIdx(url);
|
int pathIdx = findPathIdx(url);
|
||||||
@@ -218,14 +222,18 @@ class EdgeUriFactory {
|
|||||||
int queryIdx = url.indexOf('?');
|
int queryIdx = url.indexOf('?');
|
||||||
if (queryIdx < 0) queryIdx = end;
|
if (queryIdx < 0) queryIdx = end;
|
||||||
|
|
||||||
recombinePaths(s, url.substring(pathIdx, queryIdx));
|
urlencodePath(s, url.substring(pathIdx, queryIdx));
|
||||||
if (queryIdx < end) {
|
if (queryIdx < end) {
|
||||||
recombineQueryString(s, url.substring(queryIdx + 1, end));
|
urlencodeQuery(s, url.substring(queryIdx + 1, end));
|
||||||
}
|
}
|
||||||
return new URI(s.toString());
|
return new URI(s.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void recombinePaths(StringBuilder sb, String path) {
|
/** Break apart the path element of a URI into its components, and then
|
||||||
|
* urlencode any component that needs it, and recombine it into a single
|
||||||
|
* path element again.
|
||||||
|
*/
|
||||||
|
public static void urlencodePath(StringBuilder sb, String path) {
|
||||||
if (path == null || path.isEmpty()) {
|
if (path == null || path.isEmpty()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@@ -241,7 +249,7 @@ class EdgeUriFactory {
|
|||||||
|
|
||||||
if (needsUrlEncode(pathPart)) {
|
if (needsUrlEncode(pathPart)) {
|
||||||
sb.append('/');
|
sb.append('/');
|
||||||
sb.append(URLEncoder.encode(pathPart, StandardCharsets.UTF_8));
|
sb.append(URLEncoder.encode(pathPart, StandardCharsets.UTF_8).replace("+", "%20"));
|
||||||
} else {
|
} else {
|
||||||
sb.append('/');
|
sb.append('/');
|
||||||
sb.append(pathPart);
|
sb.append(pathPart);
|
||||||
@@ -254,26 +262,31 @@ class EdgeUriFactory {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void recombineQueryString(StringBuilder sb, String param) {
|
/** Break apart the query element of a URI into its components, and then
|
||||||
|
* urlencode any component that needs it, and recombine it into a single
|
||||||
|
* query element again.
|
||||||
|
*/
|
||||||
|
public static void urlencodeQuery(StringBuilder sb, String param) {
|
||||||
if (param == null || param.isEmpty()) {
|
if (param == null || param.isEmpty()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
sb.append('?');
|
|
||||||
String[] pathParts = StringUtils.split(param, '&');
|
String[] pathParts = StringUtils.split(param, '&');
|
||||||
boolean first = true;
|
boolean first = true;
|
||||||
for (String pathPart : pathParts) {
|
for (String queryPart : pathParts) {
|
||||||
if (pathPart.isEmpty()) continue;
|
if (queryPart.isEmpty()) continue;
|
||||||
|
|
||||||
if (first) {
|
if (first) {
|
||||||
|
sb.append('?');
|
||||||
first = false;
|
first = false;
|
||||||
} else {
|
} else {
|
||||||
sb.append('&');
|
sb.append('&');
|
||||||
}
|
}
|
||||||
if (needsUrlEncode(pathPart)) {
|
|
||||||
sb.append(URLEncoder.encode(pathPart, StandardCharsets.UTF_8));
|
if (needsUrlEncode(queryPart)) {
|
||||||
|
sb.append(URLEncoder.encode(queryPart, StandardCharsets.UTF_8));
|
||||||
} else {
|
} else {
|
||||||
sb.append(pathPart);
|
sb.append(queryPart);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -284,7 +297,7 @@ class EdgeUriFactory {
|
|||||||
* Note we may have been given an already encoded path element,
|
* Note we may have been given an already encoded path element,
|
||||||
* so we include % and + in the list of good characters
|
* so we include % and + in the list of good characters
|
||||||
*/
|
*/
|
||||||
private static boolean needsUrlEncode(String urlElement) {
|
static boolean needsUrlEncode(String urlElement) {
|
||||||
for (int i = 0; i < urlElement.length(); i++) {
|
for (int i = 0; i < urlElement.length(); i++) {
|
||||||
char c = urlElement.charAt(i);
|
char c = urlElement.charAt(i);
|
||||||
|
|
||||||
@@ -311,10 +324,15 @@ class EdgeUriFactory {
|
|||||||
return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
|
return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Find the index of the path element in a URL.
|
||||||
|
* <p></p>
|
||||||
|
* The path element starts after the scheme and authority part of the URL,
|
||||||
|
* which is everything up to and including the first slash after the colon.
|
||||||
|
*/
|
||||||
private static int findPathIdx(String url) throws URISyntaxException {
|
private static int findPathIdx(String url) throws URISyntaxException {
|
||||||
int colonIdx = url.indexOf(':');
|
int colonIdx = url.indexOf(':');
|
||||||
if (colonIdx < 0 || colonIdx + 3 >= url.length()) {
|
if (colonIdx < 0 || colonIdx + 3 >= url.length()) {
|
||||||
throw new URISyntaxException(url, "Lacking protocol");
|
throw new URISyntaxException(url, "Lacking scheme");
|
||||||
}
|
}
|
||||||
return url.indexOf('/', colonIdx + 3);
|
return url.indexOf('/', colonIdx + 3);
|
||||||
}
|
}
|
||||||
|
@@ -24,13 +24,36 @@ class EdgeUrlTest {
|
|||||||
|
|
||||||
@Test
|
@Test
|
||||||
void testUriFromString() throws URISyntaxException {
|
void testUriFromString() throws URISyntaxException {
|
||||||
Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.uriFromString("https://www.example.com/").toString());
|
// We test these URLs several times as we perform URLEncode-fixing both when parsing the URL and when
|
||||||
Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.uriFromString("https://www.example.com/#heredoc").toString());
|
// converting it back to a string, we want to ensure there are no changes along the way.
|
||||||
Assertions.assertEquals("https://www.example.com/trailingslash/", EdgeUriFactory.uriFromString("https://www.example.com/trailingslash/").toString());
|
|
||||||
Assertions.assertEquals("https://www.example.com/%25-sign", EdgeUriFactory.uriFromString("https://www.example.com/%-sign").toString());
|
Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/").getPath());
|
||||||
Assertions.assertEquals("https://www.example.com/%22-sign", EdgeUriFactory.uriFromString("https://www.example.com/%22-sign").toString());
|
Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/").toString());
|
||||||
Assertions.assertEquals("https://www.example.com/%0A+%22huh%22", EdgeUriFactory.uriFromString("https://www.example.com/\n \"huh\"").toString());
|
Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/").toString());
|
||||||
Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", EdgeUriFactory.uriFromString("https://en.wikipedia.org/wiki/Sámi").toString());
|
|
||||||
|
Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").getPath());
|
||||||
|
Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").toString());
|
||||||
|
Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/#heredoc").toString());
|
||||||
|
|
||||||
|
Assertions.assertEquals("/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").getPath());
|
||||||
|
Assertions.assertEquals("https://www.example.com/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").toString());
|
||||||
|
Assertions.assertEquals("https://www.example.com/trailingslash/", new EdgeUrl("https://www.example.com/trailingslash/").toString());
|
||||||
|
|
||||||
|
Assertions.assertEquals("/%-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").getPath());
|
||||||
|
Assertions.assertEquals("https://www.example.com/%25-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").toString());
|
||||||
|
Assertions.assertEquals("https://www.example.com/%25-sign", new EdgeUrl("https://www.example.com/%-sign").toString());
|
||||||
|
|
||||||
|
Assertions.assertEquals("/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").getPath());
|
||||||
|
Assertions.assertEquals("https://www.example.com/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").toString());
|
||||||
|
Assertions.assertEquals("https://www.example.com/%22-sign", new EdgeUrl("https://www.example.com/%22-sign").toString());
|
||||||
|
|
||||||
|
Assertions.assertEquals("/\n \"huh\"", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").getPath());
|
||||||
|
Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").toString());
|
||||||
|
Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", new EdgeUrl("https://www.example.com/\n \"huh\"").toString());
|
||||||
|
|
||||||
|
Assertions.assertEquals("/wiki/Sámi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").getPath());
|
||||||
|
Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").toString());
|
||||||
|
Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", new EdgeUrl("https://en.wikipedia.org/wiki/Sámi").toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
Reference in New Issue
Block a user