mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
5 Commits
deploy-000
...
deploy-001
Author | SHA1 | Date | |
---|---|---|---|
|
4bb71b8439 | ||
|
e4a41f7dd1 | ||
|
69ad6287b1 | ||
|
41a59dcf45 | ||
|
94d4d2edb7 |
10
ROADMAP.md
10
ROADMAP.md
@@ -21,7 +21,7 @@ word n-grams known beforehand. This limits the ability to interpret longer quer
|
|||||||
The positions mask should be supplemented or replaced with a more accurate (e.g.) gamma coded positions
|
The positions mask should be supplemented or replaced with a more accurate (e.g.) gamma coded positions
|
||||||
list, as is the civilized way of doing this.
|
list, as is the civilized way of doing this.
|
||||||
|
|
||||||
Completed with PR https://github.com/MarginaliaSearch/MarginaliaSearch/pull/99
|
Completed with PR [#99](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/99)
|
||||||
|
|
||||||
## Hybridize crawler w/ Common Crawl data
|
## Hybridize crawler w/ Common Crawl data
|
||||||
|
|
||||||
@@ -41,6 +41,12 @@ The search engine has a bit of a problem showing spicy content mixed in with the
|
|||||||
to have a way to filter this out. It's likely that something like a URL blacklist (e.g. [UT1](https://dsi.ut-capitole.fr/blacklists/index_en.php))
|
to have a way to filter this out. It's likely that something like a URL blacklist (e.g. [UT1](https://dsi.ut-capitole.fr/blacklists/index_en.php))
|
||||||
combined with a naive Bayesian filter would go a long way, or something more sophisticated...?
|
combined with a naive Bayesian filter would go a long way, or something more sophisticated...?
|
||||||
|
|
||||||
|
## Web Design Overhaul
|
||||||
|
|
||||||
|
The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
|
||||||
|
|
||||||
|
In progress: PR [#127](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/127) -- demo available at https://test.marginalia.nu/
|
||||||
|
|
||||||
## Additional Language Support
|
## Additional Language Support
|
||||||
|
|
||||||
It would be desirable if the search engine supported more languages than English. This is partially about
|
It would be desirable if the search engine supported more languages than English. This is partially about
|
||||||
@@ -56,7 +62,7 @@ it should be extended to all domains. It would also be interesting to offer sea
|
|||||||
RSS data itself, or use the RSS set to feed a special live index that updates faster than the
|
RSS data itself, or use the RSS set to feed a special live index that updates faster than the
|
||||||
main dataset.
|
main dataset.
|
||||||
|
|
||||||
Completed with PR [#122](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/122)
|
Completed with PR [#122](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/122) and PR [#125](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/125)
|
||||||
|
|
||||||
## Support for binary formats like PDF
|
## Support for binary formats like PDF
|
||||||
|
|
||||||
|
@@ -59,12 +59,6 @@ public class FeedsClient {
|
|||||||
.forEachRemaining(rsp -> consumer.accept(rsp.getDomain(), new ArrayList<>(rsp.getUrlList())));
|
.forEachRemaining(rsp -> consumer.accept(rsp.getDomain(), new ArrayList<>(rsp.getUrlList())));
|
||||||
}
|
}
|
||||||
|
|
||||||
public record UpdatedDomain(String domain, List<String> urls) {
|
|
||||||
public UpdatedDomain(RpcUpdatedLinksResponse rsp) {
|
|
||||||
this(rsp.getDomain(), new ArrayList<>(rsp.getUrlList()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Get the hash of the feed data, for identifying when the data has been updated */
|
/** Get the hash of the feed data, for identifying when the data has been updated */
|
||||||
public String getFeedDataHash() {
|
public String getFeedDataHash() {
|
||||||
return channelPool.call(FeedApiGrpc.FeedApiBlockingStub::getFeedDataHash)
|
return channelPool.call(FeedApiGrpc.FeedApiBlockingStub::getFeedDataHash)
|
||||||
|
@@ -46,6 +46,7 @@ message RpcFeed {
|
|||||||
string feedUrl = 3;
|
string feedUrl = 3;
|
||||||
string updated = 4;
|
string updated = 4;
|
||||||
repeated RpcFeedItem items = 5;
|
repeated RpcFeedItem items = 5;
|
||||||
|
int64 fetchTimestamp = 6;
|
||||||
}
|
}
|
||||||
|
|
||||||
message RpcFeedItem {
|
message RpcFeedItem {
|
||||||
|
@@ -12,9 +12,11 @@ import org.slf4j.Logger;
|
|||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.BufferedInputStream;
|
import java.io.BufferedInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.nio.file.StandardCopyOption;
|
import java.nio.file.StandardCopyOption;
|
||||||
|
import java.nio.file.attribute.PosixFileAttributes;
|
||||||
import java.security.MessageDigest;
|
import java.security.MessageDigest;
|
||||||
import java.time.Instant;
|
import java.time.Instant;
|
||||||
import java.util.Base64;
|
import java.util.Base64;
|
||||||
@@ -209,4 +211,20 @@ public class FeedDb {
|
|||||||
|
|
||||||
reader.getLinksUpdatedSince(since, consumer);
|
reader.getLinksUpdatedSince(since, consumer);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Instant getFetchTime() {
|
||||||
|
if (!Files.exists(readerDbPath)) {
|
||||||
|
return Instant.ofEpochMilli(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
return Files.readAttributes(readerDbPath, PosixFileAttributes.class)
|
||||||
|
.creationTime()
|
||||||
|
.toInstant();
|
||||||
|
}
|
||||||
|
catch (IOException ex) {
|
||||||
|
logger.error("Failed to read the creatiom time of {}", readerDbPath);
|
||||||
|
return Instant.ofEpochMilli(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@@ -316,6 +316,8 @@ public class FeedFetcherService {
|
|||||||
|
|
||||||
public FeedItems parseFeed(String feedData, FeedDefinition definition) {
|
public FeedItems parseFeed(String feedData, FeedDefinition definition) {
|
||||||
try {
|
try {
|
||||||
|
feedData = sanitizeEntities(feedData);
|
||||||
|
|
||||||
List<Item> rawItems = rssReader.read(
|
List<Item> rawItems = rssReader.read(
|
||||||
// Massage the data to maximize the possibility of the flaky XML parser consuming it
|
// Massage the data to maximize the possibility of the flaky XML parser consuming it
|
||||||
new BOMInputStream(new ByteArrayInputStream(feedData.trim().getBytes(StandardCharsets.UTF_8)), false)
|
new BOMInputStream(new ByteArrayInputStream(feedData.trim().getBytes(StandardCharsets.UTF_8)), false)
|
||||||
@@ -342,6 +344,32 @@ public class FeedFetcherService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static final Map<String, String> HTML_ENTITIES = Map.of(
|
||||||
|
"»", "»",
|
||||||
|
"«", "«",
|
||||||
|
"—", "--",
|
||||||
|
"–", "-",
|
||||||
|
"’", "'",
|
||||||
|
"‘", "'",
|
||||||
|
" ", ""
|
||||||
|
);
|
||||||
|
|
||||||
|
/** The XML parser will blow up if you insert HTML entities in the feed XML,
|
||||||
|
* which is unfortunately relatively common. Replace them as far as is possible
|
||||||
|
* with their corresponding characters
|
||||||
|
*/
|
||||||
|
static String sanitizeEntities(String feedData) {
|
||||||
|
String result = feedData;
|
||||||
|
for (Map.Entry<String, String> entry : HTML_ENTITIES.entrySet()) {
|
||||||
|
result = result.replace(entry.getKey(), entry.getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle lone ampersands not part of a recognized XML entity
|
||||||
|
result = result.replaceAll("&(?!(amp|lt|gt|apos|quot);)", "&");
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
/** Decide whether to keep URI fragments in the feed items.
|
/** Decide whether to keep URI fragments in the feed items.
|
||||||
* <p></p>
|
* <p></p>
|
||||||
* We keep fragments if there are multiple different fragments in the items.
|
* We keep fragments if there are multiple different fragments in the items.
|
||||||
|
@@ -107,8 +107,7 @@ public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements Dis
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void getFeed(RpcDomainId request,
|
public void getFeed(RpcDomainId request,
|
||||||
StreamObserver<RpcFeed> responseObserver)
|
StreamObserver<RpcFeed> responseObserver) {
|
||||||
{
|
|
||||||
if (!feedDb.isEnabled()) {
|
if (!feedDb.isEnabled()) {
|
||||||
responseObserver.onError(new IllegalStateException("Feed database is disabled on this node"));
|
responseObserver.onError(new IllegalStateException("Feed database is disabled on this node"));
|
||||||
return;
|
return;
|
||||||
@@ -126,7 +125,8 @@ public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements Dis
|
|||||||
.setDomainId(request.getDomainId())
|
.setDomainId(request.getDomainId())
|
||||||
.setDomain(domainName.get().toString())
|
.setDomain(domainName.get().toString())
|
||||||
.setFeedUrl(feedItems.feedUrl())
|
.setFeedUrl(feedItems.feedUrl())
|
||||||
.setUpdated(feedItems.updated());
|
.setUpdated(feedItems.updated())
|
||||||
|
.setFetchTimestamp(feedDb.getFetchTime().toEpochMilli());
|
||||||
|
|
||||||
for (var item : feedItems.items()) {
|
for (var item : feedItems.items()) {
|
||||||
retB.addItemsBuilder()
|
retB.addItemsBuilder()
|
||||||
|
@@ -99,7 +99,9 @@ class FeedFetcherServiceTest extends AbstractModule {
|
|||||||
feedFetcherService.setDeterministic();
|
feedFetcherService.setDeterministic();
|
||||||
feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
|
feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
|
||||||
|
|
||||||
Assertions.assertFalse(feedDb.getFeed(new EdgeDomain("www.marginalia.nu")).isEmpty());
|
var result = feedDb.getFeed(new EdgeDomain("www.marginalia.nu"));
|
||||||
|
System.out.println(result);
|
||||||
|
Assertions.assertFalse(result.isEmpty());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Tag("flaky")
|
@Tag("flaky")
|
||||||
|
@@ -0,0 +1,26 @@
|
|||||||
|
package nu.marginalia.rss.svc;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Assertions;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
public class TestXmlSanitization {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testPreservedEntities() {
|
||||||
|
Assertions.assertEquals("&", FeedFetcherService.sanitizeEntities("&"));
|
||||||
|
Assertions.assertEquals("<", FeedFetcherService.sanitizeEntities("<"));
|
||||||
|
Assertions.assertEquals(">", FeedFetcherService.sanitizeEntities(">"));
|
||||||
|
Assertions.assertEquals(""", FeedFetcherService.sanitizeEntities("""));
|
||||||
|
Assertions.assertEquals("'", FeedFetcherService.sanitizeEntities("'"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testStrayAmpersand() {
|
||||||
|
Assertions.assertEquals("Bed & Breakfast", FeedFetcherService.sanitizeEntities("Bed & Breakfast"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testTranslatedHtmlEntity() {
|
||||||
|
Assertions.assertEquals("Foo -- Bar", FeedFetcherService.sanitizeEntities("Foo — Bar"));
|
||||||
|
}
|
||||||
|
}
|
@@ -139,7 +139,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
public ContentTypeProbeResult probeContentType(EdgeUrl url,
|
public ContentTypeProbeResult probeContentType(EdgeUrl url,
|
||||||
WarcRecorder warcRecorder,
|
WarcRecorder warcRecorder,
|
||||||
ContentTags tags) throws RateLimitException {
|
ContentTags tags) throws RateLimitException {
|
||||||
if (tags.isEmpty()) {
|
if (tags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) {
|
||||||
var headBuilder = new Request.Builder().head()
|
var headBuilder = new Request.Builder().head()
|
||||||
.addHeader("User-agent", userAgentString)
|
.addHeader("User-agent", userAgentString)
|
||||||
.addHeader("Accept-Encoding", "gzip")
|
.addHeader("Accept-Encoding", "gzip")
|
||||||
|
@@ -42,24 +42,24 @@ class ContentTypeProberTest {
|
|||||||
port = r.nextInt(10000) + 8000;
|
port = r.nextInt(10000) + 8000;
|
||||||
server = HttpServer.create(new InetSocketAddress("127.0.0.1", port), 10);
|
server = HttpServer.create(new InetSocketAddress("127.0.0.1", port), 10);
|
||||||
|
|
||||||
server.createContext("/html", exchange -> {
|
server.createContext("/html.gz", exchange -> {
|
||||||
exchange.getResponseHeaders().add("Content-Type", "text/html");
|
exchange.getResponseHeaders().add("Content-Type", "text/html");
|
||||||
exchange.sendResponseHeaders(200, -1);
|
exchange.sendResponseHeaders(200, -1);
|
||||||
exchange.close();
|
exchange.close();
|
||||||
});
|
});
|
||||||
server.createContext("/redir", exchange -> {
|
server.createContext("/redir.gz", exchange -> {
|
||||||
exchange.getResponseHeaders().add("Location", "/html");
|
exchange.getResponseHeaders().add("Location", "/html.gz");
|
||||||
exchange.sendResponseHeaders(301, -1);
|
exchange.sendResponseHeaders(301, -1);
|
||||||
exchange.close();
|
exchange.close();
|
||||||
});
|
});
|
||||||
|
|
||||||
server.createContext("/bin", exchange -> {
|
server.createContext("/bin.gz", exchange -> {
|
||||||
exchange.getResponseHeaders().add("Content-Type", "application/binary");
|
exchange.getResponseHeaders().add("Content-Type", "application/binary");
|
||||||
exchange.sendResponseHeaders(200, -1);
|
exchange.sendResponseHeaders(200, -1);
|
||||||
exchange.close();
|
exchange.close();
|
||||||
});
|
});
|
||||||
|
|
||||||
server.createContext("/timeout", exchange -> {
|
server.createContext("/timeout.gz", exchange -> {
|
||||||
try {
|
try {
|
||||||
Thread.sleep(15_000);
|
Thread.sleep(15_000);
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
@@ -73,10 +73,10 @@ class ContentTypeProberTest {
|
|||||||
|
|
||||||
server.start();
|
server.start();
|
||||||
|
|
||||||
htmlEndpoint = EdgeUrl.parse("http://localhost:" + port + "/html").get();
|
htmlEndpoint = EdgeUrl.parse("http://localhost:" + port + "/html.gz").get();
|
||||||
binaryEndpoint = EdgeUrl.parse("http://localhost:" + port + "/bin").get();
|
binaryEndpoint = EdgeUrl.parse("http://localhost:" + port + "/bin.gz").get();
|
||||||
timeoutEndpoint = EdgeUrl.parse("http://localhost:" + port + "/timeout").get();
|
timeoutEndpoint = EdgeUrl.parse("http://localhost:" + port + "/timeout.gz").get();
|
||||||
htmlRedirEndpoint = EdgeUrl.parse("http://localhost:" + port + "/redir").get();
|
htmlRedirEndpoint = EdgeUrl.parse("http://localhost:" + port + "/redir.gz").get();
|
||||||
|
|
||||||
fetcher = new HttpFetcherImpl("test");
|
fetcher = new HttpFetcherImpl("test");
|
||||||
recorder = new WarcRecorder(warcFile);
|
recorder = new WarcRecorder(warcFile);
|
||||||
|
Reference in New Issue
Block a user