mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
Compare commits
3 Commits
deploy-000
...
deploy-001
Author | SHA1 | Date | |
---|---|---|---|
|
e4a41f7dd1 | ||
|
69ad6287b1 | ||
|
41a59dcf45 |
10
ROADMAP.md
10
ROADMAP.md
@@ -21,7 +21,7 @@ word n-grams known beforehand. This limits the ability to interpret longer quer
|
||||
The positions mask should be supplemented or replaced with a more accurate (e.g.) gamma coded positions
|
||||
list, as is the civilized way of doing this.
|
||||
|
||||
Completed with PR https://github.com/MarginaliaSearch/MarginaliaSearch/pull/99
|
||||
Completed with PR [#99](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/99)
|
||||
|
||||
## Hybridize crawler w/ Common Crawl data
|
||||
|
||||
@@ -41,6 +41,12 @@ The search engine has a bit of a problem showing spicy content mixed in with the
|
||||
to have a way to filter this out. It's likely something like a URL blacklist (e.g. [UT1](https://dsi.ut-capitole.fr/blacklists/index_en.php))
|
||||
combined with a naive Bayesian filter would go a long way, or something more sophisticated...?
|
||||
|
||||
## Web Design Overhaul
|
||||
|
||||
The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
|
||||
|
||||
In progress: PR [#127](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/127) -- demo available at https://test.marginalia.nu/
|
||||
|
||||
## Additional Language Support
|
||||
|
||||
It would be desirable if the search engine supported more languages than English. This is partially about
|
||||
@@ -56,7 +62,7 @@ it should be extended to all domains. It would also be interesting to offer sea
|
||||
RSS data itself, or use the RSS set to feed a special live index that updates faster than the
|
||||
main dataset.
|
||||
|
||||
Completed with PR [#122](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/122)
|
||||
Completed with PR [#122](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/122) and PR [#125](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/125)
|
||||
|
||||
## Support for binary formats like PDF
|
||||
|
||||
|
@@ -316,6 +316,8 @@ public class FeedFetcherService {
|
||||
|
||||
public FeedItems parseFeed(String feedData, FeedDefinition definition) {
|
||||
try {
|
||||
feedData = sanitizeEntities(feedData);
|
||||
|
||||
List<Item> rawItems = rssReader.read(
|
||||
// Massage the data to maximize the possibility of the flaky XML parser consuming it
|
||||
new BOMInputStream(new ByteArrayInputStream(feedData.trim().getBytes(StandardCharsets.UTF_8)), false)
|
||||
@@ -342,6 +344,32 @@ public class FeedFetcherService {
|
||||
}
|
||||
}
|
||||
|
||||
private static final Map<String, String> HTML_ENTITIES = Map.of(
|
||||
"»", "»",
|
||||
"«", "«",
|
||||
"—", "--",
|
||||
"–", "-",
|
||||
"’", "'",
|
||||
"‘", "'",
|
||||
" ", ""
|
||||
);
|
||||
|
||||
/** The XML parser will blow up if you insert HTML entities in the feed XML,
|
||||
* which is unfortunately relatively common. Replace them as far as is possible
|
||||
* with their corresponding characters
|
||||
*/
|
||||
static String sanitizeEntities(String feedData) {
|
||||
String result = feedData;
|
||||
for (Map.Entry<String, String> entry : HTML_ENTITIES.entrySet()) {
|
||||
result = result.replace(entry.getKey(), entry.getValue());
|
||||
}
|
||||
|
||||
// Handle lone ampersands not part of a recognized XML entity
|
||||
result = result.replaceAll("&(?!(amp|lt|gt|apos|quot);)", "&");
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/** Decide whether to keep URI fragments in the feed items.
|
||||
* <p></p>
|
||||
* We keep fragments if there are multiple different fragments in the items.
|
||||
|
@@ -99,7 +99,9 @@ class FeedFetcherServiceTest extends AbstractModule {
|
||||
feedFetcherService.setDeterministic();
|
||||
feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
|
||||
|
||||
Assertions.assertFalse(feedDb.getFeed(new EdgeDomain("www.marginalia.nu")).isEmpty());
|
||||
var result = feedDb.getFeed(new EdgeDomain("www.marginalia.nu"));
|
||||
System.out.println(result);
|
||||
Assertions.assertFalse(result.isEmpty());
|
||||
}
|
||||
|
||||
@Tag("flaky")
|
||||
|
@@ -0,0 +1,26 @@
|
||||
package nu.marginalia.rss.svc;
|
||||
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
public class TestXmlSanitization {
|
||||
|
||||
@Test
|
||||
public void testPreservedEntities() {
|
||||
Assertions.assertEquals("&", FeedFetcherService.sanitizeEntities("&"));
|
||||
Assertions.assertEquals("<", FeedFetcherService.sanitizeEntities("<"));
|
||||
Assertions.assertEquals(">", FeedFetcherService.sanitizeEntities(">"));
|
||||
Assertions.assertEquals(""", FeedFetcherService.sanitizeEntities("""));
|
||||
Assertions.assertEquals("'", FeedFetcherService.sanitizeEntities("'"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testStrayAmpersand() {
|
||||
Assertions.assertEquals("Bed & Breakfast", FeedFetcherService.sanitizeEntities("Bed & Breakfast"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTranslatedHtmlEntity() {
|
||||
Assertions.assertEquals("Foo -- Bar", FeedFetcherService.sanitizeEntities("Foo — Bar"));
|
||||
}
|
||||
}
|
@@ -139,7 +139,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
public ContentTypeProbeResult probeContentType(EdgeUrl url,
|
||||
WarcRecorder warcRecorder,
|
||||
ContentTags tags) throws RateLimitException {
|
||||
if (tags.isEmpty()) {
|
||||
if (tags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) {
|
||||
var headBuilder = new Request.Builder().head()
|
||||
.addHeader("User-agent", userAgentString)
|
||||
.addHeader("Accept-Encoding", "gzip")
|
||||
|
Reference in New Issue
Block a user