mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
Compare commits
13 Commits
deploy-005
...
deploy-006
Author | SHA1 | Date | |
---|---|---|---|
|
4342e42722 | ||
|
bc818056e6 | ||
|
de2feac238 | ||
|
1e770205a5 | ||
|
e44ecd6d69 | ||
|
5b93a0e633 | ||
|
08fb0e5efe | ||
|
bcf67782ea | ||
|
ef3f175ede | ||
|
bbe4b5d9fd | ||
|
c67a635103 | ||
|
20b24133fb | ||
|
f2567677e8 |
39
ROADMAP.md
39
ROADMAP.md
@@ -1,4 +1,4 @@
|
||||
# Roadmap 2024-2025
|
||||
# Roadmap 2025
|
||||
|
||||
This is a roadmap with major features planned for Marginalia Search.
|
||||
|
||||
@@ -30,12 +30,6 @@ Retaining the ability to independently crawl the web is still strongly desirable
|
||||
The search engine has a bit of a problem showing spicy content mixed in with the results. It would be desirable to have a way to filter this out. It's likely something like a URL blacklist (e.g. [UT1](https://dsi.ut-capitole.fr/blacklists/index_en.php) )
|
||||
combined with naive bayesian filter would go a long way, or something more sophisticated...?
|
||||
|
||||
## Web Design Overhaul
|
||||
|
||||
The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
|
||||
|
||||
In progress: PR [#127](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/127) -- demo available at https://test.marginalia.nu/
|
||||
|
||||
## Additional Language Support
|
||||
|
||||
It would be desirable if the search engine supported more languages than English. This is partially about
|
||||
@@ -62,8 +56,31 @@ filter for any API consumer.
|
||||
|
||||
I've talked to the stract dev and he does not think it's a good idea to mimic their optics language, which is quite ad-hoc, but instead to work together to find some new common description language for this.
|
||||
|
||||
## Show favicons next to search results
|
||||
|
||||
This is expected from search engines. Basic proof of concept sketch of fetching this data has been done, but the feature is some way from being reality.
|
||||
|
||||
## Specialized crawler for github
|
||||
|
||||
One of the search engine's biggest limitations right now is that it does not index github at all. A specialized crawler that fetches at least the readme.md would go a long way toward providing search capabilities in this domain.
|
||||
|
||||
# Completed
|
||||
|
||||
## Web Design Overhaul (COMPLETED 2025-01)
|
||||
|
||||
The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
|
||||
|
||||
PR [#127](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/127)
|
||||
|
||||
## Finalize RSS support (COMPLETED 2024-11)
|
||||
|
||||
Marginalia has experimental RSS preview support for a few domains. This works well and
|
||||
it should be extended to all domains. It would also be interesting to offer search of the
|
||||
RSS data itself, or use the RSS set to feed a special live index that updates faster than the
|
||||
main dataset.
|
||||
|
||||
Completed with PR [#122](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/122) and PR [#125](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/125)
|
||||
|
||||
## Proper Position Index (COMPLETED 2024-09)
|
||||
|
||||
The search engine uses a fixed width bit mask to indicate word positions. It has the benefit
|
||||
@@ -76,11 +93,3 @@ list, as is the civilized way of doing this.
|
||||
|
||||
Completed with PR [#99](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/99)
|
||||
|
||||
## Finalize RSS support (COMPLETED 2024-11)
|
||||
|
||||
Marginalia has experimental RSS preview support for a few domains. This works well and
|
||||
it should be extended to all domains. It would also be interesting to offer search of the
|
||||
RSS data itself, or use the RSS set to feed a special live index that updates faster than the
|
||||
main dataset.
|
||||
|
||||
Completed with PR [#122](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/122) and PR [#125](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/125)
|
||||
|
@@ -47,7 +47,7 @@ ext {
|
||||
dockerImageBase='container-registry.oracle.com/graalvm/jdk:23'
|
||||
dockerImageTag='latest'
|
||||
dockerImageRegistry='marginalia'
|
||||
jibVersion = '3.4.3'
|
||||
jibVersion = '3.4.4'
|
||||
|
||||
}
|
||||
|
||||
|
@@ -21,6 +21,7 @@ import java.util.concurrent.CompletableFuture;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
@Singleton
|
||||
public class IndexClient {
|
||||
@@ -65,7 +66,13 @@ public class IndexClient {
|
||||
totalNumResults.addAndGet(ret.size());
|
||||
return ret;
|
||||
}))
|
||||
.map(CompletableFuture::join)
|
||||
.mapMulti((CompletableFuture<List<RpcDecoratedResultItem>> fut, Consumer<List<RpcDecoratedResultItem>> c) ->{
|
||||
try {
|
||||
c.accept(fut.join());
|
||||
} catch (Exception e) {
|
||||
logger.error("Error while fetching results", e);
|
||||
}
|
||||
})
|
||||
.flatMap(List::stream)
|
||||
.filter(item -> !isBlacklisted(item))
|
||||
.sorted(comparator)
|
||||
|
@@ -22,6 +22,7 @@ import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.*;
|
||||
|
||||
@@ -167,6 +168,19 @@ public class WarcRecorder implements AutoCloseable {
|
||||
warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
|
||||
writer.write(warcRequest);
|
||||
|
||||
if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
|
||||
&& inputBuffer.size() < 2048)
|
||||
{
|
||||
// Fast detection and mitigation of crawler traps that respond with slow
|
||||
// small responses, with a high branching factor
|
||||
|
||||
// Note we bail *after* writing the warc records, this will effectively only
|
||||
// prevent link extraction from the document.
|
||||
|
||||
logger.warn("URL {} took too long to fetch and was too small for the effort", requestUri);
|
||||
return new HttpFetchResult.ResultException(new IOException("Likely crawler trap"));
|
||||
}
|
||||
|
||||
return new HttpFetchResult.ResultOk(responseUri,
|
||||
response.code(),
|
||||
inputBuffer.headers(),
|
||||
|
@@ -84,18 +84,33 @@ public record SearchParameters(WebsiteUrl url,
|
||||
}
|
||||
|
||||
public String renderUrl() {
|
||||
String path = String.format("/search?query=%s&profile=%s&js=%s&adtech=%s&recent=%s&searchTitle=%s&newfilter=%s&page=%d",
|
||||
URLEncoder.encode(query, StandardCharsets.UTF_8),
|
||||
URLEncoder.encode(profile.filterId, StandardCharsets.UTF_8),
|
||||
URLEncoder.encode(js.value, StandardCharsets.UTF_8),
|
||||
URLEncoder.encode(adtech.value, StandardCharsets.UTF_8),
|
||||
URLEncoder.encode(recent.value, StandardCharsets.UTF_8),
|
||||
URLEncoder.encode(searchTitle.value, StandardCharsets.UTF_8),
|
||||
Boolean.valueOf(newFilter).toString(),
|
||||
page
|
||||
);
|
||||
|
||||
return path;
|
||||
StringBuilder pathBuilder = new StringBuilder("/search?");
|
||||
pathBuilder.append("query=").append(URLEncoder.encode(query, StandardCharsets.UTF_8));
|
||||
|
||||
if (profile != SearchProfile.NO_FILTER) {
|
||||
pathBuilder.append("&profile=").append(URLEncoder.encode(profile.filterId, StandardCharsets.UTF_8));
|
||||
}
|
||||
if (js != SearchJsParameter.DEFAULT) {
|
||||
pathBuilder.append("&js=").append(URLEncoder.encode(js.value, StandardCharsets.UTF_8));
|
||||
}
|
||||
if (adtech != SearchAdtechParameter.DEFAULT) {
|
||||
pathBuilder.append("&adtech=").append(URLEncoder.encode(adtech.value, StandardCharsets.UTF_8));
|
||||
}
|
||||
if (recent != SearchRecentParameter.DEFAULT) {
|
||||
pathBuilder.append("&recent=").append(URLEncoder.encode(recent.value, StandardCharsets.UTF_8));
|
||||
}
|
||||
if (searchTitle != SearchTitleParameter.DEFAULT) {
|
||||
pathBuilder.append("&searchTitle=").append(URLEncoder.encode(searchTitle.value, StandardCharsets.UTF_8));
|
||||
}
|
||||
if (page != 1) {
|
||||
pathBuilder.append("&page=").append(page);
|
||||
}
|
||||
if (newFilter) {
|
||||
pathBuilder.append("&newfilter=").append(Boolean.valueOf(newFilter).toString());
|
||||
}
|
||||
|
||||
return pathBuilder.toString();
|
||||
}
|
||||
|
||||
public RpcTemporalBias.Bias temporalBias() {
|
||||
|
@@ -3,27 +3,22 @@ package nu.marginalia.search.command.commands;
|
||||
import com.google.inject.Inject;
|
||||
import io.jooby.MapModelAndView;
|
||||
import io.jooby.ModelAndView;
|
||||
import nu.marginalia.search.JteRenderer;
|
||||
import nu.marginalia.search.SearchOperator;
|
||||
import nu.marginalia.search.command.SearchCommandInterface;
|
||||
import nu.marginalia.search.command.SearchParameters;
|
||||
import nu.marginalia.search.model.DecoratedSearchResults;
|
||||
import nu.marginalia.search.model.NavbarModel;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
public class SearchCommand implements SearchCommandInterface {
|
||||
private final SearchOperator searchOperator;
|
||||
private final JteRenderer jteRenderer;
|
||||
|
||||
|
||||
@Inject
|
||||
public SearchCommand(SearchOperator searchOperator,
|
||||
JteRenderer jteRenderer) throws IOException {
|
||||
public SearchCommand(SearchOperator searchOperator){
|
||||
this.searchOperator = searchOperator;
|
||||
this.jteRenderer = jteRenderer;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@@ -36,10 +36,11 @@
|
||||
|
||||
</div>
|
||||
|
||||
@if (filters.showRecentOption.isSet()) <input type="hidden" name="js" value="${filters.removeJsOption.value()}"> @endif
|
||||
@if (filters.reduceAdtechOption.isSet()) <input type="hidden" name="adtech" value="${filters.reduceAdtechOption.value()}"> @endif
|
||||
@if (filters.searchTitleOption.isSet()) <input type="hidden" name="searchTitle" value="${filters.searchTitleOption.value()}"> @endif
|
||||
@if (filters.showRecentOption.isSet()) <input type="hidden" name="recent" value="${filters.showRecentOption.value()}"> @endif
|
||||
|
||||
<input type="hidden" name="js" value="${filters.removeJsOption.value()}">
|
||||
<input type="hidden" name="adtech" value="${filters.reduceAdtechOption.value()}">
|
||||
<input type="hidden" name="searchTitle" value="${filters.searchTitleOption.value()}">
|
||||
<input type="hidden" name="profile" value="${profile}">
|
||||
<input type="hidden" name="recent" value="${filters.showRecentOption.value()}">
|
||||
|
||||
</form>
|
||||
|
@@ -36,7 +36,7 @@
|
||||
<div class="text-slate-700 dark:text-white text-sm p-4">
|
||||
<div class="fas fa-gift mr-1 text-margeblue dark:text-slate-200"></div>
|
||||
This is the new design and home of Marginalia Search.
|
||||
You can about what this entails <a href="https://about.marginalia-search.com/article/redesign/" class="underline text-liteblue dark:text-blue-200">here</a>.
|
||||
You can read about what this entails <a href="https://about.marginalia-search.com/article/redesign/" class="underline text-liteblue dark:text-blue-200">here</a>.
|
||||
<p class="my-4"></p>
|
||||
The old version of Marginalia Search remains available at
|
||||
<a href="https://old-search.marginalia.nu/" class="underline text-liteblue dark:text-blue-200">https://old-search.marginalia.nu/</a>.
|
||||
|
@@ -72,11 +72,11 @@ services:
|
||||
image: "mariadb:lts"
|
||||
container_name: "mariadb"
|
||||
env_file: "${INSTALL_DIR}/env/mariadb.env"
|
||||
command: ['mysqld', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
|
||||
command: ['mariadbd', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
|
||||
ports:
|
||||
- "127.0.0.1:3306:3306/tcp"
|
||||
healthcheck:
|
||||
test: mysqladmin ping -h 127.0.0.1 -u ${uval} --password=${pval}
|
||||
test: mariadb-admin ping -h 127.0.0.1 -u ${uval} --password=${pval}
|
||||
start_period: 5s
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
|
@@ -103,11 +103,11 @@ services:
|
||||
image: "mariadb:lts"
|
||||
container_name: "mariadb"
|
||||
env_file: "${INSTALL_DIR}/env/mariadb.env"
|
||||
command: ['mysqld', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
|
||||
command: ['mariadbd', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
|
||||
ports:
|
||||
- "127.0.0.1:3306:3306/tcp"
|
||||
healthcheck:
|
||||
test: mysqladmin ping -h 127.0.0.1 -u ${uval} --password=${pval}
|
||||
test: mariadb-admin ping -h 127.0.0.1 -u ${uval} --password=${pval}
|
||||
start_period: 5s
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
|
@@ -129,11 +129,11 @@ services:
|
||||
image: "mariadb:lts"
|
||||
container_name: "mariadb"
|
||||
env_file: "${INSTALL_DIR}/env/mariadb.env"
|
||||
command: ['mysqld', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
|
||||
command: ['mariadbd', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
|
||||
ports:
|
||||
- "127.0.0.1:3306:3306/tcp"
|
||||
healthcheck:
|
||||
test: mysqladmin ping -h 127.0.0.1 -u ${uval} --password=${pval}
|
||||
test: mariadb-admin ping -h 127.0.0.1 -u ${uval} --password=${pval}
|
||||
start_period: 5s
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
|
@@ -3,11 +3,11 @@ services:
|
||||
image: "mariadb:lts"
|
||||
container_name: "mariadb"
|
||||
env_file: "${INSTALL_DIR}/env/mariadb.env"
|
||||
command: ['mysqld', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
|
||||
command: ['mariadbd', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
|
||||
ports:
|
||||
- "127.0.0.1:3306:3306/tcp"
|
||||
healthcheck:
|
||||
test: mysqladmin ping -h 127.0.0.1 -u ${uval} --password=${pval}
|
||||
test: mariadb-admin ping -h 127.0.0.1 -u ${uval} --password=${pval}
|
||||
start_period: 5s
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
|
Reference in New Issue
Block a user