mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
8 Commits
deploy-006
...
deploy-006
Author | SHA1 | Date | |
---|---|---|---|
|
4342e42722 | ||
|
bc818056e6 | ||
|
de2feac238 | ||
|
1e770205a5 | ||
|
e44ecd6d69 | ||
|
5b93a0e633 | ||
|
08fb0e5efe | ||
|
bcf67782ea |
39
ROADMAP.md
39
ROADMAP.md
@@ -1,4 +1,4 @@
|
|||||||
# Roadmap 2024-2025
|
# Roadmap 2025
|
||||||
|
|
||||||
This is a roadmap with major features planned for Marginalia Search.
|
This is a roadmap with major features planned for Marginalia Search.
|
||||||
|
|
||||||
@@ -30,12 +30,6 @@ Retaining the ability to independently crawl the web is still strongly desirable
|
|||||||
The search engine has a bit of a problem showing spicy content mixed in with the results. It would be desirable to have a way to filter this out. It's likely something like a URL blacklist (e.g. [UT1](https://dsi.ut-capitole.fr/blacklists/index_en.php) )
|
The search engine has a bit of a problem showing spicy content mixed in with the results. It would be desirable to have a way to filter this out. It's likely something like a URL blacklist (e.g. [UT1](https://dsi.ut-capitole.fr/blacklists/index_en.php) )
|
||||||
combined with naive bayesian filter would go a long way, or something more sophisticated...?
|
combined with naive bayesian filter would go a long way, or something more sophisticated...?
|
||||||
|
|
||||||
## Web Design Overhaul
|
|
||||||
|
|
||||||
The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
|
|
||||||
|
|
||||||
In progress: PR [#127](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/127) -- demo available at https://test.marginalia.nu/
|
|
||||||
|
|
||||||
## Additional Language Support
|
## Additional Language Support
|
||||||
|
|
||||||
It would be desirable if the search engine supported more languages than English. This is partially about
|
It would be desirable if the search engine supported more languages than English. This is partially about
|
||||||
@@ -62,8 +56,31 @@ filter for any API consumer.
|
|||||||
|
|
||||||
I've talked to the stract dev and he does not think it's a good idea to mimic their optics language, which is quite ad-hoc, but instead to work together to find some new common description language for this.
|
I've talked to the stract dev and he does not think it's a good idea to mimic their optics language, which is quite ad-hoc, but instead to work together to find some new common description language for this.
|
||||||
|
|
||||||
|
## Show favicons next to search results
|
||||||
|
|
||||||
|
This is expected from search engines. Basic proof of concept sketch of fetching this data has been done, but the feature is some way from being reality.
|
||||||
|
|
||||||
|
## Specialized crawler for github
|
||||||
|
|
||||||
|
One of the search engine's biggest limitations right now is that it does not index github at all. A specialized crawler that fetches at least the readme.md would go a long way toward providing search capabilities in this domain.
|
||||||
|
|
||||||
# Completed
|
# Completed
|
||||||
|
|
||||||
|
## Web Design Overhaul (COMPLETED 2025-01)
|
||||||
|
|
||||||
|
The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
|
||||||
|
|
||||||
|
PR [#127](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/127)
|
||||||
|
|
||||||
|
## Finalize RSS support (COMPLETED 2024-11)
|
||||||
|
|
||||||
|
Marginalia has experimental RSS preview support for a few domains. This works well and
|
||||||
|
it should be extended to all domains. It would also be interesting to offer search of the
|
||||||
|
RSS data itself, or use the RSS set to feed a special live index that updates faster than the
|
||||||
|
main dataset.
|
||||||
|
|
||||||
|
Completed with PR [#122](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/122) and PR [#125](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/125)
|
||||||
|
|
||||||
## Proper Position Index (COMPLETED 2024-09)
|
## Proper Position Index (COMPLETED 2024-09)
|
||||||
|
|
||||||
The search engine uses a fixed width bit mask to indicate word positions. It has the benefit
|
The search engine uses a fixed width bit mask to indicate word positions. It has the benefit
|
||||||
@@ -76,11 +93,3 @@ list, as is the civilized way of doing this.
|
|||||||
|
|
||||||
Completed with PR [#99](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/99)
|
Completed with PR [#99](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/99)
|
||||||
|
|
||||||
## Finalize RSS support (COMPLETED 2024-11)
|
|
||||||
|
|
||||||
Marginalia has experimental RSS preview support for a few domains. This works well and
|
|
||||||
it should be extended to all domains. It would also be interesting to offer search of the
|
|
||||||
RSS data itself, or use the RSS set to feed a special live index that updates faster than the
|
|
||||||
main dataset.
|
|
||||||
|
|
||||||
Completed with PR [#122](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/122) and PR [#125](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/125)
|
|
||||||
|
@@ -47,7 +47,7 @@ ext {
|
|||||||
dockerImageBase='container-registry.oracle.com/graalvm/jdk:23'
|
dockerImageBase='container-registry.oracle.com/graalvm/jdk:23'
|
||||||
dockerImageTag='latest'
|
dockerImageTag='latest'
|
||||||
dockerImageRegistry='marginalia'
|
dockerImageRegistry='marginalia'
|
||||||
jibVersion = '3.4.3'
|
jibVersion = '3.4.4'
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -22,6 +22,7 @@ import java.nio.charset.StandardCharsets;
|
|||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.security.NoSuchAlgorithmException;
|
import java.security.NoSuchAlgorithmException;
|
||||||
|
import java.time.Duration;
|
||||||
import java.time.Instant;
|
import java.time.Instant;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
@@ -167,6 +168,19 @@ public class WarcRecorder implements AutoCloseable {
|
|||||||
warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
|
warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
|
||||||
writer.write(warcRequest);
|
writer.write(warcRequest);
|
||||||
|
|
||||||
|
if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
|
||||||
|
&& inputBuffer.size() < 2048)
|
||||||
|
{
|
||||||
|
// Fast detection and mitigation of crawler traps that respond with slow
|
||||||
|
// small responses, with a high branching factor
|
||||||
|
|
||||||
|
// Note we bail *after* writing the warc records, this will effectively only
|
||||||
|
// prevent link extraction from the document.
|
||||||
|
|
||||||
|
logger.warn("URL {} took too long to fetch and was too small for the effort", requestUri);
|
||||||
|
return new HttpFetchResult.ResultException(new IOException("Likely crawler trap"));
|
||||||
|
}
|
||||||
|
|
||||||
return new HttpFetchResult.ResultOk(responseUri,
|
return new HttpFetchResult.ResultOk(responseUri,
|
||||||
response.code(),
|
response.code(),
|
||||||
inputBuffer.headers(),
|
inputBuffer.headers(),
|
||||||
|
@@ -36,7 +36,7 @@
|
|||||||
<div class="text-slate-700 dark:text-white text-sm p-4">
|
<div class="text-slate-700 dark:text-white text-sm p-4">
|
||||||
<div class="fas fa-gift mr-1 text-margeblue dark:text-slate-200"></div>
|
<div class="fas fa-gift mr-1 text-margeblue dark:text-slate-200"></div>
|
||||||
This is the new design and home of Marginalia Search.
|
This is the new design and home of Marginalia Search.
|
||||||
You can about what this entails <a href="https://about.marginalia-search.com/article/redesign/" class="underline text-liteblue dark:text-blue-200">here</a>.
|
You can read about what this entails <a href="https://about.marginalia-search.com/article/redesign/" class="underline text-liteblue dark:text-blue-200">here</a>.
|
||||||
<p class="my-4"></p>
|
<p class="my-4"></p>
|
||||||
The old version of Marginalia Search remains available at
|
The old version of Marginalia Search remains available at
|
||||||
<a href="https://old-search.marginalia.nu/" class="underline text-liteblue dark:text-blue-200">https://old-search.marginalia.nu/</a>.
|
<a href="https://old-search.marginalia.nu/" class="underline text-liteblue dark:text-blue-200">https://old-search.marginalia.nu/</a>.
|
||||||
|
@@ -72,11 +72,11 @@ services:
|
|||||||
image: "mariadb:lts"
|
image: "mariadb:lts"
|
||||||
container_name: "mariadb"
|
container_name: "mariadb"
|
||||||
env_file: "${INSTALL_DIR}/env/mariadb.env"
|
env_file: "${INSTALL_DIR}/env/mariadb.env"
|
||||||
command: ['mysqld', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
|
command: ['mariadbd', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
|
||||||
ports:
|
ports:
|
||||||
- "127.0.0.1:3306:3306/tcp"
|
- "127.0.0.1:3306:3306/tcp"
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: mysqladmin ping -h 127.0.0.1 -u ${uval} --password=${pval}
|
test: mariadb-admin ping -h 127.0.0.1 -u ${uval} --password=${pval}
|
||||||
start_period: 5s
|
start_period: 5s
|
||||||
interval: 5s
|
interval: 5s
|
||||||
timeout: 5s
|
timeout: 5s
|
||||||
|
@@ -103,11 +103,11 @@ services:
|
|||||||
image: "mariadb:lts"
|
image: "mariadb:lts"
|
||||||
container_name: "mariadb"
|
container_name: "mariadb"
|
||||||
env_file: "${INSTALL_DIR}/env/mariadb.env"
|
env_file: "${INSTALL_DIR}/env/mariadb.env"
|
||||||
command: ['mysqld', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
|
command: ['mariadbd', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
|
||||||
ports:
|
ports:
|
||||||
- "127.0.0.1:3306:3306/tcp"
|
- "127.0.0.1:3306:3306/tcp"
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: mysqladmin ping -h 127.0.0.1 -u ${uval} --password=${pval}
|
test: mariadb-admin ping -h 127.0.0.1 -u ${uval} --password=${pval}
|
||||||
start_period: 5s
|
start_period: 5s
|
||||||
interval: 5s
|
interval: 5s
|
||||||
timeout: 5s
|
timeout: 5s
|
||||||
|
@@ -129,11 +129,11 @@ services:
|
|||||||
image: "mariadb:lts"
|
image: "mariadb:lts"
|
||||||
container_name: "mariadb"
|
container_name: "mariadb"
|
||||||
env_file: "${INSTALL_DIR}/env/mariadb.env"
|
env_file: "${INSTALL_DIR}/env/mariadb.env"
|
||||||
command: ['mysqld', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
|
command: ['mariadbd', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
|
||||||
ports:
|
ports:
|
||||||
- "127.0.0.1:3306:3306/tcp"
|
- "127.0.0.1:3306:3306/tcp"
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: mysqladmin ping -h 127.0.0.1 -u ${uval} --password=${pval}
|
test: mariadb-admin ping -h 127.0.0.1 -u ${uval} --password=${pval}
|
||||||
start_period: 5s
|
start_period: 5s
|
||||||
interval: 5s
|
interval: 5s
|
||||||
timeout: 5s
|
timeout: 5s
|
||||||
|
@@ -3,11 +3,11 @@ services:
|
|||||||
image: "mariadb:lts"
|
image: "mariadb:lts"
|
||||||
container_name: "mariadb"
|
container_name: "mariadb"
|
||||||
env_file: "${INSTALL_DIR}/env/mariadb.env"
|
env_file: "${INSTALL_DIR}/env/mariadb.env"
|
||||||
command: ['mysqld', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
|
command: ['mariadbd', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
|
||||||
ports:
|
ports:
|
||||||
- "127.0.0.1:3306:3306/tcp"
|
- "127.0.0.1:3306:3306/tcp"
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: mysqladmin ping -h 127.0.0.1 -u ${uval} --password=${pval}
|
test: mariadb-admin ping -h 127.0.0.1 -u ${uval} --password=${pval}
|
||||||
start_period: 5s
|
start_period: 5s
|
||||||
interval: 5s
|
interval: 5s
|
||||||
timeout: 5s
|
timeout: 5s
|
||||||
|
Reference in New Issue
Block a user