1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

9 Commits

Author SHA1 Message Date
Viktor Lofgren
b7f0a2a98e (search-service) Fix metrics for errors and request times
This was previously in place, but broke during the jooby migration.
2025-01-08 14:10:43 +01:00
Viktor Lofgren
5fb76b2e79 (search-service) Fix metrics for errors and request times
This was previously in place, but broke during the jooby migration.
2025-01-08 14:06:03 +01:00
Viktor Lofgren
ad8c97f342 (search-service) Begin replacement of the crawl queue mechanism with node_affinity flagging
Previously a special db table was used to hold domains slated for crawling, but this is deprecated, and instead now each domain has a node_affinity flag that decides its indexing state, where a value of -1 indicates it shouldn't be crawled, a value of 0 means it's slated for crawling by the next index partition to be crawled, and a positive value means it's assigned to an index partition.

The change set also adds a test case validating the modified behavior.
2025-01-08 13:25:56 +01:00
Viktor Lofgren
dc1b6373eb (search-service) Clean up readme 2025-01-08 13:04:39 +01:00
Viktor Lofgren
983d6d067c (search-service) Add indexing indicator to sibling domains listing 2025-01-08 12:58:34 +01:00
Viktor Lofgren
a84a06975c (ranking-params) Add disable penalties flag to ranking params
This will help debugging ranking issues.  Later it may be added to some filters.
2025-01-08 00:16:49 +01:00
Viktor Lofgren
d2864c13ec (query-params) Add additional permitted query params 2025-01-07 20:21:44 +01:00
Viktor Lofgren
03ba53ce51 (legacy-search) Update nav bar with correct links 2025-01-07 17:44:52 +01:00
Viktor Lofgren
d4a6684931 (specialization) Soften length requirements for wiki-specialized documents (incl. cppreference) 2025-01-07 15:53:25 +01:00
18 changed files with 200 additions and 33 deletions

View File

@@ -103,11 +103,11 @@ public class DbDomainQueries {
}
}
public List<EdgeDomain> otherSubdomains(EdgeDomain domain, int cnt) {
List<EdgeDomain> ret = new ArrayList<>();
public List<DomainWithNode> otherSubdomains(EdgeDomain domain, int cnt) {
List<DomainWithNode> ret = new ArrayList<>();
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE DOMAIN_TOP = ? LIMIT ?")) {
var stmt = conn.prepareStatement("SELECT DOMAIN_NAME, NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_TOP = ? LIMIT ?")) {
stmt.setString(1, domain.topDomain);
stmt.setInt(2, cnt);
@@ -118,7 +118,7 @@ public class DbDomainQueries {
if (sibling.equals(domain))
continue;
ret.add(sibling);
ret.add(new DomainWithNode(sibling, rs.getInt(2)));
}
} catch (SQLException e) {
logger.error("Failed to get domain neighbors");
@@ -126,4 +126,10 @@ public class DbDomainQueries {
return ret;
}
/**
 * A domain together with its node affinity value.
 *
 * @param domain       the domain
 * @param nodeAffinity the node affinity value associated with the domain
 */
public record DomainWithNode(EdgeDomain domain, int nodeAffinity) {

    /**
     * Tells whether the node affinity is strictly positive.
     *
     * @return {@code true} for a node affinity of 1 or more, {@code false} for zero
     *         or any negative value
     */
    public boolean isIndexed() {
        return nodeAffinity >= 1;
    }
}
}

View File

@@ -83,6 +83,11 @@ public class QueryParams {
if (path.endsWith("StoryView.py")) { // folklore.org is neat
return param.startsWith("project=") || param.startsWith("story=");
}
// www.perseus.tufts.edu:
if (param.startsWith("collection=")) return true;
if (param.startsWith("doc=")) return true;
return false;
}
}

View File

@@ -121,6 +121,7 @@ public class IndexProtobufCodec {
params.getTcfProximityWeight(),
ResultRankingParameters.TemporalBias.valueOf(params.getTemporalBias().getBias().name()),
params.getTemporalBiasWeight(),
params.getDisablePenalties(),
params.getExportDebugData()
);
}
@@ -146,6 +147,7 @@ public class IndexProtobufCodec {
.setTcfProximityWeight(rankingParams.tcfProximity)
.setTcfVerbatimWeight(rankingParams.tcfVerbatim)
.setTemporalBiasWeight(rankingParams.temporalBiasWeight)
.setDisablePenalties(rankingParams.disablePenalties)
.setExportDebugData(rankingParams.exportDebugData);
if (temporalBias != null && temporalBias.getBias() != RpcTemporalBias.Bias.NONE) {

View File

@@ -42,12 +42,14 @@ public class ResultRankingParameters {
public double tcfVerbatim;
public double tcfProximity;
public TemporalBias temporalBias;
public double temporalBiasWeight;
public boolean disablePenalties;
public boolean exportDebugData;
public ResultRankingParameters(Bm25Parameters bm25Params, int shortDocumentThreshold, double shortDocumentPenalty, double domainRankBonus, double qualityPenalty, int shortSentenceThreshold, double shortSentencePenalty, double bm25Weight, double tcfFirstPosition, double tcfVerbatim, double tcfProximity, TemporalBias temporalBias, double temporalBiasWeight, boolean exportDebugData) {
public ResultRankingParameters(Bm25Parameters bm25Params, int shortDocumentThreshold, double shortDocumentPenalty, double domainRankBonus, double qualityPenalty, int shortSentenceThreshold, double shortSentencePenalty, double bm25Weight, double tcfFirstPosition, double tcfVerbatim, double tcfProximity, TemporalBias temporalBias, double temporalBiasWeight, boolean disablePenalties, boolean exportDebugData) {
this.bm25Params = bm25Params;
this.shortDocumentThreshold = shortDocumentThreshold;
this.shortDocumentPenalty = shortDocumentPenalty;
@@ -61,11 +63,11 @@ public class ResultRankingParameters {
this.tcfProximity = tcfProximity;
this.temporalBias = temporalBias;
this.temporalBiasWeight = temporalBiasWeight;
this.disablePenalties = disablePenalties;
this.exportDebugData = exportDebugData;
}
public static ResultRankingParameters sensibleDefaults() {
return builder()
private static final ResultRankingParameters _sensibleDefaults = builder()
.bm25Params(new Bm25Parameters(1.2, 0.5))
.shortDocumentThreshold(2000)
.shortDocumentPenalty(2.)
@@ -80,7 +82,11 @@ public class ResultRankingParameters {
.temporalBias(TemporalBias.NONE)
.temporalBiasWeight(5.0)
.exportDebugData(false)
.disablePenalties(false)
.build();
/**
 * Returns the default ranking parameters.
 * <p>
 * Every call hands out the same {@code _sensibleDefaults} instance; no new object is built.
 * NOTE(review): the fields of this class are declared public and non-final, so a caller that
 * mutates the returned object would change the defaults seen by everyone else. Confirm that
 * no caller modifies the returned instance.
 *
 * @return the shared default parameter instance
 */
public static ResultRankingParameters sensibleDefaults() {
return _sensibleDefaults;
}
public static ResultRankingParametersBuilder builder() {
@@ -139,6 +145,8 @@ public class ResultRankingParameters {
return this.temporalBiasWeight;
}
/**
 * @return the value of the {@code disablePenalties} flag
 */
public boolean isDisablePenalties() {
    return disablePenalties;
}
/**
 * @return the value of the {@code exportDebugData} flag
 */
public boolean isExportDebugData() {
    return exportDebugData;
}
@@ -166,6 +174,7 @@ public class ResultRankingParameters {
result = 31 * result + Double.hashCode(tcfProximity);
result = 31 * result + Objects.hashCode(temporalBias);
result = 31 * result + Double.hashCode(temporalBiasWeight);
result = 31 * result + Boolean.hashCode(disablePenalties);
result = 31 * result + Boolean.hashCode(exportDebugData);
return result;
}
@@ -192,6 +201,7 @@ public class ResultRankingParameters {
private double tcfProximity;
private TemporalBias temporalBias;
private double temporalBiasWeight;
private boolean disablePenalties;
private boolean exportDebugData;
ResultRankingParametersBuilder() {
@@ -262,17 +272,20 @@ public class ResultRankingParameters {
return this;
}
/**
 * Sets the {@code disablePenalties} flag that {@code build()} passes on to the
 * constructed {@code ResultRankingParameters}.
 *
 * @param disablePenalties the flag value to store in this builder
 * @return this builder, so calls can be chained
 */
public ResultRankingParametersBuilder disablePenalties(boolean disablePenalties) {
this.disablePenalties = disablePenalties;
return this;
}
/**
 * Sets the {@code exportDebugData} flag that {@code build()} passes on to the
 * constructed {@code ResultRankingParameters}.
 *
 * @param exportDebugData the flag value to store in this builder
 * @return this builder, so calls can be chained
 */
public ResultRankingParametersBuilder exportDebugData(boolean exportDebugData) {
this.exportDebugData = exportDebugData;
return this;
}
public ResultRankingParameters build() {
return new ResultRankingParameters(this.bm25Params, this.shortDocumentThreshold, this.shortDocumentPenalty, this.domainRankBonus, this.qualityPenalty, this.shortSentenceThreshold, this.shortSentencePenalty, this.bm25Weight, this.tcfFirstPosition, this.tcfVerbatim, this.tcfProximity, this.temporalBias, this.temporalBiasWeight, this.exportDebugData);
return new ResultRankingParameters(this.bm25Params, this.shortDocumentThreshold, this.shortDocumentPenalty, this.domainRankBonus, this.qualityPenalty, this.shortSentenceThreshold, this.shortSentencePenalty, this.bm25Weight, this.tcfFirstPosition, this.tcfVerbatim, this.tcfProximity, this.temporalBias, this.temporalBiasWeight, this.disablePenalties, this.exportDebugData);
}
/**
 * Renders the current state of this builder for logging and debugging.
 * <p>
 * Every builder field is listed in declaration order, including
 * {@code disablePenalties}, which was previously missing from the output.
 *
 * @return a string of the form
 *         {@code ResultRankingParameters.ResultRankingParametersBuilder(name=value, ...)}
 */
@Override
public String toString() {
    return "ResultRankingParameters.ResultRankingParametersBuilder("
            + "bm25Params=" + this.bm25Params
            + ", shortDocumentThreshold=" + this.shortDocumentThreshold
            + ", shortDocumentPenalty=" + this.shortDocumentPenalty
            + ", domainRankBonus=" + this.domainRankBonus
            + ", qualityPenalty=" + this.qualityPenalty
            + ", shortSentenceThreshold=" + this.shortSentenceThreshold
            + ", shortSentencePenalty=" + this.shortSentencePenalty
            + ", bm25Weight=" + this.bm25Weight
            + ", tcfFirstPosition=" + this.tcfFirstPosition
            + ", tcfVerbatim=" + this.tcfVerbatim
            + ", tcfProximity=" + this.tcfProximity
            + ", temporalBias=" + this.temporalBias
            + ", temporalBiasWeight=" + this.temporalBiasWeight
            + ", disablePenalties=" + this.disablePenalties
            + ", exportDebugData=" + this.exportDebugData
            + ")";
}
}
}

View File

@@ -162,6 +162,7 @@ message RpcResultRankingParameters {
double temporalBiasWeight = 17;
bool exportDebugData = 18;
bool disablePenalties = 19;
}

View File

@@ -248,6 +248,10 @@ public class IndexResultScoreCalculator {
ResultRankingParameters rankingParams,
@Nullable DebugRankingFactors debugRankingFactors) {
if (rankingParams.disablePenalties) {
return 0.;
}
int rank = DocumentMetadata.decodeRank(documentMetadata);
int asl = DocumentMetadata.decodeAvgSentenceLength(documentMetadata);
int quality = DocumentMetadata.decodeQuality(documentMetadata);

View File

@@ -3,9 +3,9 @@
<nav>
<a href="#" class="screenreader-only" onClick="">Skip to content</a>
<a href="https://www.marginalia.nu/">Marginalia</a>
<a href="https://memex.marginalia.nu/projects/edge/about.gmi">About</a>
<a href="https://memex.marginalia.nu/projects/edge/supporting.gmi">Donate</a>
<a class="extra" href="https://search.marginalia.nu/explore/random">Random</a>
<a href="https://about.marginalia-search.com/">About</a>
<a href="https://about.marginalia-search.com/article/supporting/">Donate</a>
<a class="extra" href="https://old-search.marginalia.nu/explore/random">Random</a>
</nav>
<div id="theme">
<label for="theme-select" class="screenreader-only">Color Theme</label>

View File

@@ -1,15 +1,14 @@
package nu.marginalia.search;
import com.google.inject.Inject;
import io.jooby.Context;
import io.jooby.Jooby;
import io.prometheus.client.Counter;
import io.prometheus.client.Histogram;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.search.svc.*;
import nu.marginalia.service.discovery.property.ServicePartition;
import nu.marginalia.service.server.BaseServiceParams;
import nu.marginalia.service.server.JoobyService;
import nu.marginalia.service.server.StaticResources;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -34,8 +33,6 @@ public class SearchService extends JoobyService {
@Inject
public SearchService(BaseServiceParams params,
WebsiteUrl websiteUrl,
StaticResources staticResources,
SearchFrontPageService frontPageService,
SearchAddToCrawlQueueService addToCrawlQueueService,
SearchSiteSubscriptionService siteSubscriptionService,
@@ -62,7 +59,25 @@ public class SearchService extends JoobyService {
/**
 * Registers this service's routes and request metrics on the given Jooby application.
 * <p>
 * After delegating to the superclass, it registers the {@code /export-opml} route, a
 * before-handler that stamps each request with its start time, and an after-handler that
 * either counts an error or records the elapsed request time.
 * <p>
 * NOTE(review): the before/after handlers are registered after {@code super.startJooby(jooby)}
 * and after the {@code /export-opml} route. If Jooby only applies such handlers to routes
 * registered later, those earlier routes are not measured. Confirm against the Jooby
 * documentation.
 *
 * @param jooby the Jooby application to configure
 */
public void startJooby(Jooby jooby) {
super.startJooby(jooby);
// Name of the per-request context attribute that carries the System.nanoTime() start stamp
final String startTimeAttribute = "start-time";
jooby.get("/export-opml", siteSubscriptionService::exportOpml);
// Record the start time before the request is handled
jooby.before((Context ctx) -> {
ctx.setAttribute(startTimeAttribute, System.nanoTime());
});
// Failed requests only increment the error count; they are not added to the request-time metric
jooby.after((Context ctx, Object result, Throwable failure) -> {
if (failure != null) {
wmsa_search_service_error_count.labels(ctx.getRoute().getPattern(), ctx.getMethod()).inc();
}
else {
Long startTime = ctx.getAttribute(startTimeAttribute);
// The attribute may be absent; in that case nothing is observed
if (startTime != null) {
wmsa_search_service_request_time.labels(ctx.getRoute().getPattern(), ctx.getMethod())
// Elapsed nanoseconds divided by 1e9, i.e. expressed in seconds
.observe((System.nanoTime() - startTime) / 1e9);
}
}
});
}

View File

@@ -47,18 +47,23 @@ public class SearchAddToCrawlQueueService {
return new MapModelAndView("redirect.jte", Map.of("url", "/site/"+domainName));
}
private void addToCrawlQueue(int id) throws SQLException {
/** Mark a domain for crawling by setting its node affinity to zero.
* Only domains with a negative node affinity are updated; a domain that is
* already marked for crawling (0) or assigned to a node (positive value)
* is left unchanged.
* */
void addToCrawlQueue(int domainId) throws SQLException {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
INSERT IGNORE INTO CRAWL_QUEUE(DOMAIN_NAME, SOURCE)
SELECT DOMAIN_NAME, "user" FROM EC_DOMAIN WHERE ID=?
UPDATE EC_DOMAIN
SET WMSA_prod.EC_DOMAIN.NODE_AFFINITY = 0
WHERE ID=? AND WMSA_prod.EC_DOMAIN.NODE_AFFINITY < 0
""")) {
stmt.setInt(1, id);
stmt.setInt(1, domainId);
stmt.executeUpdate();
}
}
private String getDomainName(int id) {
String getDomainName(int id) {
var domain = domainQueries.getDomain(id);
if (domain.isEmpty())
throw new IllegalArgumentException();

View File

@@ -352,7 +352,7 @@ public class SearchSiteInfoService {
public record SiteInfoWithContext(String domain,
boolean isSubscribed,
List<EdgeDomain> siblingDomains,
List<DbDomainQueries.DomainWithNode> siblingDomains,
int domainId,
String siteUrl,
boolean hasScreenshot,

View File

@@ -2,13 +2,24 @@
This service handles search traffic and is the service
you're most directly interacting with when visiting
[search.marginalia.nu](https://search.marginalia.nu).
[marginalia-search.com](https://marginalia-search.com).
It interprets a "human" query and translates it into a
request that gets passed to the index service, which finds
related documents, which this service then ranks and returns
to the user.
The UI is built using [JTE templates](https://jte.gg/syntax/) and the [Jooby framework](https://jooby.io), primarily using
its MVC facilities.
When developing, it's possible to set up a mock version of the UI by running
the gradle command
```$ ./gradlew paperDoll -i```
The UI will be available at http://localhost:9999/, and has hot reloading of JTE classes
and static resources.
![image](../../../doc/diagram/search-service-map.svg)

View File

@@ -1,3 +1,4 @@
@import nu.marginalia.db.DbDomainQueries
@import nu.marginalia.model.EdgeDomain
@import nu.marginalia.search.svc.SearchSiteInfoService
@import nu.marginalia.search.svc.SearchSiteInfoService.*
@@ -94,10 +95,14 @@
</tr>
</thead>
<tbody class="bg-white dark:bg-gray-800 divide-y divide-gray-200 dark:divide-gray-600 text-xs">
@for (EdgeDomain sibling : siteInfo.siblingDomains())
@for (DbDomainQueries.DomainWithNode sibling : siteInfo.siblingDomains())
<tr>
<td class="px-3 py-6 md:py-3 whitespace-nowrap">
<a class="text-liteblue dark:text-blue-200" href="/site/${sibling.toString()}">${sibling.toString()}</a>
<a class="text-liteblue dark:text-blue-200" href="/site/${sibling.domain().toString()}">${sibling.domain().toString()}</a>
@if (!sibling.isIndexed())
<i class="ml-1 fa-regular fa-question-circle text-gray-400 dark:text-gray-600 text-xs" title="Not indexed"></i>
@endif
</td>
</tr>
@endfor

View File

@@ -6,6 +6,7 @@ import nu.marginalia.api.domains.model.SimilarDomain;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.browse.model.BrowseResult;
import nu.marginalia.browse.model.BrowseResultSet;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState;
@@ -132,8 +133,9 @@ public class MockedSearchResults {
return new SearchSiteInfoService.SiteInfoWithContext(
"www.example.com",
false,
List.of(new EdgeDomain("example.com"),
new EdgeDomain("about.example.com")
List.of(
new DbDomainQueries.DomainWithNode(new EdgeDomain("example.com"), 1),
new DbDomainQueries.DomainWithNode(new EdgeDomain("example.com"), 0)
),
14,
"https://www.example.com",

View File

@@ -0,0 +1,85 @@
package nu.marginalia.search.svc;
import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.test.TestMigrationLoader;
import org.junit.jupiter.api.*;
import org.testcontainers.containers.MariaDBContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
import java.sql.SQLException;
/**
 * Integration test for {@code SearchAddToCrawlQueueService.addToCrawlQueue(int)}, run against
 * a MariaDB test container with the schema applied through {@code TestMigrationLoader}.
 * <p>
 * It verifies that adding a domain to the crawl queue sets a negative NODE_AFFINITY to zero
 * and leaves zero or positive values untouched.
 */
@Tag("slow")
@Testcontainers
class SearchAddToCrawlQueueServiceTest {

    // The database is named WMSA_prod because the statement under test refers to
    // the table as WMSA_prod.EC_DOMAIN.
    @Container
    static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
            .withDatabaseName("WMSA_prod")
            .withUsername("wmsa")
            .withPassword("wmsa")
            .withNetworkAliases("mariadb");

    static HikariDataSource dataSource;

    private DbDomainQueries domainQueries;
    private SearchAddToCrawlQueueService addToCrawlQueueService;

    /** Creates the connection pool against the container and applies the database migrations. */
    @BeforeAll
    public static void setUpAll() {
        HikariConfig config = new HikariConfig();
        config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
        config.setUsername("wmsa");
        config.setPassword("wmsa");

        dataSource = new HikariDataSource(config);

        TestMigrationLoader.flywayMigration(dataSource);
    }

    /** Closes the connection pool so that no connections outlive the test class. */
    @AfterAll
    public static void tearDownAll() {
        if (dataSource != null) {
            dataSource.close();
        }
    }

    /**
     * Resets EC_DOMAIN to three rows, one per node affinity state (-1, 0 and 1), and
     * creates fresh service instances.
     */
    @BeforeEach
    public void setUp() throws SQLException {
        try (var conn = dataSource.getConnection();
             var stmt = conn.createStatement()) {
            // DML statements produce no result set, so they go through executeUpdate
            stmt.executeUpdate("DELETE FROM EC_DOMAIN"); // Wipe any old state from other test runs
            stmt.executeUpdate("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES ('known.example.com', 'example.com', -1)");
            stmt.executeUpdate("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES ('added.example.com', 'example.com', 0)");
            stmt.executeUpdate("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES ('indexed.example.com', 'example.com', 1)");
        }

        domainQueries = new DbDomainQueries(dataSource);
        addToCrawlQueueService = new SearchAddToCrawlQueueService(domainQueries, dataSource);
    }

    /**
     * Reads the NODE_AFFINITY value stored for the given domain name.
     *
     * @param domainName the DOMAIN_NAME to look up in EC_DOMAIN
     * @return the stored NODE_AFFINITY
     * @throws AssertionError if no row exists for the domain; failing explicitly avoids
     *         confusing a missing row with the valid affinity value -1
     */
    private int getNodeAffinity(String domainName) throws SQLException {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("SELECT NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
            stmt.setString(1, domainName);

            try (var rsp = stmt.executeQuery()) {
                if (rsp.next()) {
                    return rsp.getInt(1);
                }
            }
        }

        throw new AssertionError("No EC_DOMAIN row found for " + domainName);
    }

    /**
     * Adding a domain to the crawl queue moves affinity -1 to 0 and leaves affinity 0 and
     * positive affinity unchanged.
     */
    @Test
    void addToCrawlQueue() throws SQLException {
        int knownId = domainQueries.getDomainId(new EdgeDomain("known.example.com"));
        int addedId = domainQueries.getDomainId(new EdgeDomain("added.example.com"));
        int indexedId = domainQueries.getDomainId(new EdgeDomain("indexed.example.com"));

        addToCrawlQueueService.addToCrawlQueue(knownId);
        addToCrawlQueueService.addToCrawlQueue(addedId);
        addToCrawlQueueService.addToCrawlQueue(indexedId);

        Assertions.assertEquals(0, getNodeAffinity("known.example.com"));
        Assertions.assertEquals(0, getNodeAffinity("added.example.com"));
        Assertions.assertEquals(1, getNodeAffinity("indexed.example.com"));
    }
}

View File

@@ -146,6 +146,7 @@ public class QueryBasicInterface {
.shortSentenceThreshold(intFromRequest(request, "shortSentenceThreshold", sensibleDefaults.shortSentenceThreshold))
.shortSentencePenalty(doubleFromRequest(request, "shortSentencePenalty", sensibleDefaults.shortSentencePenalty))
.bm25Weight(doubleFromRequest(request, "bm25Weight", sensibleDefaults.bm25Weight))
.disablePenalties(boolFromRequest(request, "disablePenalties", sensibleDefaults.disablePenalties))
.exportDebugData(true)
.build();
}
@@ -154,6 +155,13 @@ public class QueryBasicInterface {
return Strings.isNullOrEmpty(request.queryParams(param)) ? defaultValue : Double.parseDouble(request.queryParams(param));
}
/**
 * Reads a boolean query parameter from the request.
 *
 * @param request      the request to read the query parameter from
 * @param param        the query parameter name; a {@code null} name yields the default
 * @param defaultValue the value returned when the parameter is absent or empty
 * @return {@code defaultValue} when the name is {@code null} or the parameter is null or
 *         empty; otherwise the result of {@link Boolean#parseBoolean(String)}, which is
 *         {@code true} only for the text "true" in any letter case
 */
boolean boolFromRequest(Request request, String param, boolean defaultValue) {
    if (param == null) {
        return defaultValue;
    }

    if (Strings.isNullOrEmpty(request.queryParams(param))) {
        return defaultValue;
    }

    return Boolean.parseBoolean(request.queryParams(param));
}
/**
 * Reads an integer query parameter from the request.
 *
 * @param request      the request to read the query parameter from
 * @param param        the query parameter name
 * @param defaultValue the value returned when the parameter is absent or empty
 * @return {@code defaultValue} when the parameter is null or empty, otherwise the
 *         parameter text passed through {@code parseInt}
 */
int intFromRequest(Request request, String param, int defaultValue) {
    if (Strings.isNullOrEmpty(request.queryParams(param))) {
        return defaultValue;
    }

    return parseInt(request.queryParams(param));
}

View File

@@ -67,6 +67,14 @@
<div class="row my-2">
<div class="col-sm-2"><label for="bm25FullWeight">BM25 Weight</label></div>
<div class="col-sm-2"><input type="text" class="form-control" id="bm25Weight" name="bm25Weight" value="{{bm25Weight}}"></div>
<div class="col-sm-2"><label for="disablePenalties">Disable Penalties</label></div>
<div class="col-sm-2">
<select class="form-select" id="disablePenalties" name="disablePenalties">
<option value="FALSE" {{#unless disablePenalties}}selected{{/unless}}>FALSE</option>
<option value="TRUE" {{#if disablePenalties}}selected{{/if}}>TRUE</option>
</select>
</div>
</div>
{{/with}}

View File

@@ -1,6 +1,3 @@
## This is a token file for automatic deployment
A master HEAD tagged with deploy-core*, deploy-executor*, or deploy-index* will trigger a commit.
2024-12-19-00002: Test deployment of executor
2024-12-19-00001: Test deployment of executor
2025-01-07: Deploy executor.