mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits


5 Commits

Author SHA1 Message Date
Viktor Lofgren
b7f0a2a98e (search-service) Fix metrics for errors and request times
This was previously in place, but broke during the jooby migration.
2025-01-08 14:10:43 +01:00
Viktor Lofgren
5fb76b2e79 (search-service) Fix metrics for errors and request times
This was previously in place, but broke during the jooby migration.
2025-01-08 14:06:03 +01:00
Viktor Lofgren
ad8c97f342 (search-service) Begin replacement of the crawl queue mechanism with node_affinity flagging
Previously a special db table held the domains slated for crawling, but this mechanism is deprecated. Instead, each domain now has a node_affinity flag that decides its indexing state: -1 means the domain shouldn't be crawled, 0 means it's slated for crawling by the next index partition that performs a crawl, and a positive value means it's assigned to that index partition.

The change set also adds a test case validating the modified behavior.
2025-01-08 13:25:56 +01:00
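
For orientation, the flag semantics described in the commit message above could be summarized roughly as follows; this is an illustrative sketch with a hypothetical helper, not code from the change set itself:

```
// Illustrative only: a hypothetical helper spelling out the NODE_AFFINITY
// semantics described in the commit message above.
static String describeNodeAffinity(int nodeAffinity) {
    if (nodeAffinity < 0)  return "excluded from crawling";        // -1: don't crawl
    if (nodeAffinity == 0) return "slated for the next crawl";     //  0: queued, not yet assigned
    return "assigned to index partition " + nodeAffinity;          // >0: owned by that partition
}
```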
Viktor Lofgren
dc1b6373eb (search-service) Clean up readme 2025-01-08 13:04:39 +01:00
Viktor Lofgren
983d6d067c (search-service) Add indexing indicator to sibling domains listing 2025-01-08 12:58:34 +01:00
10 changed files with 152 additions and 21 deletions

View File

@@ -103,11 +103,11 @@ public class DbDomainQueries {
         }
     }
-    public List<EdgeDomain> otherSubdomains(EdgeDomain domain, int cnt) {
-        List<EdgeDomain> ret = new ArrayList<>();
+    public List<DomainWithNode> otherSubdomains(EdgeDomain domain, int cnt) {
+        List<DomainWithNode> ret = new ArrayList<>();
         try (var conn = dataSource.getConnection();
-             var stmt = conn.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE DOMAIN_TOP = ? LIMIT ?")) {
+             var stmt = conn.prepareStatement("SELECT DOMAIN_NAME, NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_TOP = ? LIMIT ?")) {
             stmt.setString(1, domain.topDomain);
             stmt.setInt(2, cnt);
@@ -118,7 +118,7 @@ public class DbDomainQueries {
                 if (sibling.equals(domain))
                     continue;
-                ret.add(sibling);
+                ret.add(new DomainWithNode(sibling, rs.getInt(2)));
             }
         } catch (SQLException e) {
             logger.error("Failed to get domain neighbors");
@@ -126,4 +126,10 @@ public class DbDomainQueries {
         return ret;
     }
+
+    public record DomainWithNode (EdgeDomain domain, int nodeAffinity) {
+        public boolean isIndexed() {
+            return nodeAffinity > 0;
+        }
+    }
 }

View File

@@ -67,8 +67,7 @@ public class ResultRankingParameters {
         this.exportDebugData = exportDebugData;
     }
-    public static ResultRankingParameters sensibleDefaults() {
-        return builder()
+    private static final ResultRankingParameters _sensibleDefaults = builder()
             .bm25Params(new Bm25Parameters(1.2, 0.5))
             .shortDocumentThreshold(2000)
             .shortDocumentPenalty(2.)
@@ -85,6 +84,9 @@ public class ResultRankingParameters {
             .exportDebugData(false)
             .disablePenalties(false)
             .build();
+
+    public static ResultRankingParameters sensibleDefaults() {
+        return _sensibleDefaults;
+    }
 
     public static ResultRankingParametersBuilder builder() {

View File

@@ -1,15 +1,14 @@
package nu.marginalia.search;
import com.google.inject.Inject;
import io.jooby.Context;
import io.jooby.Jooby;
import io.prometheus.client.Counter;
import io.prometheus.client.Histogram;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.search.svc.*;
import nu.marginalia.service.discovery.property.ServicePartition;
import nu.marginalia.service.server.BaseServiceParams;
import nu.marginalia.service.server.JoobyService;
import nu.marginalia.service.server.StaticResources;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -34,8 +33,6 @@ public class SearchService extends JoobyService {
     @Inject
     public SearchService(BaseServiceParams params,
                          WebsiteUrl websiteUrl,
                          StaticResources staticResources,
                          SearchFrontPageService frontPageService,
                          SearchAddToCrawlQueueService addToCrawlQueueService,
                          SearchSiteSubscriptionService siteSubscriptionService,
@@ -62,7 +59,25 @@ public class SearchService extends JoobyService {
     public void startJooby(Jooby jooby) {
         super.startJooby(jooby);
 
+        final String startTimeAttribute = "start-time";
+
         jooby.get("/export-opml", siteSubscriptionService::exportOpml);
+
+        jooby.before((Context ctx) -> {
+            ctx.setAttribute(startTimeAttribute, System.nanoTime());
+        });
+        jooby.after((Context ctx, Object result, Throwable failure) -> {
+            if (failure != null) {
+                wmsa_search_service_error_count.labels(ctx.getRoute().getPattern(), ctx.getMethod()).inc();
+            }
+            else {
+                Long startTime = ctx.getAttribute(startTimeAttribute);
+                if (startTime != null) {
+                    wmsa_search_service_request_time.labels(ctx.getRoute().getPattern(), ctx.getMethod())
+                            .observe((System.nanoTime() - startTime) / 1e9);
+                }
+            }
+        });
     }
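
The wmsa_search_service_error_count and wmsa_search_service_request_time fields referenced above are not part of this hunk; assuming they follow the usual Prometheus simpleclient pattern, their declarations would look roughly like this (a sketch, with help texts and label names inferred from the usage above):

```
// Sketch only: assumed declarations for the metrics used in the after-filter above;
// the actual field definitions are not shown in this hunk.
private static final Counter wmsa_search_service_error_count = Counter.build()
        .name("wmsa_search_service_error_count")
        .help("Search service error count")
        .labelNames("route", "method")
        .register();

private static final Histogram wmsa_search_service_request_time = Histogram.build()
        .name("wmsa_search_service_request_time")
        .help("Search service request time in seconds")
        .labelNames("route", "method")
        .register();
```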

View File

@@ -47,18 +47,23 @@ public class SearchAddToCrawlQueueService {
         return new MapModelAndView("redirect.jte", Map.of("url", "/site/"+domainName));
     }
 
-    private void addToCrawlQueue(int id) throws SQLException {
+    /** Mark a domain for crawling by setting its node affinity to zero,
+     * unless it is already marked for crawling, in which case the node
+     * affinity is left unchanged.
+     */
+    void addToCrawlQueue(int domainId) throws SQLException {
         try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("""
-                    INSERT IGNORE INTO CRAWL_QUEUE(DOMAIN_NAME, SOURCE)
-                    SELECT DOMAIN_NAME, "user" FROM EC_DOMAIN WHERE ID=?
+                    UPDATE EC_DOMAIN
+                    SET WMSA_prod.EC_DOMAIN.NODE_AFFINITY = 0
+                    WHERE ID=? AND WMSA_prod.EC_DOMAIN.NODE_AFFINITY < 0
                     """)) {
-            stmt.setInt(1, id);
+            stmt.setInt(1, domainId);
             stmt.executeUpdate();
         }
     }
 
-    private String getDomainName(int id) {
+    String getDomainName(int id) {
         var domain = domainQueries.getDomain(id);
         if (domain.isEmpty())
             throw new IllegalArgumentException();

View File

@@ -352,7 +352,7 @@ public class SearchSiteInfoService {
     public record SiteInfoWithContext(String domain,
                                       boolean isSubscribed,
-                                      List<EdgeDomain> siblingDomains,
+                                      List<DbDomainQueries.DomainWithNode> siblingDomains,
                                       int domainId,
                                       String siteUrl,
                                       boolean hasScreenshot,

View File

@@ -2,13 +2,24 @@
 This service handles search traffic and is the service
 you're most directly interacting with when visiting
-[search.marginalia.nu](https://search.marginalia.nu).
+[marginalia-search.com](https://marginalia-search.com).
 
 It interprets a "human" query and translates it into a
 request that gets passed on to the index service, which finds
 related documents, which this service then ranks and returns
 to the user.
 
+The UI is built using [JTE templates](https://jte.gg/syntax/) and the [Jooby framework](https://jooby.io), primarily using
+its MVC facilities.
+
+When developing, it's possible to set up a mock version of the UI by running
+the gradle command
+
+```$ ./gradlew paperDoll -i```
+
+The UI will be available at http://localhost:9999/, and has hot reloading of JTE classes
+and static resources.
+
 ![image](../../../doc/diagram/search-service-map.svg)
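
The README's mention of Jooby's MVC facilities and JTE templates maps onto handlers like the ones in this change set. A minimal sketch of the pattern, with hypothetical names (ExampleService, index.jte, the "q" parameter), might look like:

```
// Minimal sketch of the Jooby + JTE pattern the README describes; names are hypothetical.
// The real handlers live in SearchService and the nu.marginalia.search.svc classes.
import io.jooby.Context;
import io.jooby.Jooby;
import io.jooby.MapModelAndView;

import java.util.Map;

public class ExampleService {
    // A handler takes the request Context and returns a model bound to a JTE template.
    public MapModelAndView indexPage(Context ctx) {
        String query = ctx.query("q").value("");
        return new MapModelAndView("index.jte", Map.of("query", query));
    }

    // Routes are registered on the Jooby instance, as in SearchService.startJooby().
    public void registerRoutes(Jooby jooby) {
        jooby.get("/example", this::indexPage);
    }
}
```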

View File

@@ -1,3 +1,4 @@
+@import nu.marginalia.db.DbDomainQueries
 @import nu.marginalia.model.EdgeDomain
 @import nu.marginalia.search.svc.SearchSiteInfoService
 @import nu.marginalia.search.svc.SearchSiteInfoService.*
@@ -94,10 +95,14 @@
             </tr>
             </thead>
             <tbody class="bg-white dark:bg-gray-800 divide-y divide-gray-200 dark:divide-gray-600 text-xs">
-            @for (EdgeDomain sibling : siteInfo.siblingDomains())
+            @for (DbDomainQueries.DomainWithNode sibling : siteInfo.siblingDomains())
                 <tr>
                     <td class="px-3 py-6 md:py-3 whitespace-nowrap">
-                        <a class="text-liteblue dark:text-blue-200" href="/site/${sibling.toString()}">${sibling.toString()}</a>
+                        <a class="text-liteblue dark:text-blue-200" href="/site/${sibling.domain().toString()}">${sibling.domain().toString()}</a>
+                        @if (!sibling.isIndexed())
+                            <i class="ml-1 fa-regular fa-question-circle text-gray-400 dark:text-gray-600 text-xs" title="Not indexed"></i>
+                        @endif
                     </td>
                 </tr>
             @endfor

View File

@@ -6,6 +6,7 @@ import nu.marginalia.api.domains.model.SimilarDomain;
 import nu.marginalia.api.searchquery.model.results.SearchResultItem;
 import nu.marginalia.browse.model.BrowseResult;
 import nu.marginalia.browse.model.BrowseResultSet;
+import nu.marginalia.db.DbDomainQueries;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawl.DomainIndexingState;
@@ -132,8 +133,9 @@ public class MockedSearchResults {
         return new SearchSiteInfoService.SiteInfoWithContext(
                 "www.example.com",
                 false,
-                List.of(new EdgeDomain("example.com"),
-                        new EdgeDomain("about.example.com")
+                List.of(
+                        new DbDomainQueries.DomainWithNode(new EdgeDomain("example.com"), 1),
+                        new DbDomainQueries.DomainWithNode(new EdgeDomain("example.com"), 0)
                 ),
                 14,
                 "https://www.example.com",

View File

@@ -0,0 +1,85 @@
package nu.marginalia.search.svc;

import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.test.TestMigrationLoader;
import org.junit.jupiter.api.*;
import org.testcontainers.containers.MariaDBContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;

import java.sql.SQLException;

@Tag("slow")
@Testcontainers
class SearchAddToCrawlQueueServiceTest {
    @Container
    static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
            .withDatabaseName("WMSA_prod")
            .withUsername("wmsa")
            .withPassword("wmsa")
            .withNetworkAliases("mariadb");

    static HikariDataSource dataSource;

    private DbDomainQueries domainQueries;
    private SearchAddToCrawlQueueService addToCrawlQueueService;

    @BeforeEach
    public void setUp() throws SQLException {
        try (var conn = dataSource.getConnection();
             var stmt = conn.createStatement()) {
            stmt.executeQuery("DELETE FROM EC_DOMAIN"); // Wipe any old state from other test runs
            stmt.executeQuery("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES ('known.example.com', 'example.com', -1)");
            stmt.executeQuery("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES ('added.example.com', 'example.com', 0)");
            stmt.executeQuery("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES ('indexed.example.com', 'example.com', 1)");
        }

        domainQueries = new DbDomainQueries(dataSource);
        addToCrawlQueueService = new SearchAddToCrawlQueueService(domainQueries, dataSource);
    }

    @BeforeAll
    public static void setUpAll() {
        HikariConfig config = new HikariConfig();
        config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
        config.setUsername("wmsa");
        config.setPassword("wmsa");
        dataSource = new HikariDataSource(config);

        TestMigrationLoader.flywayMigration(dataSource);
    }

    private int getNodeAffinity(String domainName) throws SQLException {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("SELECT NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_NAME=?"))
        {
            stmt.setString(1, domainName);
            var rsp = stmt.executeQuery();
            if (rsp.next()) {
                return rsp.getInt(1);
            }
        }
        return -1;
    }

    @Test
    void addToCrawlQueue() throws SQLException {
        int knownId = domainQueries.getDomainId(new EdgeDomain("known.example.com"));
        int addedId = domainQueries.getDomainId(new EdgeDomain("added.example.com"));
        int indexedId = domainQueries.getDomainId(new EdgeDomain("indexed.example.com"));

        addToCrawlQueueService.addToCrawlQueue(knownId);
        addToCrawlQueueService.addToCrawlQueue(addedId);
        addToCrawlQueueService.addToCrawlQueue(indexedId);

        Assertions.assertEquals(0, getNodeAffinity("known.example.com"));
        Assertions.assertEquals(0, getNodeAffinity("added.example.com"));
        Assertions.assertEquals(1, getNodeAffinity("indexed.example.com"));
    }
}
}