1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

6 Commits

Author SHA1 Message Date
Viktor Lofgren
bc2c2061f2 (index-client) Clean up index client code
This should have the rpc stream reception be performed in parallel in separate threads, rather than blocking sequentially in the main thread, hopefully giving a slight performance boost.
2025-01-10 15:14:42 +01:00
Viktor Lofgren
1c7f5a31a5 (search) Further reduce the number of db queries by adding more caching to DbDomainQueries. 2025-01-10 14:17:29 +01:00
Viktor Lofgren
59a8ea60f7 (search) Further reduce the number of db queries by adding more caching to DbDomainQueries. 2025-01-10 14:15:22 +01:00
Viktor Lofgren
aa9b1244ea (search) Reduce the number of db queries a bit by caching data that doesn't change too often 2025-01-10 13:56:04 +01:00
Viktor Lofgren
2d17233366 (search) Reduce the number of db queries a bit by caching data that doesn't change too often 2025-01-10 13:53:56 +01:00
Viktor Lofgren
b245cc9f38 (search) Reduce the number of db queries a bit by caching data that doesn't change too often 2025-01-10 13:46:19 +01:00
5 changed files with 139 additions and 225 deletions

View File

@@ -20,7 +20,10 @@ public class DbDomainQueries {
private final HikariDataSource dataSource; private final HikariDataSource dataSource;
private static final Logger logger = LoggerFactory.getLogger(DbDomainQueries.class); private static final Logger logger = LoggerFactory.getLogger(DbDomainQueries.class);
private final Cache<EdgeDomain, Integer> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build(); private final Cache<EdgeDomain, Integer> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
private final Cache<Integer, EdgeDomain> domainNameCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
private final Cache<String, List<DomainWithNode>> siblingsCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
@Inject @Inject
public DbDomainQueries(HikariDataSource dataSource) public DbDomainQueries(HikariDataSource dataSource)
@@ -30,16 +33,21 @@ public class DbDomainQueries {
public Integer getDomainId(EdgeDomain domain) throws NoSuchElementException { public Integer getDomainId(EdgeDomain domain) throws NoSuchElementException {
try (var connection = dataSource.getConnection()) { try {
return domainIdCache.get(domain, () -> { return domainIdCache.get(domain, () -> {
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) { try (var connection = dataSource.getConnection();
var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
stmt.setString(1, domain.toString()); stmt.setString(1, domain.toString());
var rsp = stmt.executeQuery(); var rsp = stmt.executeQuery();
if (rsp.next()) { if (rsp.next()) {
return rsp.getInt(1); return rsp.getInt(1);
} }
} }
catch (SQLException ex) {
throw new RuntimeException(ex);
}
throw new NoSuchElementException(); throw new NoSuchElementException();
}); });
} }
@@ -49,9 +57,6 @@ public class DbDomainQueries {
catch (ExecutionException ex) { catch (ExecutionException ex) {
throw new RuntimeException(ex.getCause()); throw new RuntimeException(ex.getCause());
} }
catch (SQLException ex) {
throw new RuntimeException(ex);
}
} }
public OptionalInt tryGetDomainId(EdgeDomain domain) { public OptionalInt tryGetDomainId(EdgeDomain domain) {
@@ -84,31 +89,38 @@ public class DbDomainQueries {
} }
public Optional<EdgeDomain> getDomain(int id) { public Optional<EdgeDomain> getDomain(int id) {
try (var connection = dataSource.getConnection()) {
EdgeDomain existing = domainNameCache.getIfPresent(id);
if (existing != null) {
return Optional.of(existing);
}
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) { try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) {
stmt.setInt(1, id); stmt.setInt(1, id);
var rsp = stmt.executeQuery(); var rsp = stmt.executeQuery();
if (rsp.next()) { if (rsp.next()) {
return Optional.of(new EdgeDomain(rsp.getString(1))); var val = new EdgeDomain(rsp.getString(1));
domainNameCache.put(id, val);
return Optional.of(val);
} }
return Optional.empty(); return Optional.empty();
} }
} }
catch (UncheckedExecutionException ex) {
throw new RuntimeException(ex.getCause());
}
catch (SQLException ex) { catch (SQLException ex) {
throw new RuntimeException(ex); throw new RuntimeException(ex);
} }
} }
public List<DomainWithNode> otherSubdomains(EdgeDomain domain, int cnt) { public List<DomainWithNode> otherSubdomains(EdgeDomain domain, int cnt) throws ExecutionException {
String topDomain = domain.topDomain;
return siblingsCache.get(topDomain, () -> {
List<DomainWithNode> ret = new ArrayList<>(); List<DomainWithNode> ret = new ArrayList<>();
try (var conn = dataSource.getConnection(); try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("SELECT DOMAIN_NAME, NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_TOP = ? LIMIT ?")) { var stmt = conn.prepareStatement("SELECT DOMAIN_NAME, NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_TOP = ? LIMIT ?")) {
stmt.setString(1, domain.topDomain); stmt.setString(1, topDomain);
stmt.setInt(2, cnt); stmt.setInt(2, cnt);
var rs = stmt.executeQuery(); var rs = stmt.executeQuery();
@@ -123,8 +135,9 @@ public class DbDomainQueries {
} catch (SQLException e) { } catch (SQLException e) {
logger.error("Failed to get domain neighbors"); logger.error("Failed to get domain neighbors");
} }
return ret; return ret;
});
} }
public record DomainWithNode (EdgeDomain domain, int nodeAffinity) { public record DomainWithNode (EdgeDomain domain, int nodeAffinity) {

View File

@@ -1,118 +0,0 @@
package nu.marginalia.db;
import com.zaxxer.hikari.HikariDataSource;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.OptionalInt;
/** Class used in exporting data. This is intended to be used for a brief time
 * and then discarded, not kept around as a service.
 * <p>
 * Each instance holds one connection obtained from the data source, together with a set
 * of statements prepared on that connection. All of them are released by {@link #close()},
 * so the instance should be used in a try-with-resources statement.
 * <p>
 * NOTE(review): the prepared statements are re-used between calls (a parameter is bound and
 * the same statement object is then executed), so an instance presumably must not be shared
 * between threads -- confirm against callers.
 */
public class DbDomainStatsExportMultitool implements AutoCloseable {
    private final Connection connection;
    private final int nodeId;

    private final PreparedStatement knownUrlsQuery;
    private final PreparedStatement visitedUrlsQuery;
    private final PreparedStatement goodUrlsQuery;
    private final PreparedStatement domainNameToId;
    private final PreparedStatement allDomainsQuery;
    private final PreparedStatement crawlQueueDomains;
    private final PreparedStatement indexedDomainsQuery;

    /** Obtains a connection from the data source and prepares all queries on it.
     *
     * @param dataSource source of the connection held for the lifetime of this object
     * @param nodeId     stored on the instance; not used by any query visible in this class
     * @throws SQLException if the connection cannot be obtained or a statement cannot be
     *                      prepared; in the latter case the connection is closed again
     *                      before the exception propagates
     */
    public DbDomainStatsExportMultitool(HikariDataSource dataSource, int nodeId) throws SQLException {
        this.connection = dataSource.getConnection();
        this.nodeId = nodeId;

        try {
            knownUrlsQuery = connection.prepareStatement("""
                    SELECT KNOWN_URLS
                    FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
                    ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                    WHERE DOMAIN_NAME=?
                    """);
            visitedUrlsQuery = connection.prepareStatement("""
                    SELECT VISITED_URLS
                    FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
                    ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                    WHERE DOMAIN_NAME=?
                    """);
            goodUrlsQuery = connection.prepareStatement("""
                    SELECT GOOD_URLS
                    FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
                    ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                    WHERE DOMAIN_NAME=?
                    """);
            domainNameToId = connection.prepareStatement("""
                    SELECT ID
                    FROM EC_DOMAIN
                    WHERE DOMAIN_NAME=?
                    """);
            allDomainsQuery = connection.prepareStatement("""
                    SELECT DOMAIN_NAME
                    FROM EC_DOMAIN
                    """);
            crawlQueueDomains = connection.prepareStatement("""
                    SELECT DOMAIN_NAME
                    FROM CRAWL_QUEUE
                    """);
            indexedDomainsQuery = connection.prepareStatement("""
                    SELECT DOMAIN_NAME
                    FROM EC_DOMAIN
                    WHERE INDEXED > 0
                    """);
        }
        catch (SQLException | RuntimeException ex) {
            // The object is never handed to the caller on this path, so close() can never
            // be invoked; release the connection here instead of leaking it. Closing the
            // connection also releases any statements already prepared on it.
            try {
                connection.close();
            }
            catch (SQLException closeEx) {
                ex.addSuppressed(closeEx);
            }
            throw ex;
        }
    }

    /** Looks up VISITED_URLS for the given domain name.
     *
     * @param domainName value bound to the DOMAIN_NAME parameter of the query
     * @return the first column of the first result row, or empty if the query returns no rows
     * @throws SQLException if the query fails
     */
    public OptionalInt getVisitedUrls(String domainName) throws SQLException {
        return executeNameToIntQuery(domainName, visitedUrlsQuery);
    }

    /** Looks up the EC_DOMAIN.ID for the given domain name.
     *
     * @param domainName value bound to the DOMAIN_NAME parameter of the query
     * @return the id, or empty if the query returns no rows
     * @throws SQLException if the query fails
     */
    public OptionalInt getDomainId(String domainName) throws SQLException {
        return executeNameToIntQuery(domainName, domainNameToId);
    }

    /** Returns every DOMAIN_NAME found in CRAWL_QUEUE.
     *
     * @throws SQLException if the query fails
     */
    public List<String> getCrawlQueueDomains() throws SQLException {
        return executeListQuery(crawlQueueDomains, 100);
    }

    /** Returns every DOMAIN_NAME in EC_DOMAIN with INDEXED &gt; 0.
     *
     * @throws SQLException if the query fails
     */
    public List<String> getAllIndexedDomains() throws SQLException {
        return executeListQuery(indexedDomainsQuery, 100_000);
    }

    /** Binds {@code domainName} as the first parameter of {@code statement}, executes it,
     * and returns the first column of the first row as an int, or empty if there is no row.
     */
    private OptionalInt executeNameToIntQuery(String domainName, PreparedStatement statement)
            throws SQLException {
        statement.setString(1, domainName);

        // The statement outlives this call, so the result set has to be closed explicitly
        try (var rs = statement.executeQuery()) {
            if (rs.next()) {
                return OptionalInt.of(rs.getInt(1));
            }
            return OptionalInt.empty();
        }
    }

    /** Executes {@code statement} and collects the first column of every row as a string.
     *
     * @param sizeHint initial capacity of the returned list
     */
    private List<String> executeListQuery(PreparedStatement statement, int sizeHint) throws SQLException {
        List<String> ret = new ArrayList<>(sizeHint);

        // The statement outlives this call, so the result set has to be closed explicitly
        try (var rs = statement.executeQuery()) {
            while (rs.next()) {
                ret.add(rs.getString(1));
            }
        }

        return ret;
    }

    /** Closes every prepared statement and then the connection.
     * <p>
     * A failure to close one resource does not prevent the remaining ones from being
     * closed; the first failure is rethrown once everything has been attempted, with any
     * later failures attached as suppressed exceptions.
     *
     * @throws SQLException the first exception raised while closing, if any
     */
    @Override
    public void close() throws SQLException {
        SQLException failure = null;

        PreparedStatement[] statements = {
                knownUrlsQuery,
                goodUrlsQuery,
                visitedUrlsQuery,
                allDomainsQuery,
                crawlQueueDomains,
                indexedDomainsQuery,
                domainNameToId
        };

        for (PreparedStatement statement : statements) {
            try {
                statement.close();
            }
            catch (SQLException ex) {
                failure = accumulate(failure, ex);
            }
        }

        // Always attempted, even if a statement failed to close, so the connection is
        // not left checked out of the pool.
        try {
            connection.close();
        }
        catch (SQLException ex) {
            failure = accumulate(failure, ex);
        }

        if (failure != null) {
            throw failure;
        }
    }

    /** Returns {@code next} if there is no earlier failure, otherwise attaches it to the
     * earlier failure as a suppressed exception and returns the earlier one. */
    private static SQLException accumulate(SQLException first, SQLException next) {
        if (first == null) {
            return next;
        }
        first.addSuppressed(next);
        return first;
    }
}

View File

@@ -16,20 +16,18 @@ import org.slf4j.LoggerFactory;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Comparator; import java.util.Comparator;
import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.concurrent.CompletableFuture; import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService; import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors; import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;
import static java.lang.Math.clamp;
@Singleton @Singleton
public class IndexClient { public class IndexClient {
private static final Logger logger = LoggerFactory.getLogger(IndexClient.class); private static final Logger logger = LoggerFactory.getLogger(IndexClient.class);
private final GrpcMultiNodeChannelPool<IndexApiGrpc.IndexApiBlockingStub> channelPool; private final GrpcMultiNodeChannelPool<IndexApiGrpc.IndexApiBlockingStub> channelPool;
private final DomainBlacklistImpl blacklist; private final DomainBlacklistImpl blacklist;
private static final ExecutorService executor = Executors.newVirtualThreadPerTaskExecutor(); private static final ExecutorService executor = Executors.newCachedThreadPool();
@Inject @Inject
public IndexClient(GrpcChannelPoolFactory channelPoolFactory, DomainBlacklistImpl blacklist) { public IndexClient(GrpcChannelPoolFactory channelPoolFactory, DomainBlacklistImpl blacklist) {
@@ -51,40 +49,31 @@ public class IndexClient {
/** Execute a query on the index partitions and return the combined results. */ /** Execute a query on the index partitions and return the combined results. */
public AggregateQueryResponse executeQueries(RpcIndexQuery indexRequest, Pagination pagination) { public AggregateQueryResponse executeQueries(RpcIndexQuery indexRequest, Pagination pagination) {
List<CompletableFuture<Iterator<RpcDecoratedResultItem>>> futures =
channelPool.call(IndexApiGrpc.IndexApiBlockingStub::query)
.async(executor)
.runEach(indexRequest);
final int requestedMaxResults = indexRequest.getQueryLimits().getResultsTotal(); final int requestedMaxResults = indexRequest.getQueryLimits().getResultsTotal();
final int resultsUpperBound = requestedMaxResults * channelPool.getNumNodes();
List<RpcDecoratedResultItem> results = new ArrayList<>(resultsUpperBound); AtomicInteger totalNumResults = new AtomicInteger(0);
for (var future : futures) { List<RpcDecoratedResultItem> results =
try { channelPool.call(IndexApiGrpc.IndexApiBlockingStub::query)
future.get().forEachRemaining(results::add); .async(executor)
} .runEach(indexRequest)
catch (Exception e) { .stream()
logger.error("Downstream exception", e); .map(future -> future.thenApply(iterator -> {
} List<RpcDecoratedResultItem> ret = new ArrayList<>(requestedMaxResults);
} iterator.forEachRemaining(ret::add);
totalNumResults.addAndGet(ret.size());
return ret;
}))
.map(CompletableFuture::join)
.flatMap(List::stream)
.filter(item -> !isBlacklisted(item))
.sorted(comparator)
.skip(Math.max(0, (pagination.page - 1) * pagination.pageSize))
.limit(pagination.pageSize)
.toList();
// Sort the results by ranking score and remove blacklisted domains return new AggregateQueryResponse(results, pagination.page(), totalNumResults.get());
results.sort(comparator);
results.removeIf(this::isBlacklisted);
int numReceivedResults = results.size();
// pagination is typically 1-indexed, so we need to adjust the start and end indices
int indexStart = (pagination.page - 1) * pagination.pageSize;
int indexEnd = (pagination.page) * pagination.pageSize;
results = results.subList(
clamp(indexStart, 0, Math.max(0, results.size() - 1)), // from is inclusive, so subtract 1 from size()
clamp(indexEnd, 0, results.size()));
return new AggregateQueryResponse(results, pagination.page(), numReceivedResults);
} }
private boolean isBlacklisted(RpcDecoratedResultItem item) { private boolean isBlacklisted(RpcDecoratedResultItem item) {

View File

@@ -28,6 +28,7 @@ import org.slf4j.LoggerFactory;
import java.sql.SQLException; import java.sql.SQLException;
import java.util.*; import java.util.*;
import java.util.concurrent.CompletableFuture; import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future; import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import java.util.function.Supplier; import java.util.function.Supplier;
@@ -67,8 +68,12 @@ public class SearchSiteInfoService {
this.screenshotService = screenshotService; this.screenshotService = screenshotService;
this.dataSource = dataSource; this.dataSource = dataSource;
this.searchSiteSubscriptions = searchSiteSubscriptions; this.searchSiteSubscriptions = searchSiteSubscriptions;
Thread.ofPlatform().name("Recently Added Domains Model Updater").start(this::modelUpdater);
} }
private volatile SiteOverviewModel cachedOverviewModel = new SiteOverviewModel(List.of());
@GET @GET
@Path("/site") @Path("/site")
public ModelAndView<?> handleOverview(@QueryParam String domain) { public ModelAndView<?> handleOverview(@QueryParam String domain) {
@@ -77,23 +82,48 @@ public class SearchSiteInfoService {
return new MapModelAndView("redirect.jte", Map.of("url", "/site/"+domain)); return new MapModelAndView("redirect.jte", Map.of("url", "/site/"+domain));
} }
List<SiteOverviewModel.DiscoveredDomain> domains = new ArrayList<>();
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("SELECT DOMAIN_NAME, DISCOVER_DATE FROM EC_DOMAIN WHERE NODE_AFFINITY = 0 ORDER BY ID DESC LIMIT 10")) {
var rs = stmt.executeQuery();
while (rs.next()) {
domains.add(new SiteOverviewModel.DiscoveredDomain(rs.getString("DOMAIN_NAME"), rs.getString("DISCOVER_DATE")));
}
}
catch (SQLException ex) {
throw new RuntimeException();
}
return new MapModelAndView("siteinfo/start.jte", return new MapModelAndView("siteinfo/start.jte",
Map.of("navbar", NavbarModel.SITEINFO, Map.of("navbar", NavbarModel.SITEINFO,
"model", new SiteOverviewModel(domains))); "model", cachedOverviewModel));
}
/** Background loop that periodically rebuilds {@code cachedOverviewModel} from the database.
 * <p>
 * Each pass selects DOMAIN_NAME and DISCOVER_DATE for the ten EC_DOMAIN rows with the
 * highest ID among those where NODE_AFFINITY = 0, wraps them in a new SiteOverviewModel
 * and assigns that to {@code cachedOverviewModel}. It then sleeps for 15 minutes.
 * The loop ends when the running thread is interrupted.
 */
private void modelUpdater() {
while (!Thread.interrupted()) {
// Start each pass with an empty list; a new model object is assigned below
List<SiteOverviewModel.DiscoveredDomain> domains = new ArrayList<>();
// This query can be quite expensive, so we can't run it on demand
// for every request. Instead, we run it every 15 minutes and cache
// the result.
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
SELECT DOMAIN_NAME, DISCOVER_DATE
FROM EC_DOMAIN
WHERE NODE_AFFINITY = 0
ORDER BY ID DESC
LIMIT 10
"""))
{
var rs = stmt.executeQuery();
while (rs.next()) {
domains.add(new SiteOverviewModel.DiscoveredDomain(
rs.getString("DOMAIN_NAME"),
rs.getString("DISCOVER_DATE"))
);
}
} catch (SQLException ex) {
// Only the exception message is logged; the loop carries on to the assignment below
logger.warn("Failed to get recently added domains: {}", ex.getMessage());
}
// NOTE(review): this assignment also runs after a failed query, in which case the
// previously cached model is replaced by an empty (or partially filled) list --
// confirm this is intended rather than keeping the last good result.
cachedOverviewModel = new SiteOverviewModel(domains);
try {
TimeUnit.MINUTES.sleep(15);
} catch (InterruptedException e) {
// Restore the interrupt flag before leaving the loop
Thread.currentThread().interrupt();
break;
}
}
} }
public record SiteOverviewModel(List<DiscoveredDomain> domains) { public record SiteOverviewModel(List<DiscoveredDomain> domains) {
@@ -107,7 +137,7 @@ public class SearchSiteInfoService {
@PathParam String domainName, @PathParam String domainName,
@QueryParam String view, @QueryParam String view,
@QueryParam Integer page @QueryParam Integer page
) throws SQLException { ) throws SQLException, ExecutionException {
if (null == domainName || domainName.isBlank()) { if (null == domainName || domainName.isBlank()) {
return null; return null;
@@ -193,7 +223,7 @@ public class SearchSiteInfoService {
); );
} }
private SiteInfoWithContext listInfo(Context context, String domainName) { private SiteInfoWithContext listInfo(Context context, String domainName) throws ExecutionException {
var domain = new EdgeDomain(domainName); var domain = new EdgeDomain(domainName);
final int domainId = domainQueries.tryGetDomainId(domain).orElse(-1); final int domainId = domainQueries.tryGetDomainId(domain).orElse(-1);

View File

@@ -1,5 +1,4 @@
@import nu.marginalia.db.DbDomainQueries @import nu.marginalia.db.DbDomainQueries
@import nu.marginalia.model.EdgeDomain
@import nu.marginalia.search.svc.SearchSiteInfoService @import nu.marginalia.search.svc.SearchSiteInfoService
@import nu.marginalia.search.svc.SearchSiteInfoService.* @import nu.marginalia.search.svc.SearchSiteInfoService.*
@import nu.marginalia.search.model.UrlDetails @import nu.marginalia.search.model.UrlDetails
@@ -81,35 +80,6 @@
@endif @endif
@if (!siteInfo.siblingDomains().isEmpty())
<div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
<i class="fas fa-globe"></i>
<span>Related Subdomains</span>
</div>
<table class="min-w-full divide-y divide-gray-200 dark:divide-gray-600 mx-4">
<thead>
<tr class="bg-gray-50 dark:bg-gray-700">
<th scope="col" class="px-2 py-2 text-left text-xs font-medium text-gray-500 dark:text-gray-100 uppercase tracking-wider">Domain Name</th>
</tr>
</thead>
<tbody class="bg-white dark:bg-gray-800 divide-y divide-gray-200 dark:divide-gray-600 text-xs">
@for (DbDomainQueries.DomainWithNode sibling : siteInfo.siblingDomains())
<tr>
<td class="px-3 py-6 md:py-3 whitespace-nowrap">
<a class="text-liteblue dark:text-blue-200" href="/site/${sibling.domain().toString()}">${sibling.domain().toString()}</a>
@if (!sibling.isIndexed())
<i class="ml-1 fa-regular fa-question-circle text-gray-400 dark:text-gray-600 text-xs" title="Not indexed"></i>
@endif
</td>
</tr>
@endfor
</tbody>
</table>
@endif
@if (siteInfo.domainInformation().isUnknownDomain()) @if (siteInfo.domainInformation().isUnknownDomain())
<div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded"> <div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
<i class="fa-regular fa-circle-question"></i> <i class="fa-regular fa-circle-question"></i>
@@ -178,6 +148,36 @@
</form> </form>
@endif @endif
@if (!siteInfo.siblingDomains().isEmpty())
<div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
<i class="fas fa-globe"></i>
<span>Related Subdomains</span>
</div>
<table class="min-w-full divide-y divide-gray-200 dark:divide-gray-600 mx-4">
<thead>
<tr class="bg-gray-50 dark:bg-gray-700">
<th scope="col" class="px-2 py-2 text-left text-xs font-medium text-gray-500 dark:text-gray-100 uppercase tracking-wider">Domain Name</th>
</tr>
</thead>
<tbody class="bg-white dark:bg-gray-800 divide-y divide-gray-200 dark:divide-gray-600 text-xs">
@for (DbDomainQueries.DomainWithNode sibling : siteInfo.siblingDomains())
<tr>
<td class="px-3 py-6 md:py-3 whitespace-nowrap">
<a class="text-liteblue dark:text-blue-200" href="/site/${sibling.domain().toString()}">${sibling.domain().toString()}</a>
@if (!sibling.isIndexed())
<i class="ml-1 fa-regular fa-question-circle text-gray-400 dark:text-gray-600 text-xs" title="Not indexed"></i>
@endif
</td>
</tr>
@endfor
</tbody>
</table>
@endif
@if (siteInfo.isKnown()) @if (siteInfo.isKnown())
<div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded"> <div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
<i class="fas fa-chart-simple"></i> <i class="fas fa-chart-simple"></i>