mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
7 Commits
deploy-001
...
deploy-002
Author | SHA1 | Date | |
---|---|---|---|
|
594df64b20 | ||
|
78eb1417a7 | ||
|
67edc8f90d | ||
|
5f576b7d0c | ||
|
0b65164f60 | ||
|
9be477de33 | ||
|
710af4999a |
@@ -32,7 +32,6 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
|
|||||||
private final Function<InstanceAddress, ManagedChannel> channelConstructor;
|
private final Function<InstanceAddress, ManagedChannel> channelConstructor;
|
||||||
private final Function<ManagedChannel, STUB> stubConstructor;
|
private final Function<ManagedChannel, STUB> stubConstructor;
|
||||||
|
|
||||||
|
|
||||||
public GrpcSingleNodeChannelPool(ServiceRegistryIf serviceRegistryIf,
|
public GrpcSingleNodeChannelPool(ServiceRegistryIf serviceRegistryIf,
|
||||||
ServiceKey<? extends PartitionTraits.Unicast> serviceKey,
|
ServiceKey<? extends PartitionTraits.Unicast> serviceKey,
|
||||||
Function<InstanceAddress, ManagedChannel> channelConstructor,
|
Function<InstanceAddress, ManagedChannel> channelConstructor,
|
||||||
@@ -48,8 +47,6 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
|
|||||||
serviceRegistryIf.registerMonitor(this);
|
serviceRegistryIf.registerMonitor(this);
|
||||||
|
|
||||||
onChange();
|
onChange();
|
||||||
|
|
||||||
awaitChannel(Duration.ofSeconds(5));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@@ -101,6 +101,7 @@ message RpcSimilarDomain {
|
|||||||
bool active = 6;
|
bool active = 6;
|
||||||
bool screenshot = 7;
|
bool screenshot = 7;
|
||||||
LINK_TYPE linkType = 8;
|
LINK_TYPE linkType = 8;
|
||||||
|
bool feed = 9;
|
||||||
|
|
||||||
enum LINK_TYPE {
|
enum LINK_TYPE {
|
||||||
BACKWARD = 0;
|
BACKWARD = 0;
|
||||||
|
@@ -9,6 +9,7 @@ import gnu.trove.map.hash.TIntIntHashMap;
|
|||||||
import gnu.trove.set.TIntSet;
|
import gnu.trove.set.TIntSet;
|
||||||
import gnu.trove.set.hash.TIntHashSet;
|
import gnu.trove.set.hash.TIntHashSet;
|
||||||
import it.unimi.dsi.fastutil.ints.Int2DoubleArrayMap;
|
import it.unimi.dsi.fastutil.ints.Int2DoubleArrayMap;
|
||||||
|
import nu.marginalia.WmsaHome;
|
||||||
import nu.marginalia.api.domains.RpcSimilarDomain;
|
import nu.marginalia.api.domains.RpcSimilarDomain;
|
||||||
import nu.marginalia.api.domains.model.SimilarDomain;
|
import nu.marginalia.api.domains.model.SimilarDomain;
|
||||||
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
|
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
|
||||||
@@ -17,10 +18,14 @@ import org.roaringbitmap.RoaringBitmap;
|
|||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.sql.DriverManager;
|
||||||
import java.sql.ResultSet;
|
import java.sql.ResultSet;
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
import java.util.concurrent.Executors;
|
import java.util.concurrent.Executors;
|
||||||
import java.util.concurrent.ScheduledExecutorService;
|
import java.util.concurrent.ScheduledExecutorService;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
@@ -32,12 +37,13 @@ public class SimilarDomainsService {
|
|||||||
private final HikariDataSource dataSource;
|
private final HikariDataSource dataSource;
|
||||||
private final AggregateLinkGraphClient linkGraphClient;
|
private final AggregateLinkGraphClient linkGraphClient;
|
||||||
|
|
||||||
private volatile TIntIntHashMap domainIdToIdx = new TIntIntHashMap(100_000);
|
private final TIntIntHashMap domainIdToIdx = new TIntIntHashMap(100_000);
|
||||||
private volatile int[] domainIdxToId;
|
private volatile int[] domainIdxToId;
|
||||||
|
|
||||||
public volatile Int2DoubleArrayMap[] relatedDomains;
|
public volatile Int2DoubleArrayMap[] relatedDomains;
|
||||||
public volatile TIntList[] domainNeighbors = null;
|
public volatile TIntList[] domainNeighbors = null;
|
||||||
public volatile RoaringBitmap screenshotDomains = null;
|
public volatile RoaringBitmap screenshotDomains = null;
|
||||||
|
public volatile RoaringBitmap feedDomains = null;
|
||||||
public volatile RoaringBitmap activeDomains = null;
|
public volatile RoaringBitmap activeDomains = null;
|
||||||
public volatile RoaringBitmap indexedDomains = null;
|
public volatile RoaringBitmap indexedDomains = null;
|
||||||
public volatile TIntDoubleHashMap domainRanks = null;
|
public volatile TIntDoubleHashMap domainRanks = null;
|
||||||
@@ -82,6 +88,7 @@ public class SimilarDomainsService {
|
|||||||
domainNames = new String[domainIdToIdx.size()];
|
domainNames = new String[domainIdToIdx.size()];
|
||||||
domainNeighbors = new TIntList[domainIdToIdx.size()];
|
domainNeighbors = new TIntList[domainIdToIdx.size()];
|
||||||
screenshotDomains = new RoaringBitmap();
|
screenshotDomains = new RoaringBitmap();
|
||||||
|
feedDomains = new RoaringBitmap();
|
||||||
activeDomains = new RoaringBitmap();
|
activeDomains = new RoaringBitmap();
|
||||||
indexedDomains = new RoaringBitmap();
|
indexedDomains = new RoaringBitmap();
|
||||||
relatedDomains = new Int2DoubleArrayMap[domainIdToIdx.size()];
|
relatedDomains = new Int2DoubleArrayMap[domainIdToIdx.size()];
|
||||||
@@ -145,10 +152,12 @@ public class SimilarDomainsService {
|
|||||||
activeDomains.add(idx);
|
activeDomains.add(idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
updateScreenshotInfo();
|
|
||||||
|
|
||||||
logger.info("Loaded {} domains", domainRanks.size());
|
logger.info("Loaded {} domains", domainRanks.size());
|
||||||
isReady = true;
|
isReady = true;
|
||||||
|
|
||||||
|
// We can defer these as they only populate a roaringbitmap, and will degrade gracefully when not complete
|
||||||
|
updateScreenshotInfo();
|
||||||
|
updateFeedInfo();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (SQLException throwables) {
|
catch (SQLException throwables) {
|
||||||
@@ -156,6 +165,42 @@ public class SimilarDomainsService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void updateFeedInfo() {
|
||||||
|
Set<String> feedsDomainNames = new HashSet<>(500_000);
|
||||||
|
Path readerDbPath = WmsaHome.getDataPath().resolve("rss-feeds.db").toAbsolutePath();
|
||||||
|
String dbUrl = "jdbc:sqlite:" + readerDbPath;
|
||||||
|
|
||||||
|
logger.info("Opening feed db at " + dbUrl);
|
||||||
|
|
||||||
|
try (var conn = DriverManager.getConnection(dbUrl);
|
||||||
|
var stmt = conn.createStatement()) {
|
||||||
|
var rs = stmt.executeQuery("""
|
||||||
|
select
|
||||||
|
json_extract(feed, '$.domain') as domain
|
||||||
|
from feed
|
||||||
|
where json_array_length(feed, '$.items') > 0
|
||||||
|
""");
|
||||||
|
while (rs.next()) {
|
||||||
|
feedsDomainNames.add(rs.getString(1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
logger.error("Failed to read RSS feed items", ex);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int idx = 0; idx < domainNames.length; idx++) {
|
||||||
|
String name = domainNames[idx];
|
||||||
|
if (name == null) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (feedsDomainNames.contains(name)) {
|
||||||
|
feedDomains.add(idx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
private void updateScreenshotInfo() {
|
private void updateScreenshotInfo() {
|
||||||
try (var connection = dataSource.getConnection()) {
|
try (var connection = dataSource.getConnection()) {
|
||||||
try (var stmt = connection.createStatement()) {
|
try (var stmt = connection.createStatement()) {
|
||||||
@@ -254,6 +299,7 @@ public class SimilarDomainsService {
|
|||||||
.setIndexed(indexedDomains.contains(idx))
|
.setIndexed(indexedDomains.contains(idx))
|
||||||
.setActive(activeDomains.contains(idx))
|
.setActive(activeDomains.contains(idx))
|
||||||
.setScreenshot(screenshotDomains.contains(idx))
|
.setScreenshot(screenshotDomains.contains(idx))
|
||||||
|
.setFeed(feedDomains.contains(idx))
|
||||||
.setLinkType(RpcSimilarDomain.LINK_TYPE.valueOf(linkType.name()))
|
.setLinkType(RpcSimilarDomain.LINK_TYPE.valueOf(linkType.name()))
|
||||||
.build());
|
.build());
|
||||||
|
|
||||||
@@ -369,6 +415,7 @@ public class SimilarDomainsService {
|
|||||||
.setIndexed(indexedDomains.contains(idx))
|
.setIndexed(indexedDomains.contains(idx))
|
||||||
.setActive(activeDomains.contains(idx))
|
.setActive(activeDomains.contains(idx))
|
||||||
.setScreenshot(screenshotDomains.contains(idx))
|
.setScreenshot(screenshotDomains.contains(idx))
|
||||||
|
.setFeed(feedDomains.contains(idx))
|
||||||
.setLinkType(RpcSimilarDomain.LINK_TYPE.valueOf(linkType.name()))
|
.setLinkType(RpcSimilarDomain.LINK_TYPE.valueOf(linkType.name()))
|
||||||
.build());
|
.build());
|
||||||
|
|
||||||
|
@@ -402,6 +402,7 @@ public class FeedFetcherService {
|
|||||||
"–", "-",
|
"–", "-",
|
||||||
"’", "'",
|
"’", "'",
|
||||||
"‘", "'",
|
"‘", "'",
|
||||||
|
""", "\"",
|
||||||
" ", ""
|
" ", ""
|
||||||
);
|
);
|
||||||
|
|
||||||
|
@@ -10,7 +10,6 @@ public class TestXmlSanitization {
|
|||||||
Assertions.assertEquals("&", FeedFetcherService.sanitizeEntities("&"));
|
Assertions.assertEquals("&", FeedFetcherService.sanitizeEntities("&"));
|
||||||
Assertions.assertEquals("<", FeedFetcherService.sanitizeEntities("<"));
|
Assertions.assertEquals("<", FeedFetcherService.sanitizeEntities("<"));
|
||||||
Assertions.assertEquals(">", FeedFetcherService.sanitizeEntities(">"));
|
Assertions.assertEquals(">", FeedFetcherService.sanitizeEntities(">"));
|
||||||
Assertions.assertEquals("&quot;", FeedFetcherService.sanitizeEntities("&quot;"));
|
|
||||||
Assertions.assertEquals("'", FeedFetcherService.sanitizeEntities("'"));
|
Assertions.assertEquals("'", FeedFetcherService.sanitizeEntities("'"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -23,4 +22,9 @@ public class TestXmlSanitization {
|
|||||||
public void testTranslatedHtmlEntity() {
|
public void testTranslatedHtmlEntity() {
|
||||||
Assertions.assertEquals("Foo -- Bar", FeedFetcherService.sanitizeEntities("Foo — Bar"));
|
Assertions.assertEquals("Foo -- Bar", FeedFetcherService.sanitizeEntities("Foo — Bar"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testTranslatedHtmlEntityQuot() {
|
||||||
|
Assertions.assertEquals("\"Bob\"", FeedFetcherService.sanitizeEntities(""Bob""));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@@ -9,10 +9,9 @@ import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
|||||||
import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
|
import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
|
||||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import javax.annotation.CheckReturnValue;
|
import javax.annotation.CheckReturnValue;
|
||||||
|
import java.time.Duration;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
public class QueryClient {
|
public class QueryClient {
|
||||||
@@ -24,13 +23,14 @@ public class QueryClient {
|
|||||||
|
|
||||||
private final GrpcSingleNodeChannelPool<QueryApiGrpc.QueryApiBlockingStub> queryApiPool;
|
private final GrpcSingleNodeChannelPool<QueryApiGrpc.QueryApiBlockingStub> queryApiPool;
|
||||||
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public QueryClient(GrpcChannelPoolFactory channelPoolFactory) {
|
public QueryClient(GrpcChannelPoolFactory channelPoolFactory) throws InterruptedException {
|
||||||
this.queryApiPool = channelPoolFactory.createSingle(
|
this.queryApiPool = channelPoolFactory.createSingle(
|
||||||
ServiceKey.forGrpcApi(QueryApiGrpc.class, ServicePartition.any()),
|
ServiceKey.forGrpcApi(QueryApiGrpc.class, ServicePartition.any()),
|
||||||
QueryApiGrpc::newBlockingStub);
|
QueryApiGrpc::newBlockingStub);
|
||||||
|
|
||||||
|
// Hold up initialization until we have a downstream connection
|
||||||
|
this.queryApiPool.awaitChannel(Duration.ofSeconds(5));
|
||||||
}
|
}
|
||||||
|
|
||||||
@CheckReturnValue
|
@CheckReturnValue
|
||||||
|
@@ -155,16 +155,25 @@ public class QueryParser {
|
|||||||
|
|
||||||
// Remove trailing punctuation
|
// Remove trailing punctuation
|
||||||
int lastChar = str.charAt(str.length() - 1);
|
int lastChar = str.charAt(str.length() - 1);
|
||||||
if (":.,!?$'".indexOf(lastChar) >= 0)
|
if (":.,!?$'".indexOf(lastChar) >= 0) {
|
||||||
entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 1), lt.displayStr()));
|
str = str.substring(0, str.length() - 1);
|
||||||
|
entity.replace(new QueryToken.LiteralTerm(str, lt.displayStr()));
|
||||||
|
}
|
||||||
|
|
||||||
// Remove term elements that aren't indexed by the search engine
|
// Remove term elements that aren't indexed by the search engine
|
||||||
if (str.endsWith("'s"))
|
if (str.endsWith("'s")) {
|
||||||
entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 2), lt.displayStr()));
|
str = str.substring(0, str.length() - 2);
|
||||||
if (str.endsWith("()"))
|
entity.replace(new QueryToken.LiteralTerm(str, lt.displayStr()));
|
||||||
entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 2), lt.displayStr()));
|
}
|
||||||
if (str.startsWith("$"))
|
if (str.endsWith("()")) {
|
||||||
entity.replace(new QueryToken.LiteralTerm(str.substring(1), lt.displayStr()));
|
str = str.substring(0, str.length() - 2);
|
||||||
|
entity.replace(new QueryToken.LiteralTerm(str, lt.displayStr()));
|
||||||
|
}
|
||||||
|
|
||||||
|
while (str.startsWith("$") || str.startsWith("_")) {
|
||||||
|
str = str.substring(1);
|
||||||
|
entity.replace(new QueryToken.LiteralTerm(str, lt.displayStr()));
|
||||||
|
}
|
||||||
|
|
||||||
if (entity.isBlank()) {
|
if (entity.isBlank()) {
|
||||||
entity.remove();
|
entity.remove();
|
||||||
|
@@ -0,0 +1,32 @@
|
|||||||
|
package nu.marginalia.functions.searchquery.query_parser;
|
||||||
|
|
||||||
|
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
|
||||||
|
import org.junit.jupiter.api.Assertions;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
class QueryParserTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
// https://github.com/MarginaliaSearch/MarginaliaSearch/issues/140
|
||||||
|
void parse__builtin_ffs() {
|
||||||
|
QueryParser parser = new QueryParser();
|
||||||
|
var tokens = parser.parse("__builtin_ffs");
|
||||||
|
Assertions.assertEquals(List.of(new QueryToken.LiteralTerm("builtin_ffs", "__builtin_ffs")), tokens);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void trailingParens() {
|
||||||
|
QueryParser parser = new QueryParser();
|
||||||
|
var tokens = parser.parse("strcpy()");
|
||||||
|
Assertions.assertEquals(List.of(new QueryToken.LiteralTerm("strcpy", "strcpy()")), tokens);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void trailingQuote() {
|
||||||
|
QueryParser parser = new QueryParser();
|
||||||
|
var tokens = parser.parse("bob's");
|
||||||
|
Assertions.assertEquals(List.of(new QueryToken.LiteralTerm("bob", "bob's")), tokens);
|
||||||
|
}
|
||||||
|
}
|
Reference in New Issue
Block a user