1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

11 Commits

Author SHA1 Message Date
Viktor Lofgren
b62f043910 (search) Adjust token formation rules to be more lenient to C++ and PHP code.
This addresses Issue #142
2025-01-05 20:50:27 +01:00
Viktor
9b2ceaf37c Merge pull request #141 from MarginaliaSearch/vlofgren-patch-1
Update FUNDING.yml
2025-01-05 18:40:20 +01:00
Viktor
8019c2ce18 Update FUNDING.yml 2025-01-05 18:40:06 +01:00
Viktor Lofgren
4da3563d8a (service) Clean up exceptions when requestScreengrab is not available 2025-01-04 14:45:51 +01:00
Viktor Lofgren
48d0a3089a (service) Improve logging around grpc
This change adds a marker for the gRPC-specific logging, as well as improves the clarity and meaningfulness of the log messages.
2025-01-02 20:40:53 +01:00
Viktor Lofgren
594df64b20 (domain-info) Use appropriate sqlite database when fetching feed status 2025-01-02 20:20:36 +01:00
Viktor Lofgren
78eb1417a7 (service) Only block on SingleNodeChannelPool creation in QueryClient
The code was always blocking for up to 5s while waiting for the remote end to become available, meaning some services would stall for several seconds on start-up for no sensible reason.

This should make most services start faster as a result.
2025-01-02 18:42:01 +01:00
Viktor Lofgren
67edc8f90d (domain-info) Only flag domains with rss feed items as having a feed 2025-01-02 17:41:52 +01:00
Viktor Lofgren
5f576b7d0c (query-parser) Strip leading underscores
This addresses issue #140, where __builtin_ffs gives no results.
2025-01-02 14:39:03 +01:00
Viktor Lofgren
0b65164f60 (chore) Fix broken test 2025-01-01 18:06:29 +01:00
Viktor Lofgren
9be477de33 (domain-info) Add a feed flag to domain info
This is a bit of a sketchy solution that requires both assistant services to run on the same physical machine.
2025-01-01 18:02:33 +01:00
15 changed files with 175 additions and 30 deletions

1
.github/FUNDING.yml vendored
View File

@@ -1,5 +1,6 @@
# These are supported funding model platforms # These are supported funding model platforms
polar: marginalia-search
github: MarginaliaSearch github: MarginaliaSearch
patreon: marginalia_nu patreon: marginalia_nu
open_collective: # Replace with a single Open Collective username open_collective: # Replace with a single Open Collective username

View File

@@ -7,8 +7,6 @@ import nu.marginalia.service.discovery.property.PartitionTraits;
import nu.marginalia.service.discovery.property.ServiceEndpoint; import nu.marginalia.service.discovery.property.ServiceEndpoint;
import nu.marginalia.service.discovery.property.ServiceKey; import nu.marginalia.service.discovery.property.ServiceKey;
import nu.marginalia.service.discovery.property.ServicePartition; import nu.marginalia.service.discovery.property.ServicePartition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List; import java.util.List;
import java.util.concurrent.CompletableFuture; import java.util.concurrent.CompletableFuture;
@@ -24,7 +22,7 @@ import java.util.function.Function;
public class GrpcMultiNodeChannelPool<STUB> { public class GrpcMultiNodeChannelPool<STUB> {
private final ConcurrentHashMap<Integer, GrpcSingleNodeChannelPool<STUB>> pools = private final ConcurrentHashMap<Integer, GrpcSingleNodeChannelPool<STUB>> pools =
new ConcurrentHashMap<>(); new ConcurrentHashMap<>();
private static final Logger logger = LoggerFactory.getLogger(GrpcMultiNodeChannelPool.class);
private final ServiceRegistryIf serviceRegistryIf; private final ServiceRegistryIf serviceRegistryIf;
private final ServiceKey<? extends PartitionTraits.Multicast> serviceKey; private final ServiceKey<? extends PartitionTraits.Multicast> serviceKey;
private final Function<ServiceEndpoint.InstanceAddress, ManagedChannel> channelConstructor; private final Function<ServiceEndpoint.InstanceAddress, ManagedChannel> channelConstructor;

View File

@@ -10,6 +10,8 @@ import nu.marginalia.service.discovery.property.ServiceKey;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import org.slf4j.Marker;
import org.slf4j.MarkerFactory;
import java.time.Duration; import java.time.Duration;
import java.util.*; import java.util.*;
@@ -26,13 +28,13 @@ import java.util.function.Function;
public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor { public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
private final Map<InstanceAddress, ConnectionHolder> channels = new ConcurrentHashMap<>(); private final Map<InstanceAddress, ConnectionHolder> channels = new ConcurrentHashMap<>();
private final Marker grpcMarker = MarkerFactory.getMarker("GRPC");
private static final Logger logger = LoggerFactory.getLogger(GrpcSingleNodeChannelPool.class); private static final Logger logger = LoggerFactory.getLogger(GrpcSingleNodeChannelPool.class);
private final ServiceRegistryIf serviceRegistryIf; private final ServiceRegistryIf serviceRegistryIf;
private final Function<InstanceAddress, ManagedChannel> channelConstructor; private final Function<InstanceAddress, ManagedChannel> channelConstructor;
private final Function<ManagedChannel, STUB> stubConstructor; private final Function<ManagedChannel, STUB> stubConstructor;
public GrpcSingleNodeChannelPool(ServiceRegistryIf serviceRegistryIf, public GrpcSingleNodeChannelPool(ServiceRegistryIf serviceRegistryIf,
ServiceKey<? extends PartitionTraits.Unicast> serviceKey, ServiceKey<? extends PartitionTraits.Unicast> serviceKey,
Function<InstanceAddress, ManagedChannel> channelConstructor, Function<InstanceAddress, ManagedChannel> channelConstructor,
@@ -48,8 +50,6 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
serviceRegistryIf.registerMonitor(this); serviceRegistryIf.registerMonitor(this);
onChange(); onChange();
awaitChannel(Duration.ofSeconds(5));
} }
@@ -62,10 +62,10 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
for (var route : Sets.symmetricDifference(oldRoutes, newRoutes)) { for (var route : Sets.symmetricDifference(oldRoutes, newRoutes)) {
ConnectionHolder oldChannel; ConnectionHolder oldChannel;
if (newRoutes.contains(route)) { if (newRoutes.contains(route)) {
logger.info("Adding route {}", route); logger.info(grpcMarker, "Adding route {} => {}", serviceKey, route);
oldChannel = channels.put(route, new ConnectionHolder(route)); oldChannel = channels.put(route, new ConnectionHolder(route));
} else { } else {
logger.info("Expelling route {}", route); logger.info(grpcMarker, "Expelling route {} => {}", serviceKey, route);
oldChannel = channels.remove(route); oldChannel = channels.remove(route);
} }
if (oldChannel != null) { if (oldChannel != null) {
@@ -103,7 +103,7 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
} }
try { try {
logger.info("Creating channel for {}:{}", serviceKey, address); logger.info(grpcMarker, "Creating channel for {} => {}", serviceKey, address);
value = channelConstructor.apply(address); value = channelConstructor.apply(address);
if (channel.compareAndSet(null, value)) { if (channel.compareAndSet(null, value)) {
return value; return value;
@@ -114,7 +114,7 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
} }
} }
catch (Exception e) { catch (Exception e) {
logger.error("Failed to get channel for " + address, e); logger.error(grpcMarker, "Failed to get channel for " + address, e);
return null; return null;
} }
} }
@@ -206,7 +206,7 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
} }
for (var e : exceptions) { for (var e : exceptions) {
logger.error("Failed to call service {}", serviceKey, e); logger.error(grpcMarker, "Failed to call service {}", serviceKey, e);
} }
throw new ServiceNotAvailableException(serviceKey); throw new ServiceNotAvailableException(serviceKey);

View File

@@ -4,6 +4,11 @@ import nu.marginalia.service.discovery.property.ServiceKey;
public class ServiceNotAvailableException extends RuntimeException { public class ServiceNotAvailableException extends RuntimeException {
public ServiceNotAvailableException(ServiceKey<?> key) { public ServiceNotAvailableException(ServiceKey<?> key) {
super("Service " + key + " not available"); super(key.toString());
}
@Override
public StackTraceElement[] getStackTrace() { // Suppress stack trace
return new StackTraceElement[0];
} }
} }

View File

@@ -48,5 +48,10 @@ public record ServiceEndpoint(String host, int port) {
public int port() { public int port() {
return endpoint.port(); return endpoint.port();
} }
// Renders the address as "host:port [instance]".
@Override
public String toString() {
    StringBuilder text = new StringBuilder();
    text.append(endpoint().host()).append(":").append(endpoint.port());
    text.append(" [").append(instance).append("]");
    return text.toString();
}
} }
} }

View File

@@ -48,6 +48,19 @@ public sealed interface ServiceKey<P extends ServicePartition> {
{ {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
// Renders the key as "rest:" followed by the part of the name after its last period,
// or the whole name when it contains no period.
@Override
public String toString() {
    // lastIndexOf yields -1 when there is no period, so the +1 offset then keeps the full name
    String simpleName = name.substring(name.lastIndexOf('.') + 1);

    return "rest:" + simpleName;
}
} }
record Grpc<P extends ServicePartition>(String name, P partition) implements ServiceKey<P> { record Grpc<P extends ServicePartition>(String name, P partition) implements ServiceKey<P> {
public String baseName() { public String baseName() {
@@ -64,6 +77,18 @@ public sealed interface ServiceKey<P extends ServicePartition> {
{ {
return new Grpc<>(name, partition); return new Grpc<>(name, partition);
} }
// Renders the key as "grpc:" followed by the part of the name after its last period
// (or the whole name when it contains no period) and the partition identifier in brackets.
@Override
public String toString() {
    // lastIndexOf yields -1 when there is no period, so the +1 offset then keeps the full name
    String simpleName = name.substring(name.lastIndexOf('.') + 1);

    return "grpc:" + simpleName + "[" + partition.identifier() + "]";
}
} }
} }

View File

@@ -101,6 +101,7 @@ message RpcSimilarDomain {
bool active = 6; bool active = 6;
bool screenshot = 7; bool screenshot = 7;
LINK_TYPE linkType = 8; LINK_TYPE linkType = 8;
bool feed = 9;
enum LINK_TYPE { enum LINK_TYPE {
BACKWARD = 0; BACKWARD = 0;

View File

@@ -9,6 +9,7 @@ import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.set.TIntSet; import gnu.trove.set.TIntSet;
import gnu.trove.set.hash.TIntHashSet; import gnu.trove.set.hash.TIntHashSet;
import it.unimi.dsi.fastutil.ints.Int2DoubleArrayMap; import it.unimi.dsi.fastutil.ints.Int2DoubleArrayMap;
import nu.marginalia.WmsaHome;
import nu.marginalia.api.domains.RpcSimilarDomain; import nu.marginalia.api.domains.RpcSimilarDomain;
import nu.marginalia.api.domains.model.SimilarDomain; import nu.marginalia.api.domains.model.SimilarDomain;
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient; import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
@@ -17,10 +18,14 @@ import org.roaringbitmap.RoaringBitmap;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.nio.file.Path;
import java.sql.DriverManager;
import java.sql.ResultSet; import java.sql.ResultSet;
import java.sql.SQLException; import java.sql.SQLException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Set;
import java.util.concurrent.Executors; import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
@@ -32,12 +37,13 @@ public class SimilarDomainsService {
private final HikariDataSource dataSource; private final HikariDataSource dataSource;
private final AggregateLinkGraphClient linkGraphClient; private final AggregateLinkGraphClient linkGraphClient;
private volatile TIntIntHashMap domainIdToIdx = new TIntIntHashMap(100_000); private final TIntIntHashMap domainIdToIdx = new TIntIntHashMap(100_000);
private volatile int[] domainIdxToId; private volatile int[] domainIdxToId;
public volatile Int2DoubleArrayMap[] relatedDomains; public volatile Int2DoubleArrayMap[] relatedDomains;
public volatile TIntList[] domainNeighbors = null; public volatile TIntList[] domainNeighbors = null;
public volatile RoaringBitmap screenshotDomains = null; public volatile RoaringBitmap screenshotDomains = null;
public volatile RoaringBitmap feedDomains = null;
public volatile RoaringBitmap activeDomains = null; public volatile RoaringBitmap activeDomains = null;
public volatile RoaringBitmap indexedDomains = null; public volatile RoaringBitmap indexedDomains = null;
public volatile TIntDoubleHashMap domainRanks = null; public volatile TIntDoubleHashMap domainRanks = null;
@@ -82,6 +88,7 @@ public class SimilarDomainsService {
domainNames = new String[domainIdToIdx.size()]; domainNames = new String[domainIdToIdx.size()];
domainNeighbors = new TIntList[domainIdToIdx.size()]; domainNeighbors = new TIntList[domainIdToIdx.size()];
screenshotDomains = new RoaringBitmap(); screenshotDomains = new RoaringBitmap();
feedDomains = new RoaringBitmap();
activeDomains = new RoaringBitmap(); activeDomains = new RoaringBitmap();
indexedDomains = new RoaringBitmap(); indexedDomains = new RoaringBitmap();
relatedDomains = new Int2DoubleArrayMap[domainIdToIdx.size()]; relatedDomains = new Int2DoubleArrayMap[domainIdToIdx.size()];
@@ -145,10 +152,12 @@ public class SimilarDomainsService {
activeDomains.add(idx); activeDomains.add(idx);
} }
updateScreenshotInfo();
logger.info("Loaded {} domains", domainRanks.size()); logger.info("Loaded {} domains", domainRanks.size());
isReady = true; isReady = true;
// We can defer these as they only populate a roaringbitmap, and will degrade gracefully when not complete
updateScreenshotInfo();
updateFeedInfo();
} }
} }
catch (SQLException throwables) { catch (SQLException throwables) {
@@ -156,6 +165,42 @@ public class SimilarDomainsService {
} }
} }
/**
 * Marks in {@code feedDomains} every loaded domain whose name appears in the
 * RSS feed database with at least one feed item.
 *
 * The feed database is the sqlite file {@code rss-feeds.db} under
 * {@code WmsaHome.getDataPath()}. If it cannot be read, the error is logged
 * and only the names collected up to that point are used, so a failure
 * leaves domains unflagged rather than aborting the caller.
 */
private void updateFeedInfo() {
    Set<String> feedsDomainNames = new HashSet<>(500_000);
    Path readerDbPath = WmsaHome.getDataPath().resolve("rss-feeds.db").toAbsolutePath();
    String dbUrl = "jdbc:sqlite:" + readerDbPath;

    logger.info("Opening feed db at {}", dbUrl);

    // The result set is declared as a resource so it is closed explicitly,
    // together with the statement and connection, on both success and failure.
    try (var conn = DriverManager.getConnection(dbUrl);
         var stmt = conn.createStatement();
         var rs = stmt.executeQuery("""
             select
             json_extract(feed, '$.domain') as domain
             from feed
             where json_array_length(feed, '$.items') > 0
             """)) {
        while (rs.next()) {
            feedsDomainNames.add(rs.getString(1));
        }
    }
    catch (SQLException ex) {
        logger.error("Failed to read RSS feed items", ex);
    }

    // Translate the collected domain names into indices of the loaded domains
    for (int idx = 0; idx < domainNames.length; idx++) {
        String name = domainNames[idx];

        // Slots without a name are skipped
        if (name != null && feedsDomainNames.contains(name)) {
            feedDomains.add(idx);
        }
    }
}
private void updateScreenshotInfo() { private void updateScreenshotInfo() {
try (var connection = dataSource.getConnection()) { try (var connection = dataSource.getConnection()) {
try (var stmt = connection.createStatement()) { try (var stmt = connection.createStatement()) {
@@ -254,6 +299,7 @@ public class SimilarDomainsService {
.setIndexed(indexedDomains.contains(idx)) .setIndexed(indexedDomains.contains(idx))
.setActive(activeDomains.contains(idx)) .setActive(activeDomains.contains(idx))
.setScreenshot(screenshotDomains.contains(idx)) .setScreenshot(screenshotDomains.contains(idx))
.setFeed(feedDomains.contains(idx))
.setLinkType(RpcSimilarDomain.LINK_TYPE.valueOf(linkType.name())) .setLinkType(RpcSimilarDomain.LINK_TYPE.valueOf(linkType.name()))
.build()); .build());
@@ -369,6 +415,7 @@ public class SimilarDomainsService {
.setIndexed(indexedDomains.contains(idx)) .setIndexed(indexedDomains.contains(idx))
.setActive(activeDomains.contains(idx)) .setActive(activeDomains.contains(idx))
.setScreenshot(screenshotDomains.contains(idx)) .setScreenshot(screenshotDomains.contains(idx))
.setFeed(feedDomains.contains(idx))
.setLinkType(RpcSimilarDomain.LINK_TYPE.valueOf(linkType.name())) .setLinkType(RpcSimilarDomain.LINK_TYPE.valueOf(linkType.name()))
.build()); .build());

View File

@@ -5,6 +5,7 @@ import com.google.inject.Singleton;
import nu.marginalia.api.livecapture.LiveCaptureApiGrpc.LiveCaptureApiBlockingStub; import nu.marginalia.api.livecapture.LiveCaptureApiGrpc.LiveCaptureApiBlockingStub;
import nu.marginalia.service.client.GrpcChannelPoolFactory; import nu.marginalia.service.client.GrpcChannelPoolFactory;
import nu.marginalia.service.client.GrpcSingleNodeChannelPool; import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
import nu.marginalia.service.client.ServiceNotAvailableException;
import nu.marginalia.service.discovery.property.ServiceKey; import nu.marginalia.service.discovery.property.ServiceKey;
import nu.marginalia.service.discovery.property.ServicePartition; import nu.marginalia.service.discovery.property.ServicePartition;
import org.slf4j.Logger; import org.slf4j.Logger;
@@ -29,6 +30,9 @@ public class LiveCaptureClient {
channelPool.call(LiveCaptureApiBlockingStub::requestScreengrab) channelPool.call(LiveCaptureApiBlockingStub::requestScreengrab)
.run(RpcDomainId.newBuilder().setDomainId(domainId).build()); .run(RpcDomainId.newBuilder().setDomainId(domainId).build());
} }
catch (ServiceNotAvailableException e) {
logger.info("requestScreengrab() failed since the service is not available");
}
catch (Exception e) { catch (Exception e) {
logger.error("API Exception", e); logger.error("API Exception", e);
} }

View File

@@ -10,7 +10,6 @@ public class TestXmlSanitization {
Assertions.assertEquals("&amp;", FeedFetcherService.sanitizeEntities("&amp;")); Assertions.assertEquals("&amp;", FeedFetcherService.sanitizeEntities("&amp;"));
Assertions.assertEquals("&lt;", FeedFetcherService.sanitizeEntities("&lt;")); Assertions.assertEquals("&lt;", FeedFetcherService.sanitizeEntities("&lt;"));
Assertions.assertEquals("&gt;", FeedFetcherService.sanitizeEntities("&gt;")); Assertions.assertEquals("&gt;", FeedFetcherService.sanitizeEntities("&gt;"));
Assertions.assertEquals("&quot;", FeedFetcherService.sanitizeEntities("&quot;"));
Assertions.assertEquals("&apos;", FeedFetcherService.sanitizeEntities("&apos;")); Assertions.assertEquals("&apos;", FeedFetcherService.sanitizeEntities("&apos;"));
} }
@@ -23,4 +22,9 @@ public class TestXmlSanitization {
public void testTranslatedHtmlEntity() { public void testTranslatedHtmlEntity() {
Assertions.assertEquals("Foo -- Bar", FeedFetcherService.sanitizeEntities("Foo &mdash; Bar")); Assertions.assertEquals("Foo -- Bar", FeedFetcherService.sanitizeEntities("Foo &mdash; Bar"));
} }
// Checks what sanitizeEntities produces for text wrapped in quot entities
@Test
public void testTranslatedHtmlEntityQuot() {
    String sanitized = FeedFetcherService.sanitizeEntities("&quot;Bob&quot;");

    Assertions.assertEquals("\"Bob\"", sanitized);
}
} }

View File

@@ -9,10 +9,9 @@ import nu.marginalia.service.client.GrpcChannelPoolFactory;
import nu.marginalia.service.client.GrpcSingleNodeChannelPool; import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
import nu.marginalia.service.discovery.property.ServiceKey; import nu.marginalia.service.discovery.property.ServiceKey;
import nu.marginalia.service.discovery.property.ServicePartition; import nu.marginalia.service.discovery.property.ServicePartition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.CheckReturnValue; import javax.annotation.CheckReturnValue;
import java.time.Duration;
@Singleton @Singleton
public class QueryClient { public class QueryClient {
@@ -24,13 +23,14 @@ public class QueryClient {
private final GrpcSingleNodeChannelPool<QueryApiGrpc.QueryApiBlockingStub> queryApiPool; private final GrpcSingleNodeChannelPool<QueryApiGrpc.QueryApiBlockingStub> queryApiPool;
private final Logger logger = LoggerFactory.getLogger(getClass());
@Inject @Inject
public QueryClient(GrpcChannelPoolFactory channelPoolFactory) { public QueryClient(GrpcChannelPoolFactory channelPoolFactory) throws InterruptedException {
this.queryApiPool = channelPoolFactory.createSingle( this.queryApiPool = channelPoolFactory.createSingle(
ServiceKey.forGrpcApi(QueryApiGrpc.class, ServicePartition.any()), ServiceKey.forGrpcApi(QueryApiGrpc.class, ServicePartition.any()),
QueryApiGrpc::newBlockingStub); QueryApiGrpc::newBlockingStub);
// Hold up initialization until we have a downstream connection
this.queryApiPool.awaitChannel(Duration.ofSeconds(5));
} }
@CheckReturnValue @CheckReturnValue

View File

@@ -155,16 +155,25 @@ public class QueryParser {
// Remove trailing punctuation // Remove trailing punctuation
int lastChar = str.charAt(str.length() - 1); int lastChar = str.charAt(str.length() - 1);
if (":.,!?$'".indexOf(lastChar) >= 0) if (":.,!?$'".indexOf(lastChar) >= 0) {
entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 1), lt.displayStr())); str = str.substring(0, str.length() - 1);
entity.replace(new QueryToken.LiteralTerm(str, lt.displayStr()));
}
// Remove term elements that aren't indexed by the search engine // Remove term elements that aren't indexed by the search engine
if (str.endsWith("'s")) if (str.endsWith("'s")) {
entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 2), lt.displayStr())); str = str.substring(0, str.length() - 2);
if (str.endsWith("()")) entity.replace(new QueryToken.LiteralTerm(str, lt.displayStr()));
entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 2), lt.displayStr())); }
if (str.startsWith("$")) if (str.endsWith("()")) {
entity.replace(new QueryToken.LiteralTerm(str.substring(1), lt.displayStr())); str = str.substring(0, str.length() - 2);
entity.replace(new QueryToken.LiteralTerm(str, lt.displayStr()));
}
while (str.startsWith("$") || str.startsWith("_")) {
str = str.substring(1);
entity.replace(new QueryToken.LiteralTerm(str, lt.displayStr()));
}
if (entity.isBlank()) { if (entity.isBlank()) {
entity.remove(); entity.remove();

View File

@@ -0,0 +1,32 @@
package nu.marginalia.functions.searchquery.query_parser;
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.util.List;
/** Tests for how QueryParser reduces a single decorated search term to its literal form. */
class QueryParserTest {

    // https://github.com/MarginaliaSearch/MarginaliaSearch/issues/140
    @Test
    void parse__builtin_ffs() {
        assertSingleLiteral("__builtin_ffs", "builtin_ffs");
    }

    @Test
    void trailingParens() {
        assertSingleLiteral("strcpy()", "strcpy");
    }

    @Test
    void trailingQuote() {
        assertSingleLiteral("bob's", "bob");
    }

    /**
     * Parses {@code query} with a fresh parser and asserts the result is exactly one
     * literal term whose term string is {@code expectedTerm} and whose display string
     * is the unmodified query.
     */
    private static void assertSingleLiteral(String query, String expectedTerm) {
        var actual = new QueryParser().parse(query);
        var expected = List.of(new QueryToken.LiteralTerm(expectedTerm, query));

        Assertions.assertEquals(expected, actual);
    }
}

View File

@@ -27,7 +27,7 @@ public class SentenceSegmentSplitter {
else { else {
// If we flatten unicode, we do this... // If we flatten unicode, we do this...
// FIXME: This can almost definitely be cleaned up and simplified. // FIXME: This can almost definitely be cleaned up and simplified.
wordBreakPattern = Pattern.compile("([^/_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))"); wordBreakPattern = Pattern.compile("([^/<>$:_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))");
} }
} }

View File

@@ -28,6 +28,20 @@ class SentenceExtractorTest {
System.out.println(dld); System.out.println(dld);
} }
// A C++ scope-qualified name is expected to come out as a single word
@Test
void testCplusplus() {
    var noTags = EnumSet.noneOf(HtmlTag.class);
    var sentence = sentenceExtractor.extractSentence("std::vector", noTags);

    assertEquals(1, sentence.length());
    assertEquals("std::vector", sentence.wordsLowerCase[0]);
}
// A PHP superglobal is expected to come out as a single, lower-cased word
@Test
void testPHP() {
    var noTags = EnumSet.noneOf(HtmlTag.class);
    var sentence = sentenceExtractor.extractSentence("$_GET", noTags);

    assertEquals(1, sentence.length());
    assertEquals("$_get", sentence.wordsLowerCase[0]);
}
@Test @Test
void testPolishArtist() { void testPolishArtist() {
var dld = sentenceExtractor.extractSentence("Uklański", EnumSet.noneOf(HtmlTag.class)); var dld = sentenceExtractor.extractSentence("Uklański", EnumSet.noneOf(HtmlTag.class));