Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-10-05 21:22:39 +02:00)

Compare commits: deploy-027...827aadafcd (234 commits)
@@ -48,10 +48,6 @@ filter for any API consumer.

I've talked to the stract dev and he does not think it's a good idea to mimic their optics language, which is quite ad-hoc, but instead to work together to find some new common description language for this.

## Show favicons next to search results

This is expected from search engines. Basic proof of concept sketch of fetching this data has been done, but the feature is some way from being reality.

## Specialized crawler for github

One of the search engine's biggest limitations right now is that it does not index github at all. A specialized crawler that fetches at least the readme.md would go a long way toward providing search capabilities in this domain.

@@ -66,6 +62,10 @@ The documents database probably should have some sort of flag indicating it's a

PDF parsing is known to be a bit of a security liability so some thought needs to be put in
that direction as well.

## Show favicons next to search results (COMPLETED 2025-03)

This is expected from search engines. Basic proof of concept sketch of fetching this data has been done, but the feature is some way from being reality.

## Web Design Overhaul (COMPLETED 2025-01)

The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
@@ -5,13 +5,15 @@ import java.util.Collection;
public enum HtmlFeature {
// Note, the first 32 of these features are bit encoded in the database
// so be sure to keep anything that's potentially important toward the top
// of the list
// of the list; but adding new values will shift the encoded values and break
// binary compatibility! Scroll down for a marker where you should add new values
// if they need to be accessible from IndexResultScoreCalculator!

MEDIA( "special:media"),
JS("special:scripts"),
AFFILIATE_LINK( "special:affiliate"),
TRACKING("special:tracking"),
TRACKING_ADTECH("special:ads"), // We'll call this ads for now
TRACKING_ADTECH("special:adtech"),

KEBAB_CASE_URL("special:kcurl"), // https://www.example.com/urls-that-look-like-this/
LONG_URL("special:longurl"),
@@ -30,6 +32,15 @@ public enum HtmlFeature {

PDF("format:pdf"),

POPOVER("special:popover"),
CONSENT("special:consent"),
SHORT_DOCUMENT("special:shorty"),
THIRD_PARTY_REQUESTS("special:3pr"),

// Here! It is generally safe to add additional values here without
// disrupting the encoded values used by the DocumentValuator
// class in the index!

/** For fingerprinting and ranking */
OPENGRAPH("special:opengraph"),
OPENGRAPH_IMAGE("special:opengraph:image"),
@@ -67,6 +78,7 @@ public enum HtmlFeature {

S3_FEATURE("special:s3"),

MISSING_DOM_SAMPLE("special:nosample"),
UNKNOWN("special:uncategorized");
@@ -7,7 +7,6 @@ public enum ServiceId {
Search("search-service"),
Index("index-service"),
Query("query-service"),
Executor("executor-service"),

Control("control-service"),
@@ -13,6 +13,7 @@ import nu.marginalia.service.discovery.property.ServicePartition;
import nu.marginalia.util.NamedExecutorFactory;

import java.util.concurrent.Executor;
import java.util.concurrent.Executors;
import java.util.function.Function;

@Singleton
@@ -20,10 +21,15 @@ public class GrpcChannelPoolFactory {

private final NodeConfigurationWatcher nodeConfigurationWatcher;
private final ServiceRegistryIf serviceRegistryIf;
private static final Executor executor = NamedExecutorFactory.createFixed("gRPC-Channel-Pool",
Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
private static final Executor offloadExecutor = NamedExecutorFactory.createFixed("gRPC-Offload-Pool",
Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));

private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");

private static final Executor executor = useLoom
? Executors.newVirtualThreadPerTaskExecutor()
: NamedExecutorFactory.createFixed("gRPC-Channel-Pool", Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
private static final Executor offloadExecutor = useLoom
? Executors.newVirtualThreadPerTaskExecutor()
: NamedExecutorFactory.createFixed("gRPC-Offload-Pool", Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));

@Inject
public GrpcChannelPoolFactory(NodeConfigurationWatcher nodeConfigurationWatcher,
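The `useLoom` toggle introduced here reappears in several client classes further down (DomainInfoClient, FeedsClient, MathClient, IndexClient). `Boolean.getBoolean("system.experimentalUseLoom")` reads a JVM system property, so the virtual-thread executors are only selected when the service is launched with `-Dsystem.experimentalUseLoom=true`. A minimal sketch of the pattern in isolation — the class and pool names below are illustrative, not taken from the patch:

```java
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

class LoomToggleSketch {
    // false unless the JVM was launched with -Dsystem.experimentalUseLoom=true
    private static final boolean USE_LOOM = Boolean.getBoolean("system.experimentalUseLoom");

    static ExecutorService newWorkPool(int nThreads) {
        // Virtual threads under the experimental flag, a bounded platform-thread pool otherwise
        return USE_LOOM
                ? Executors.newVirtualThreadPerTaskExecutor()
                : Executors.newFixedThreadPool(nThreads);
    }
}
```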
@@ -2,6 +2,7 @@ package nu.marginalia.service.client;

import com.google.common.collect.Sets;
import io.grpc.ManagedChannel;
import io.grpc.StatusRuntimeException;
import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.discovery.monitor.ServiceChangeMonitor;
import nu.marginalia.service.discovery.property.PartitionTraits;
@@ -206,6 +207,11 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
}

for (var e : exceptions) {
if (e instanceof StatusRuntimeException se) {
throw se; // Re-throw SRE as-is
}

// If there are other exceptions, log them
logger.error(grpcMarker, "Failed to call service {}", serviceKey, e);
}
@@ -1,9 +1,9 @@
package nu.marginalia.service.server;

import io.grpc.Server;
import io.grpc.netty.shaded.io.grpc.netty.NettyServerBuilder;
import io.grpc.netty.shaded.io.netty.channel.nio.NioEventLoopGroup;
import io.grpc.netty.shaded.io.netty.channel.socket.nio.NioServerSocketChannel;
import io.grpc.netty.NettyServerBuilder;
import io.netty.channel.nio.NioEventLoopGroup;
import io.netty.channel.socket.nio.NioServerSocketChannel;
import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.discovery.property.ServiceKey;
import nu.marginalia.service.discovery.property.ServicePartition;
@@ -13,9 +13,14 @@ import nu.marginalia.util.NamedExecutorFactory;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class GrpcServer {
private final Server server;

private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");

public GrpcServer(ServiceConfiguration config,
ServiceRegistryIf serviceRegistry,
ServicePartition partition,
@@ -26,13 +31,19 @@ public class GrpcServer {
int nThreads = Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 16);

// Start the gRPC server

ExecutorService workExecutor = useLoom ?
Executors.newVirtualThreadPerTaskExecutor() :
NamedExecutorFactory.createFixed("nettyExecutor", nThreads);

var grpcServerBuilder = NettyServerBuilder.forAddress(new InetSocketAddress(config.bindAddress(), port))
.executor(NamedExecutorFactory.createFixed("nettyExecutor", nThreads))
.executor(workExecutor)
.workerEventLoopGroup(new NioEventLoopGroup(nThreads, NamedExecutorFactory.createFixed("Worker-ELG", nThreads)))
.bossEventLoopGroup(new NioEventLoopGroup(nThreads, NamedExecutorFactory.createFixed("Boss-ELG", nThreads)))
.channelType(NioServerSocketChannel.class);

for (var grpcService : grpcServices) {

if (!grpcService.shouldRegisterService()) {
continue;
}
@@ -125,8 +125,7 @@ public class JoobyService {
// Set a cap on the number of worker threads, as Jooby's default value does not seem to consider
// multi-tenant servers with high thread counts, and spins up an exorbitant number of threads in that
// scenario
options.setWorkerThreads(Math.min(128, options.getWorkerThreads()));

options.setWorkerThreads(Math.min(16, options.getWorkerThreads()));

jooby.setServerOptions(options);
@@ -189,7 +189,7 @@ public class ExecutorClient {
String uriPath = "/transfer/file/" + fileStorage.id();
String uriQuery = "path=" + URLEncoder.encode(path, StandardCharsets.UTF_8);

var endpoints = registry.getEndpoints(ServiceKey.forRest(ServiceId.Executor, fileStorage.node()));
var endpoints = registry.getEndpoints(ServiceKey.forRest(ServiceId.Index, fileStorage.node()));
if (endpoints.isEmpty()) {
throw new RuntimeException("No endpoints for node " + fileStorage.node());
}
@@ -1,6 +1,7 @@
package nu.marginalia.execution;

import com.google.inject.Inject;
import io.grpc.Status;
import io.grpc.stub.StreamObserver;
import nu.marginalia.actor.ExecutorActor;
import nu.marginalia.actor.ExecutorActorControlService;
@@ -36,7 +37,7 @@ public class ExecutorCrawlGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@@ -52,7 +53,7 @@ public class ExecutorCrawlGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@@ -66,7 +67,7 @@ public class ExecutorCrawlGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@@ -80,7 +81,7 @@ public class ExecutorCrawlGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@@ -98,7 +99,7 @@ public class ExecutorCrawlGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
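The recurring change in these executor services swaps `responseObserver.onError(e)` for an explicit gRPC status. As far as grpc-java's behavior goes, a throwable that is not a StatusException/StatusRuntimeException surfaces to the caller as Status.UNKNOWN, so wrapping the cause in Status.INTERNAL gives clients a well-defined status code instead. A hedged sketch of the pattern — the helper name is made up for illustration:

```java
import io.grpc.Status;
import io.grpc.stub.StreamObserver;

class GrpcErrorSketch {
    static <T> void completeOrFail(StreamObserver<T> observer, T value) {
        try {
            observer.onNext(value);
            observer.onCompleted();
        }
        catch (Exception e) {
            // An explicit INTERNAL status with the cause attached, rather than letting
            // the runtime collapse an arbitrary exception into UNKNOWN.
            observer.onError(Status.INTERNAL.withCause(e).asRuntimeException());
        }
    }
}
```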
@@ -2,6 +2,7 @@ package nu.marginalia.execution;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import io.grpc.Status;
import io.grpc.stub.StreamObserver;
import nu.marginalia.actor.ExecutorActor;
import nu.marginalia.actor.ExecutorActorControlService;
@@ -38,7 +39,7 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@@ -57,7 +58,7 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@@ -73,7 +74,7 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@@ -87,7 +88,7 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@@ -99,7 +100,7 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@@ -114,14 +115,14 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@Override
public void exportAllAtags(Empty request, StreamObserver<Empty> responseObserver) {
if (serviceConfiguration.node() != 1) {
responseObserver.onError(new IllegalArgumentException("Export all atags is only available on node 1"));
responseObserver.onError(Status.UNAVAILABLE.withDescription("Export all atags is only available on node 1").asRuntimeException());
}
try {
actorControlService.startFrom(ExecutorActor.PREC_EXPORT_ALL,
@@ -131,7 +132,7 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@@ -145,7 +146,7 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@@ -159,7 +160,7 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
}
@@ -1,6 +1,7 @@
package nu.marginalia.execution;

import com.google.inject.Inject;
import io.grpc.Status;
import io.grpc.stub.StreamObserver;
import nu.marginalia.WmsaHome;
import nu.marginalia.actor.ActorApi;
@@ -58,7 +59,7 @@ public class ExecutorGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@@ -70,7 +71,7 @@ public class ExecutorGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@@ -82,7 +83,7 @@ public class ExecutorGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@@ -96,7 +97,7 @@ public class ExecutorGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@@ -112,7 +113,7 @@ public class ExecutorGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@@ -128,7 +129,7 @@ public class ExecutorGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@@ -203,7 +204,7 @@ public class ExecutorGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@@ -229,7 +230,7 @@ public class ExecutorGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@@ -276,7 +277,7 @@ public class ExecutorGrpcService
}
catch (Exception e) {
logger.error("Failed to update nsfw filters", e);
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
}
@@ -1,6 +1,7 @@
package nu.marginalia.execution;

import com.google.inject.Inject;
import io.grpc.Status;
import io.grpc.stub.StreamObserver;
import nu.marginalia.actor.ExecutorActor;
import nu.marginalia.actor.ExecutorActorControlService;
@@ -33,7 +34,7 @@ public class ExecutorSideloadGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@@ -48,7 +49,7 @@ public class ExecutorSideloadGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@@ -63,7 +64,7 @@ public class ExecutorSideloadGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@@ -78,7 +79,7 @@ public class ExecutorSideloadGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@@ -93,7 +94,7 @@ public class ExecutorSideloadGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -1,4 +1,4 @@
package nu.marginalia.executor;
package nu.marginalia.svc;

import com.google.inject.Inject;
import nu.marginalia.storage.FileStorageService;
@@ -1,5 +1,5 @@
The execution subsystem is responsible for the execution of long running tasks on each
index node. It lives in the [executor-service](../services-core/executor-service) module.
index node. It lives in the [index-service](../services-core/index-service) module.

It accomplishes this using the [message queue and actor library](../libraries/message-queue/),
which permits program state to survive crashes and reboots.
@@ -1,4 +1,4 @@
package nu.marginalia.executor;
package nu.marginalia.svc;

import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;
@@ -2,6 +2,8 @@ package nu.marginalia.api.domains;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.api.domains.model.DomainInformation;
import nu.marginalia.api.domains.model.SimilarDomain;
import nu.marginalia.service.client.GrpcChannelPoolFactory;
import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
import nu.marginalia.service.discovery.property.ServiceKey;
@@ -10,16 +12,19 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.List;
import java.util.concurrent.*;

import nu.marginalia.api.domains.model.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

@Singleton
public class DomainInfoClient {
private static final Logger logger = LoggerFactory.getLogger(DomainInfoClient.class);

private final GrpcSingleNodeChannelPool<DomainInfoAPIGrpc.DomainInfoAPIBlockingStub> channelPool;
private final ExecutorService executor = Executors.newWorkStealingPool(8);

private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
private static final ExecutorService executor = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newWorkStealingPool(8);

@Inject
public DomainInfoClient(GrpcChannelPoolFactory factory) {
@@ -0,0 +1,114 @@
package nu.marginalia.api.domsample;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import io.grpc.Status;
import io.grpc.StatusRuntimeException;
import nu.marginalia.service.client.GrpcChannelPoolFactory;
import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
import nu.marginalia.service.discovery.property.ServiceKey;
import nu.marginalia.service.discovery.property.ServicePartition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.time.Duration;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;

@Singleton
public class DomSampleClient {
private final GrpcSingleNodeChannelPool<DomSampleApiGrpc.DomSampleApiBlockingStub> channelPool;
private static final Logger logger = LoggerFactory.getLogger(DomSampleClient.class);

@Inject
public DomSampleClient(GrpcChannelPoolFactory factory) {

// The client is only interested in the primary node
var key = ServiceKey.forGrpcApi(DomSampleApiGrpc.class, ServicePartition.any());
this.channelPool = factory.createSingle(key, DomSampleApiGrpc::newBlockingStub);
}

public Optional<RpcDomainSample> getSample(String domainName) {
try {
var val = channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::getSample)
.run(RpcDomainName.newBuilder().setDomainName(domainName).build());

return Optional.of(val);
}
catch (StatusRuntimeException sre) {
if (sre.getStatus() != Status.NOT_FOUND) {
logger.error("Failed to fetch DOM sample", sre);
}
return Optional.empty();
}
}

public Optional<RpcDomainSampleRequests> getSampleRequests(String domainName) {
try {
var val = channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::getSampleRequests)
.run(RpcDomainName.newBuilder().setDomainName(domainName).build());

return Optional.of(val);
}
catch (StatusRuntimeException sre) {
if (sre.getStatus() != Status.NOT_FOUND) {
logger.error("Failed to fetch DOM sample", sre);
}
return Optional.empty();
}
}

public boolean hasSample(String domainName) {
try {
return channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::hasSample)
.run(RpcDomainName.newBuilder().setDomainName(domainName).build())
.getAnswer();
}
catch (StatusRuntimeException sre) {
return false;
}
}

public CompletableFuture<Boolean> hasSample(String domainName, ExecutorService executor) {
try {
return channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::hasSample)
.async(executor)
.run(RpcDomainName.newBuilder().setDomainName(domainName).build())
.thenApply(RpcBooleanRsp::getAnswer);
}
catch (StatusRuntimeException sre) {
return CompletableFuture.completedFuture(false);
}
}

public CompletableFuture<RpcDomainSample> getSampleAsync(String domainName, ExecutorService executorService) {
return channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::getSample)
.async(executorService)
.run(RpcDomainName.newBuilder().setDomainName(domainName).build());
}

public List<RpcDomainSample> getAllSamples(String domainName) {
try {
Iterator<RpcDomainSample> val = channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::getAllSamples)
.run(RpcDomainName.newBuilder().setDomainName(domainName).build());

List<RpcDomainSample> ret = new ArrayList<>();
val.forEachRemaining(ret::add);
return ret;
}
catch (StatusRuntimeException sre) {
logger.error("Failed to fetch DOM sample");
return List.of();
}
}

public boolean waitReady(Duration duration) throws InterruptedException {
return channelPool.awaitChannel(duration);
}

}
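A rough usage sketch for the client above, assuming Guice has already constructed it via GrpcChannelPoolFactory and that zstd-jni is on the classpath (the module's build adds libs.zstd later in this diff); the domain name is a placeholder:

```java
import com.github.luben.zstd.ZstdInputStream;

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;

class DomSampleClientUsageSketch {
    static void printSample(DomSampleClient client, String domainName) throws Exception {
        var maybeSample = client.getSample(domainName);
        if (maybeSample.isEmpty()) {
            System.out.println("No DOM sample stored for " + domainName);
            return;
        }
        // htmlSampleZstd carries zstd-compressed UTF-8 HTML (see convertFullSample later in this diff)
        byte[] compressed = maybeSample.get().getHtmlSampleZstd().toByteArray();
        try (var in = new ZstdInputStream(new ByteArrayInputStream(compressed))) {
            String html = new String(in.readAllBytes(), StandardCharsets.UTF_8);
            System.out.println(domainName + ": " + html.length() + " chars of sampled HTML");
        }
    }
}
```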
@@ -24,7 +24,9 @@ import java.util.function.BiConsumer;

@Singleton
public class FeedsClient {
private final ExecutorService executorService = Executors.newCachedThreadPool();
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
private static final ExecutorService executorService = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newCachedThreadPool();

private final GrpcSingleNodeChannelPool<FeedApiGrpc.FeedApiBlockingStub> channelPool;
private final MqOutbox updateFeedsOutbox;
@@ -0,0 +1,47 @@
syntax="proto3";
package nu.marginalia.api.domsample;

option java_package="nu.marginalia.api.domsample";
option java_multiple_files=true;

service DomSampleApi {
rpc getSample(RpcDomainName) returns (RpcDomainSample) {}
rpc getSampleRequests(RpcDomainName) returns (RpcDomainSampleRequests) {}
rpc hasSample(RpcDomainName) returns (RpcBooleanRsp) {}
rpc getAllSamples(RpcDomainName) returns (stream RpcDomainSample) {}
}

message RpcDomainName {
string domainName = 1;
}

message RpcBooleanRsp {
bool answer = 1;
}

message RpcDomainSampleRequests {
string domainName = 1;
string url = 2;
repeated RpcOutgoingRequest outgoingRequests = 5;
}

message RpcDomainSample {
string domainName = 1;
string url = 2;
bytes htmlSampleZstd = 3;
bool accepted_popover = 4;
repeated RpcOutgoingRequest outgoingRequests = 5;
}

message RpcOutgoingRequest {
RequestMethod method = 1;
int64 timestamp = 2;
string url = 3;

enum RequestMethod {
GET = 0;
POST = 1;
OTHER = 2;
};
}
@@ -31,6 +31,7 @@ dependencies {
implementation libs.jsoup
implementation libs.opencsv
implementation libs.slop
implementation libs.zstd
implementation libs.sqlite
implementation libs.bundles.slf4j
implementation libs.commons.lang3
@@ -0,0 +1,176 @@
package nu.marginalia.domsample;

import com.github.luben.zstd.Zstd;
import com.google.inject.Inject;
import com.google.protobuf.ByteString;
import io.grpc.Status;
import io.grpc.stub.StreamObserver;
import nu.marginalia.api.domsample.*;
import nu.marginalia.domsample.db.DomSampleDb;
import nu.marginalia.service.server.DiscoverableService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.charset.StandardCharsets;
import java.util.List;

public class DomSampleGrpcService
extends DomSampleApiGrpc.DomSampleApiImplBase
implements DiscoverableService
{
private static final Logger logger = LoggerFactory.getLogger(DomSampleGrpcService.class);

private final DomSampleDb domSampleDb;

@Inject
public DomSampleGrpcService(DomSampleDb domSampleDb) {
this.domSampleDb = domSampleDb;
}

@Override
public void getSample(RpcDomainName request, StreamObserver<RpcDomainSample> responseObserver) {
String domainName = request.getDomainName();
if (domainName.isBlank()) {
responseObserver.onError(Status.INVALID_ARGUMENT
.withDescription("Invalid domain name")
.asRuntimeException());
return;
}

try {
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
if (dbRecords.isEmpty()) {
responseObserver.onError(Status.NOT_FOUND.withDescription("No sample found").asRuntimeException());
return;
}

// Grab the first sample
RpcDomainSample.Builder response = convertFullSample(dbRecords.getFirst());

responseObserver.onNext(response.build());
responseObserver.onCompleted();
}
catch (Exception e) {
logger.error("Error in getSample()", e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@Override
public void getSampleRequests(RpcDomainName request, StreamObserver<RpcDomainSampleRequests> responseObserver) {
String domainName = request.getDomainName();
if (domainName.isBlank()) {
responseObserver.onError(Status.INVALID_ARGUMENT
.withDescription("Invalid domain name")
.asRuntimeException());
return;
}

try {
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
if (dbRecords.isEmpty()) {
responseObserver.onError(Status.NOT_FOUND.withDescription("No sample found").asRuntimeException());
return;
}

// Grab the first sample
RpcDomainSampleRequests.Builder response = convertRequestData(dbRecords.getFirst());

responseObserver.onNext(response.build());
responseObserver.onCompleted();
}
catch (Exception e) {
logger.error("Error in getSample()", e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@Override
public void hasSample(RpcDomainName request, StreamObserver<RpcBooleanRsp> responseObserver) {
String domainName = request.getDomainName();
if (domainName.isBlank()) {
responseObserver.onError(Status.INVALID_ARGUMENT
.withDescription("Invalid domain name")
.asRuntimeException());
return;
}

try {
responseObserver.onNext(RpcBooleanRsp.newBuilder()
.setAnswer(domSampleDb.hasSample(domainName)).build());
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@Override
public void getAllSamples(RpcDomainName request, StreamObserver<RpcDomainSample> responseObserver) {
String domainName = request.getDomainName();
if (domainName.isBlank()) {
responseObserver.onError(Status.INVALID_ARGUMENT
.withDescription("Invalid domain name")
.asRuntimeException());
return;
}

try {
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);

for (var record : dbRecords) {
responseObserver.onNext(convertFullSample(record).build());
}

responseObserver.onCompleted();
}
catch (Exception e) {
logger.error("Error in getSample()", e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

private RpcDomainSample.Builder convertFullSample(DomSampleDb.Sample dbSample) {

ByteString htmlZstd = ByteString.copyFrom(Zstd.compress(dbSample.sample().getBytes(StandardCharsets.UTF_8)));

var sampleBuilder = RpcDomainSample.newBuilder()
.setDomainName(dbSample.domain())
.setAcceptedPopover(dbSample.acceptedPopover())
.setHtmlSampleZstd(htmlZstd);

for (var req : dbSample.parseRequests()) {
sampleBuilder.addOutgoingRequestsBuilder()
.setUrl(req.uri().toString())
.setMethod(switch (req.method().toUpperCase())
{
case "GET" -> RpcOutgoingRequest.RequestMethod.GET;
case "POST" -> RpcOutgoingRequest.RequestMethod.POST;
default -> RpcOutgoingRequest.RequestMethod.OTHER;
})
.setTimestamp(req.timestamp());
}

return sampleBuilder;
}

private RpcDomainSampleRequests.Builder convertRequestData(DomSampleDb.Sample dbSample) {

var sampleBuilder = RpcDomainSampleRequests.newBuilder()
.setDomainName(dbSample.domain());

for (var req : dbSample.parseRequests()) {
sampleBuilder.addOutgoingRequestsBuilder()
.setUrl(req.uri().toString())
.setMethod(switch (req.method().toUpperCase())
{
case "GET" -> RpcOutgoingRequest.RequestMethod.GET;
case "POST" -> RpcOutgoingRequest.RequestMethod.POST;
default -> RpcOutgoingRequest.RequestMethod.OTHER;
})
.setTimestamp(req.timestamp());
}

return sampleBuilder;
}
}
@@ -1,17 +1,28 @@
package nu.marginalia.domsample.db;

import nu.marginalia.WmsaHome;
import nu.marginalia.model.EdgeUrl;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.*;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.function.Predicate;

public class DomSampleDb implements AutoCloseable {
private static final String dbFileName = "dom-sample.db";
private final Connection connection;
private static final Logger logger = LoggerFactory.getLogger(DomSampleDb.class);

public DomSampleDb() throws SQLException{
this(WmsaHome.getDataPath().resolve(dbFileName));
@@ -88,14 +99,78 @@ public class DomSampleDb implements AutoCloseable {
}

public record Sample(String url, String domain, String sample, String requests, boolean acceptedPopover) {}
public record Sample(String url, String domain, String sample, String requests, boolean acceptedPopover) {

public List<SampleRequest> parseRequests() {
List<SampleRequest> requests = new ArrayList<>();

// Request format is METHOD\tTIMESTAMP\tURI\n

for (var line : StringUtils.split(this.requests, '\n')) {
String[] parts = StringUtils.split(line, "\t", 3);
if (parts.length != 3) continue;

try {
String method = parts[0];
long ts = Long.parseLong(parts[1]);
String linkUrl = parts[2];

URI uri = parseURI(linkUrl);

requests.add(new SampleRequest(method, ts, uri));
}
catch (Exception e) {
logger.warn("Failed to parse requests", e);
}
}

return requests;
}

private static URI parseURI(String uri) throws URISyntaxException {
try {
return new URI(uri);
}
catch (URISyntaxException ex) {
return new EdgeUrl(uri).asURI();
}
}
}

public record SampleRequest(String method, long timestamp, URI uri) {}

/**
* @param consumer - consume the sample, return true to continue consumption
* @throws SQLException
*/
public void forEachSample(Predicate<Sample> consumer) throws SQLException {
try (var stmt = connection.prepareStatement("""
SELECT url, domain, sample, requests, accepted_popover
FROM samples
"""))
{
var rs = stmt.executeQuery();
while (rs.next()) {
var sample = new Sample(
rs.getString("url"),
rs.getString("domain"),
rs.getString("sample"),
rs.getString("requests"),
rs.getBoolean("accepted_popover")
);

if (!consumer.test(sample)) break;
}
}
}

public List<Sample> getSamples(String domain) throws SQLException {
List<Sample> samples = new ArrayList<>();

try (var stmt = connection.prepareStatement("""
SELECT url, sample, requests, accepted_popover
FROM samples
FROM samples
WHERE domain = ?
"""))
{
@@ -116,6 +191,21 @@ public class DomSampleDb implements AutoCloseable {
return samples;
}

public boolean hasSample(String domain) throws SQLException {

try (var stmt = connection.prepareStatement("""
SELECT 1
FROM samples
WHERE domain = ?
"""))
{
stmt.setString(1, domain);
var rs = stmt.executeQuery();
return rs.next();
}
}

public void saveSample(String domain, String url, String rawContent) throws SQLException {
var doc = Jsoup.parse(rawContent);
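The `requests` column parsed by `Sample.parseRequests()` stores one outgoing request per line as method, timestamp and URI separated by tabs, per the comment above. A small sketch with made-up values to show the round trip (the URLs and timestamps are illustrative only):

```java
class SampleRequestFormatSketch {
    public static void main(String[] args) {
        var sample = new DomSampleDb.Sample(
                "https://www.example.com/",                       // url
                "www.example.com",                                // domain
                "<html>...</html>",                               // sampled HTML
                "GET\t1714567890123\thttps://cdn.example.com/app.js\n" +
                "POST\t1714567890456\thttps://tracker.example.com/beacon",
                false);                                           // acceptedPopover

        for (var req : sample.parseRequests()) {
            System.out.println(req.method() + " " + req.uri() + " @ " + req.timestamp());
        }
    }
}
```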
@@ -1,6 +1,7 @@
package nu.marginalia.rss.svc;

import com.google.inject.Inject;
import io.grpc.Status;
import io.grpc.stub.StreamObserver;
import nu.marginalia.api.feeds.*;
import nu.marginalia.db.DbDomainQueries;
@@ -69,7 +70,7 @@ public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements Dis
@Override
public void getFeedDataHash(Empty request, StreamObserver<RpcFeedDataHash> responseObserver) {
if (!feedDb.isEnabled()) {
responseObserver.onError(new IllegalStateException("Feed database is disabled on this node"));
responseObserver.onError(Status.INTERNAL.withDescription("Feed database is disabled on this node").asRuntimeException());
return;
}

@@ -80,7 +81,7 @@ public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements Dis
}
catch (Exception e) {
logger.error("Error getting feed data hash", e);
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@@ -101,7 +102,7 @@ public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements Dis
}
catch (Exception e) {
logger.error("Error getting updated links", e);
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

@@ -109,13 +110,13 @@ public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements Dis
public void getFeed(RpcDomainId request,
StreamObserver<RpcFeed> responseObserver) {
if (!feedDb.isEnabled()) {
responseObserver.onError(new IllegalStateException("Feed database is disabled on this node"));
responseObserver.onError(Status.INTERNAL.withDescription("Feed database is disabled on this node").asRuntimeException());
return;
}

Optional<EdgeDomain> domainName = domainQueries.getDomain(request.getDomainId());
if (domainName.isEmpty()) {
responseObserver.onError(new IllegalArgumentException("Domain not found"));
responseObserver.onError(Status.NOT_FOUND.withDescription("Domain not found").asRuntimeException());
return;
}
@@ -87,7 +87,7 @@ class FeedFetcherServiceTest extends AbstractModule {
bind(DomainCoordinator.class).to(LocalDomainCoordinator.class);
bind(HikariDataSource.class).toInstance(dataSource);
bind(ServiceRegistryIf.class).toInstance(Mockito.mock(ServiceRegistryIf.class));
bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration(ServiceId.Executor, 1, "", "", 0, UUID.randomUUID()));
bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration(ServiceId.Index, 1, "", "", 0, UUID.randomUUID()));
bind(Integer.class).annotatedWith(Names.named("wmsa-system-node")).toInstance(1);
}
@@ -26,7 +26,9 @@ public class MathClient {
private static final Logger logger = LoggerFactory.getLogger(MathClient.class);

private final GrpcSingleNodeChannelPool<MathApiGrpc.MathApiBlockingStub> channelPool;
private final ExecutorService executor = Executors.newWorkStealingPool(8);

private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
private static final ExecutorService executor = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newWorkStealingPool(8);

@Inject
public MathClient(GrpcChannelPoolFactory factory) {
@@ -304,7 +304,6 @@ public class QueryProtobufCodec {
IndexProtobufCodec.convertRpcQuery(specs.getQuery()),
specs.getDomainsList(),
specs.getSearchSetIdentifier(),
specs.getHumanQuery(),
IndexProtobufCodec.convertSpecLimit(specs.getQuality()),
IndexProtobufCodec.convertSpecLimit(specs.getYear()),
IndexProtobufCodec.convertSpecLimit(specs.getSize()),
@@ -18,8 +18,6 @@ public class SearchSpecification {

public String searchSetIdentifier;

public final String humanQuery;

public SpecificationLimit quality;
public SpecificationLimit year;
public SpecificationLimit size;
@@ -35,7 +33,6 @@ public class SearchSpecification {
public SearchSpecification(SearchQuery query,
List<Integer> domains,
String searchSetIdentifier,
String humanQuery,
SpecificationLimit quality,
SpecificationLimit year,
SpecificationLimit size,
@@ -47,7 +44,6 @@ public class SearchSpecification {
this.query = query;
this.domains = domains;
this.searchSetIdentifier = searchSetIdentifier;
this.humanQuery = humanQuery;
this.quality = quality;
this.year = year;
this.size = size;
@@ -73,10 +69,6 @@ public class SearchSpecification {
return this.searchSetIdentifier;
}

public String getHumanQuery() {
return this.humanQuery;
}

public SpecificationLimit getQuality() {
return this.quality;
}
@@ -106,14 +98,13 @@ public class SearchSpecification {
}

public String toString() {
return "SearchSpecification(query=" + this.getQuery() + ", domains=" + this.getDomains() + ", searchSetIdentifier=" + this.getSearchSetIdentifier() + ", humanQuery=" + this.getHumanQuery() + ", quality=" + this.getQuality() + ", year=" + this.getYear() + ", size=" + this.getSize() + ", rank=" + this.getRank() + ", queryLimits=" + this.getQueryLimits() + ", queryStrategy=" + this.getQueryStrategy() + ", rankingParams=" + this.getRankingParams() + ")";
return "SearchSpecification(query=" + this.getQuery() + ", domains=" + this.getDomains() + ", searchSetIdentifier=" + this.getSearchSetIdentifier() + ", quality=" + this.getQuality() + ", year=" + this.getYear() + ", size=" + this.getSize() + ", rank=" + this.getRank() + ", queryLimits=" + this.getQueryLimits() + ", queryStrategy=" + this.getQueryStrategy() + ", rankingParams=" + this.getRankingParams() + ")";
}

public static class SearchSpecificationBuilder {
private SearchQuery query;
private List<Integer> domains;
private String searchSetIdentifier;
private String humanQuery;
private SpecificationLimit quality$value;
private boolean quality$set;
private SpecificationLimit year$value;
@@ -144,11 +135,6 @@ public class SearchSpecification {
return this;
}

public SearchSpecificationBuilder humanQuery(String humanQuery) {
this.humanQuery = humanQuery;
return this;
}

public SearchSpecificationBuilder quality(SpecificationLimit quality) {
this.quality$value = quality;
this.quality$set = true;
@@ -205,11 +191,7 @@ public class SearchSpecification {
if (!this.rank$set) {
rank$value = SpecificationLimit.none();
}
return new SearchSpecification(this.query, this.domains, this.searchSetIdentifier, this.humanQuery, quality$value, year$value, size$value, rank$value, this.queryLimits, this.queryStrategy, this.rankingParams);
}

public String toString() {
return "SearchSpecification.SearchSpecificationBuilder(query=" + this.query + ", domains=" + this.domains + ", searchSetIdentifier=" + this.searchSetIdentifier + ", humanQuery=" + this.humanQuery + ", quality$value=" + this.quality$value + ", year$value=" + this.year$value + ", size$value=" + this.size$value + ", rank$value=" + this.rank$value + ", queryLimits=" + this.queryLimits + ", queryStrategy=" + this.queryStrategy + ", rankingParams=" + this.rankingParams + ")";
return new SearchSpecification(this.query, this.domains, this.searchSetIdentifier, quality$value, year$value, size$value, rank$value, this.queryLimits, this.queryStrategy, this.rankingParams);
}
}
}
@@ -1,56 +0,0 @@
package nu.marginalia.api.searchquery.model.results;

import nu.marginalia.api.searchquery.RpcResultRankingParameters;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;

import java.util.BitSet;

public class ResultRankingContext {
private final int docCount;
public final RpcResultRankingParameters params;

public final BitSet regularMask;
public final BitSet ngramsMask;

/** CqDataInt associated with frequency information of the terms in the query
* in the full index. The dataset is indexed by the compiled query. */
public final CqDataInt fullCounts;

/** CqDataInt associated with frequency information of the terms in the query
* in the full index. The dataset is indexed by the compiled query. */
public final CqDataInt priorityCounts;

public ResultRankingContext(int docCount,
RpcResultRankingParameters params,
BitSet ngramsMask,
BitSet regularMask,
CqDataInt fullCounts,
CqDataInt prioCounts)
{
this.docCount = docCount;
this.params = params;

this.ngramsMask = ngramsMask;
this.regularMask = regularMask;

this.fullCounts = fullCounts;
this.priorityCounts = prioCounts;
}

public int termFreqDocCount() {
return docCount;
}

@Override
public String toString() {
return "ResultRankingContext{" +
"docCount=" + docCount +
", params=" + params +
", regularMask=" + regularMask +
", ngramsMask=" + ngramsMask +
", fullCounts=" + fullCounts +
", priorityCounts=" + priorityCounts +
'}';
}
}
@@ -34,8 +34,6 @@ public class QueryFactory {
this.queryExpansion = queryExpansion;
}

public ProcessedQuery createQuery(QueryParams params,
@Nullable RpcResultRankingParameters rankingParams) {
final var query = params.humanQuery();
@@ -153,7 +151,6 @@ public class QueryFactory {

var specsBuilder = SearchSpecification.builder()
.query(queryBuilder.build())
.humanQuery(query)
.quality(qualityLimit)
.year(year)
.size(size)
@@ -3,6 +3,7 @@ package nu.marginalia.functions.searchquery;
import com.google.common.collect.Lists;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import io.grpc.Status;
import io.grpc.stub.StreamObserver;
import io.prometheus.client.Histogram;
import nu.marginalia.api.searchquery.*;
@@ -93,7 +94,7 @@ public class QueryGRPCService
});
} catch (Exception e) {
logger.error("Exception", e);
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -241,7 +241,6 @@ public class QueryFactoryTest {

Assertions.assertTrue(subquery.query.compiledQuery.contains(" bob "));
Assertions.assertFalse(subquery.query.compiledQuery.contains(" bob's "));
Assertions.assertEquals("\"bob's cars\"", subquery.humanQuery);
}

@Test
@@ -38,7 +38,9 @@ public class IndexClient {
.help("Count of results filtered by NSFW tier")
.register();

private static final ExecutorService executor = Executors.newCachedThreadPool();

private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
private static final ExecutorService executor = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newCachedThreadPool();

@Inject
public IndexClient(GrpcChannelPoolFactory channelPoolFactory,
@@ -14,6 +14,7 @@ apply from: "$rootProject.projectDir/srcsets.gradle"

dependencies {
implementation project(':code:libraries:array')
implementation project(':code:libraries:native')
implementation project(':code:libraries:btree')
implementation project(':code:libraries:coded-sequence')
implementation project(':code:libraries:language-processing')
@@ -1,9 +1,11 @@
|
||||
package nu.marginalia.index.forward;
|
||||
|
||||
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.ffi.LinuxSystemCalls;
|
||||
import nu.marginalia.index.forward.spans.DocumentSpans;
|
||||
import nu.marginalia.index.forward.spans.ForwardIndexSpansReader;
|
||||
import nu.marginalia.index.forward.spans.IndexSpansReader;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@@ -22,16 +24,15 @@ import static nu.marginalia.index.forward.ForwardIndexParameters.*;
|
||||
* and a mapping between document identifiers to the index into the
|
||||
* data array.
|
||||
* <p/>
|
||||
* Since the total data is relatively small, this is kept in memory to
|
||||
* reduce the amount of disk thrashing.
|
||||
* <p/>
|
||||
* The metadata is a binary encoding of {@see nu.marginalia.idx.DocumentMetadata}
|
||||
*/
|
||||
public class ForwardIndexReader {
|
||||
private final LongArray ids;
|
||||
private final LongArray data;
|
||||
|
||||
private final ForwardIndexSpansReader spansReader;
|
||||
private volatile Long2IntOpenHashMap idsMap;
|
||||
|
||||
private final IndexSpansReader spansReader;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@@ -64,7 +65,22 @@ public class ForwardIndexReader {

ids = loadIds(idsFile);
data = loadData(dataFile);
spansReader = new ForwardIndexSpansReader(spansFile);

LinuxSystemCalls.madviseRandom(data.getMemorySegment());
LinuxSystemCalls.madviseRandom(ids.getMemorySegment());

spansReader = IndexSpansReader.open(spansFile);

Thread.ofPlatform().start(this::createIdsMap);
}

private void createIdsMap() {
Long2IntOpenHashMap idsMap = new Long2IntOpenHashMap((int) ids.size());
for (int i = 0; i < ids.size(); i++) {
idsMap.put(ids.get(i), i);
}
this.idsMap = idsMap;
logger.info("Forward index loaded into RAM");
}

private static LongArray loadIds(Path idsFile) throws IOException {
@@ -106,7 +122,11 @@ public class ForwardIndexReader {
private int idxForDoc(long docId) {
assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";

long offset = ids.binarySearch(docId, 0, ids.size());
if (idsMap != null) {
return idsMap.getOrDefault(docId, -1);
}

long offset = ids.binarySearch2(docId, 0, ids.size());

if (offset >= ids.size() || offset < 0 || ids.get(offset) != docId) {
if (getClass().desiredAssertionStatus()) {
@@ -118,22 +138,27 @@ public class ForwardIndexReader {
return (int) offset;
}

public DocumentSpans getDocumentSpans(Arena arena, long docId) {
long offset = idxForDoc(docId);
if (offset < 0) return new DocumentSpans();

long encodedOffset = data.get(ENTRY_SIZE * offset + SPANS_OFFSET);
public DocumentSpans[] getDocumentSpans(Arena arena, long[] docIds) {
long[] offsets = new long[docIds.length];
for (int i = 0; i < docIds.length; i++) {
long offset = idxForDoc(docIds[i]);
if (offset >= 0) {
offsets[i] = data.get(ENTRY_SIZE * offset + SPANS_OFFSET);
}
else {
offsets[i] = -1;
}
}

try {
return spansReader.readSpans(arena, encodedOffset);
return spansReader.readSpans(arena, offsets);
}
catch (IOException ex) {
logger.error("Failed to read spans for doc " + docId, ex);
return new DocumentSpans();
logger.error("Failed to read spans for docIds", ex);
return new DocumentSpans[docIds.length];
}
}

public int totalDocCount() {
return (int) ids.size();
}
@@ -141,6 +166,8 @@ public class ForwardIndexReader {
public void close() {
if (data != null)
data.close();
if (ids != null)
ids.close();
}

public boolean isLoaded() {
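With this change the reader resolves every span offset up front and issues one batched readSpans call instead of one read per document. A hypothetical caller (variable and helper names below are illustrative, not from the patch) would use it roughly like so:

    // fetch spans for a whole batch of result documents in a single call
    try (Arena arena = Arena.ofConfined()) {
        long[] docIds = collectResultDocIds();  // assumed helper
        DocumentSpans[] spans = forwardIndexReader.getDocumentSpans(arena, docIds);
        // entries for documents that could not be resolved may be left null
    }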
@@ -5,7 +5,7 @@ import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.index.domainrankings.DomainRankings;
|
||||
import nu.marginalia.index.forward.ForwardIndexParameters;
|
||||
import nu.marginalia.index.forward.spans.ForwardIndexSpansWriter;
|
||||
import nu.marginalia.index.forward.spans.IndexSpansWriter;
|
||||
import nu.marginalia.index.journal.IndexJournal;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import nu.marginalia.model.idx.DocumentMetadata;
|
||||
@@ -65,7 +65,7 @@ public class ForwardIndexConverter {
|
||||
logger.info("Domain Rankings size = {}", domainRankings.size());
|
||||
|
||||
try (var progress = heartbeat.createProcessTaskHeartbeat(TaskSteps.class, "forwardIndexConverter");
|
||||
var spansWriter = new ForwardIndexSpansWriter(outputFileSpansData)
|
||||
var spansWriter = new IndexSpansWriter(outputFileSpansData)
|
||||
) {
|
||||
progress.progress(TaskSteps.GET_DOC_IDS);
|
||||
|
||||
|
@@ -11,6 +11,9 @@ public class DocumentSpan {
|
||||
/** A list of the interlaced start and end positions of each span in the document of this type */
|
||||
private final IntList startsEnds;
|
||||
|
||||
public DocumentSpan(IntList startsEnds) {
|
||||
this.startsEnds = startsEnds;
|
||||
}
|
||||
public DocumentSpan(CodedSequence startsEnds) {
|
||||
this.startsEnds = startsEnds.values();
|
||||
}
|
||||
|
@@ -1,5 +1,6 @@
|
||||
package nu.marginalia.index.forward.spans;
|
||||
|
||||
import it.unimi.dsi.fastutil.ints.IntList;
|
||||
import nu.marginalia.language.sentence.tag.HtmlTag;
|
||||
import nu.marginalia.sequence.CodedSequence;
|
||||
|
||||
@@ -39,6 +40,23 @@ public class DocumentSpans {
|
||||
return EMPTY_SPAN;
|
||||
}
|
||||
|
||||
void accept(byte code, IntList positions) {
|
||||
if (code == HtmlTag.HEADING.code)
|
||||
this.heading = new DocumentSpan(positions);
|
||||
else if (code == HtmlTag.TITLE.code)
|
||||
this.title = new DocumentSpan(positions);
|
||||
else if (code == HtmlTag.NAV.code)
|
||||
this.nav = new DocumentSpan(positions);
|
||||
else if (code == HtmlTag.CODE.code)
|
||||
this.code = new DocumentSpan(positions);
|
||||
else if (code == HtmlTag.ANCHOR.code)
|
||||
this.anchor = new DocumentSpan(positions);
|
||||
else if (code == HtmlTag.EXTERNAL_LINKTEXT.code)
|
||||
this.externalLinkText = new DocumentSpan(positions);
|
||||
else if (code == HtmlTag.BODY.code)
|
||||
this.body = new DocumentSpan(positions);
|
||||
}
|
||||
|
||||
void accept(byte code, CodedSequence positions) {
|
||||
if (code == HtmlTag.HEADING.code)
|
||||
this.heading = new DocumentSpan(positions);
|
||||
|
@@ -0,0 +1,25 @@
package nu.marginalia.index.forward.spans;

import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.file.Path;

public interface IndexSpansReader extends AutoCloseable {
DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException;
DocumentSpans[] readSpans(Arena arena, long[] encodedOffsets) throws IOException;

static IndexSpansReader open(Path fileName) throws IOException {
int version = SpansCodec.parseSpanFilesFooter(fileName);
if (version == SpansCodec.SpansCodecVersion.COMPRESSED.ordinal()) {
return new IndexSpansReaderCompressed(fileName);
}
else if (version == SpansCodec.SpansCodecVersion.PLAIN.ordinal()) {
return new IndexSpansReaderPlain(fileName);
}
else {
throw new IllegalArgumentException("Unsupported spans file version: " + version);
}
}

void close() throws IOException;
}
@@ -10,11 +10,11 @@ import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
|
||||
@SuppressWarnings("preview")
|
||||
public class ForwardIndexSpansReader implements AutoCloseable {
|
||||
@Deprecated
|
||||
public class IndexSpansReaderCompressed implements AutoCloseable, IndexSpansReader {
|
||||
private final FileChannel spansFileChannel;
|
||||
|
||||
public ForwardIndexSpansReader(Path spansFile) throws IOException {
|
||||
public IndexSpansReaderCompressed(Path spansFile) throws IOException {
|
||||
this.spansFileChannel = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ);
|
||||
}
|
||||
|
||||
@@ -51,6 +51,17 @@ public class ForwardIndexSpansReader implements AutoCloseable {
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocumentSpans[] readSpans(Arena arena, long[] encodedOffsets) throws IOException {
|
||||
DocumentSpans[] ret = new DocumentSpans[encodedOffsets.length];
|
||||
for (int i = 0; i < encodedOffsets.length; i++) {
|
||||
if (encodedOffsets[i] >= 0) {
|
||||
ret[i] = readSpans(arena, encodedOffsets[i]);
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
spansFileChannel.close();
|
@@ -0,0 +1,106 @@
|
||||
package nu.marginalia.index.forward.spans;
|
||||
|
||||
import it.unimi.dsi.fastutil.ints.IntArrayList;
|
||||
import nu.marginalia.uring.UringFileReader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.lang.foreign.Arena;
|
||||
import java.lang.foreign.MemorySegment;
|
||||
import java.lang.foreign.ValueLayout;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
public class IndexSpansReaderPlain implements IndexSpansReader {
|
||||
private final UringFileReader uringReader;
|
||||
|
||||
public IndexSpansReaderPlain(Path spansFile) throws IOException {
|
||||
if (Boolean.getBoolean("index.directModePositionsSpans")) {
|
||||
if ((Files.size(spansFile) & 4095) != 0) {
|
||||
throw new IllegalArgumentException("Spans file is not block aligned in size: " + Files.size(spansFile));
|
||||
}
|
||||
|
||||
uringReader = new UringFileReader(spansFile, true);
|
||||
}
|
||||
else {
|
||||
uringReader = new UringFileReader(spansFile, false);
|
||||
uringReader.fadviseWillneed();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException {
|
||||
// for testing, slow
|
||||
return readSpans(arena, new long[] { encodedOffset})[0];
|
||||
}
|
||||
|
||||
public DocumentSpans decode(MemorySegment ms) {
|
||||
int count = ms.get(ValueLayout.JAVA_INT, 0);
|
||||
int pos = 4;
|
||||
DocumentSpans ret = new DocumentSpans();
|
||||
|
||||
// Decode each span
|
||||
for (int spanIdx = 0; spanIdx < count; spanIdx++) {
|
||||
byte code = ms.get(ValueLayout.JAVA_BYTE, pos);
|
||||
short len = ms.get(ValueLayout.JAVA_SHORT, pos+2);
|
||||
|
||||
IntArrayList values = new IntArrayList(len);
|
||||
|
||||
pos += 4;
|
||||
for (int i = 0; i < len; i++) {
|
||||
values.add(ms.get(ValueLayout.JAVA_INT, pos + 4*i));
|
||||
}
|
||||
ret.accept(code, values);
|
||||
pos += 4*len;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocumentSpans[] readSpans(Arena arena, long[] encodedOffsets) {
|
||||
|
||||
int readCnt = 0;
|
||||
for (long offset : encodedOffsets) {
|
||||
if (offset < 0)
|
||||
continue;
|
||||
readCnt ++;
|
||||
}
|
||||
|
||||
if (readCnt == 0) {
|
||||
return new DocumentSpans[encodedOffsets.length];
|
||||
}
|
||||
|
||||
long[] offsets = new long[readCnt];
|
||||
int[] sizes = new int[readCnt];
|
||||
|
||||
for (int idx = 0, j = 0; idx < encodedOffsets.length; idx++) {
|
||||
if (encodedOffsets[idx] < 0)
|
||||
continue;
|
||||
long offset = encodedOffsets[idx];
|
||||
|
||||
offsets[j] = SpansCodec.decodeStartOffset(offset);
|
||||
sizes[j] = SpansCodec.decodeSize(offset);
|
||||
j++;
|
||||
}
|
||||
|
||||
List<MemorySegment> buffers = uringReader.readUnaligned(arena, offsets, sizes, 4096);
|
||||
|
||||
DocumentSpans[] ret = new DocumentSpans[encodedOffsets.length];
|
||||
|
||||
for (int idx = 0, j = 0; idx < encodedOffsets.length; idx++) {
|
||||
if (encodedOffsets[idx] < 0)
|
||||
continue;
|
||||
ret[idx] = decode(buffers.get(j++));
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
uringReader.close();
|
||||
}
|
||||
|
||||
}
|
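A detail worth noting about IndexSpansReaderPlain above: when the index.directModePositionsSpans property is set it reads the spans file in direct mode, and therefore rejects files whose size is not a multiple of 4096 bytes. The padded footer that IndexSpansWriter appends in the next hunk is what keeps the file 4 KiB-aligned. The guard, restated as a compact sketch of the check in the constructor above:

    // direct I/O needs block-aligned file sizes; the writer pads its footer to guarantee this
    if (Boolean.getBoolean("index.directModePositionsSpans")
            && (Files.size(spansFile) & 4095) != 0) {
        throw new IllegalArgumentException("Spans file is not block aligned in size: " + Files.size(spansFile));
    }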
@@ -1,20 +1,23 @@
|
||||
package nu.marginalia.index.forward.spans;
|
||||
|
||||
import nu.marginalia.sequence.VarintCodedSequence;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
|
||||
public class ForwardIndexSpansWriter implements AutoCloseable {
|
||||
public class IndexSpansWriter implements AutoCloseable {
|
||||
private final FileChannel outputChannel;
|
||||
private final ByteBuffer work = ByteBuffer.allocate(32);
|
||||
private final ByteBuffer work = ByteBuffer.allocate(4*1024*1024).order(ByteOrder.nativeOrder());
|
||||
|
||||
private long stateStartOffset = -1;
|
||||
private int stateLength = -1;
|
||||
|
||||
public ForwardIndexSpansWriter(Path outputFileSpansData) throws IOException {
|
||||
public IndexSpansWriter(Path outputFileSpansData) throws IOException {
|
||||
this.outputChannel = (FileChannel) Files.newByteChannel(outputFileSpansData, StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE);
|
||||
}
|
||||
|
||||
@@ -23,7 +26,7 @@ public class ForwardIndexSpansWriter implements AutoCloseable {
|
||||
stateLength = 0;
|
||||
|
||||
work.clear();
|
||||
work.put((byte) count);
|
||||
work.putInt(count);
|
||||
work.flip();
|
||||
|
||||
while (work.hasRemaining())
|
||||
@@ -33,12 +36,17 @@ public class ForwardIndexSpansWriter implements AutoCloseable {
|
||||
public void writeSpan(byte spanCode, ByteBuffer sequenceData) throws IOException {
|
||||
work.clear();
|
||||
work.put(spanCode);
|
||||
work.putShort((short) sequenceData.remaining());
|
||||
work.put((byte) 0); // Ensure we're byte aligned
|
||||
var sequence = new VarintCodedSequence(sequenceData);
|
||||
work.putShort((short) sequence.valueCount());
|
||||
|
||||
var iter = sequence.iterator();
|
||||
while (iter.hasNext()) {
|
||||
work.putInt(iter.nextInt());
|
||||
}
|
||||
work.flip();
|
||||
|
||||
while (work.hasRemaining() || sequenceData.hasRemaining()) {
|
||||
stateLength += (int) outputChannel.write(new ByteBuffer[]{work, sequenceData});
|
||||
}
|
||||
stateLength += outputChannel.write(work);
|
||||
}
|
||||
|
||||
public long endRecord() {
|
||||
@@ -47,6 +55,11 @@ public class ForwardIndexSpansWriter implements AutoCloseable {
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
ByteBuffer footer = SpansCodec.createSpanFilesFooter(SpansCodec.SpansCodecVersion.PLAIN, (int) (4096 - (outputChannel.position() & 4095)));
|
||||
outputChannel.position(outputChannel.size());
|
||||
while (footer.hasRemaining()) {
|
||||
outputChannel.write(footer, outputChannel.size());
|
||||
}
|
||||
outputChannel.close();
|
||||
}
|
||||
}
|
@@ -1,6 +1,21 @@
package nu.marginalia.index.forward.spans;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

public class SpansCodec {
public static int MAGIC_INT = 0xF000F000;
public static int FOOTER_SIZE = 8;

public enum SpansCodecVersion {
@Deprecated
COMPRESSED,
PLAIN
}

public static long encode(long startOffset, long size) {
assert size < 0x1000_0000L : "Size must be less than 2^28";
@@ -11,7 +26,39 @@ public class SpansCodec {
return encoded >>> 28;
}

public static long decodeSize(long encoded) {
return encoded & 0x0FFF_FFFFL;
public static int decodeSize(long encoded) {
return (int) (encoded & 0x0FFF_FFFFL);
}

public static ByteBuffer createSpanFilesFooter(SpansCodecVersion version, int padSize) {
if (padSize < FOOTER_SIZE) {
padSize += 4096;
}

ByteBuffer footer = ByteBuffer.allocate(padSize);
footer.position(padSize - FOOTER_SIZE);
footer.putInt(SpansCodec.MAGIC_INT);
footer.put((byte) version.ordinal());
footer.put((byte) 0);
footer.put((byte) 0);
footer.put((byte) 0);
footer.flip();
return footer;
}

public static int parseSpanFilesFooter(Path spansFile) throws IOException {
ByteBuffer buffer = ByteBuffer.allocate(FOOTER_SIZE);

try (var fc = FileChannel.open(spansFile, StandardOpenOption.READ)) {
if (fc.size() < FOOTER_SIZE) return 0;
fc.read(buffer, fc.size() - buffer.capacity());
buffer.flip();
int magic = buffer.getInt();
if (magic != MAGIC_INT) {
return 0;
}
return buffer.get();
}

}
}
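SpansCodec packs a record's start offset and byte size into a single long: the size sits in the low 28 bits (hence the assert that size must be less than 2^28) and the start offset in the bits above, which is why decodeStartOffset shifts right by 28 and decodeSize masks with 0x0FFF_FFFF. The body of encode is not shown in the hunk, but given the decoders a worked round trip looks like this (values chosen for illustration only):

    long start = 8192;                        // byte offset of the record in the spans file
    long size  = 300;                         // record length in bytes, must be < 2^28
    long encoded = (start << 28) | size;      // what encode(start, size) presumably produces
    assert (encoded >>> 28) == start;         // decodeStartOffset(encoded)
    assert (encoded & 0x0FFF_FFFFL) == size;  // decodeSize(encoded)

The footer helpers serve the version dispatch in IndexSpansReader.open: the last eight bytes of a spans file hold MAGIC_INT plus a version byte, padded so the file ends on a 4 KiB boundary.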
@@ -1,8 +1,9 @@
|
||||
package nu.marginalia.index.forward;
|
||||
|
||||
import it.unimi.dsi.fastutil.ints.IntList;
|
||||
import nu.marginalia.index.forward.spans.ForwardIndexSpansReader;
|
||||
import nu.marginalia.index.forward.spans.ForwardIndexSpansWriter;
|
||||
import nu.marginalia.index.forward.spans.IndexSpansReader;
|
||||
import nu.marginalia.index.forward.spans.IndexSpansReaderPlain;
|
||||
import nu.marginalia.index.forward.spans.IndexSpansWriter;
|
||||
import nu.marginalia.language.sentence.tag.HtmlTag;
|
||||
import nu.marginalia.sequence.VarintCodedSequence;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
@@ -17,10 +18,10 @@ import java.nio.file.Path;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
class ForwardIndexSpansReaderTest {
|
||||
class IndexSpansReaderTest {
|
||||
Path testFile = Files.createTempFile("test", ".idx");
|
||||
|
||||
ForwardIndexSpansReaderTest() throws IOException {
|
||||
IndexSpansReaderTest() throws IOException {
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
@@ -34,7 +35,7 @@ class ForwardIndexSpansReaderTest {
|
||||
|
||||
long offset1;
|
||||
long offset2;
|
||||
try (var writer = new ForwardIndexSpansWriter(testFile)) {
|
||||
try (var writer = new IndexSpansWriter(testFile)) {
|
||||
writer.beginRecord(1);
|
||||
writer.writeSpan(HtmlTag.HEADING.code, VarintCodedSequence.generate(1, 3, 5, 8).buffer());
|
||||
offset1 = writer.endRecord();
|
||||
@@ -46,7 +47,7 @@ class ForwardIndexSpansReaderTest {
|
||||
offset2 = writer.endRecord();
|
||||
}
|
||||
|
||||
try (var reader = new ForwardIndexSpansReader(testFile);
|
||||
try (var reader = IndexSpansReader.open(testFile);
|
||||
var arena = Arena.ofConfined()
|
||||
) {
|
||||
var spans1 = reader.readSpans(arena, offset1);
|
||||
@@ -77,13 +78,13 @@ class ForwardIndexSpansReaderTest {
|
||||
@Test
|
||||
void testContainsRange() throws IOException {
|
||||
long offset1;
|
||||
try (var writer = new ForwardIndexSpansWriter(testFile)) {
|
||||
try (var writer = new IndexSpansWriter(testFile)) {
|
||||
writer.beginRecord(1);
|
||||
writer.writeSpan(HtmlTag.HEADING.code, VarintCodedSequence.generate( 1, 2, 10, 15, 20, 25).buffer());
|
||||
offset1 = writer.endRecord();
|
||||
}
|
||||
|
||||
try (var reader = new ForwardIndexSpansReader(testFile);
|
||||
try (var reader = new IndexSpansReaderPlain(testFile);
|
||||
var arena = Arena.ofConfined()
|
||||
) {
|
||||
var spans1 = reader.readSpans(arena, offset1);
|
||||
@@ -104,13 +105,13 @@ class ForwardIndexSpansReaderTest {
|
||||
@Test
|
||||
void testContainsRangeExact() throws IOException {
|
||||
long offset1;
|
||||
try (var writer = new ForwardIndexSpansWriter(testFile)) {
|
||||
try (var writer = new IndexSpansWriter(testFile)) {
|
||||
writer.beginRecord(1);
|
||||
writer.writeSpan(HtmlTag.HEADING.code, VarintCodedSequence.generate( 1, 2, 10, 15, 20, 25).buffer());
|
||||
offset1 = writer.endRecord();
|
||||
}
|
||||
|
||||
try (var reader = new ForwardIndexSpansReader(testFile);
|
||||
try (var reader = new IndexSpansReaderPlain(testFile);
|
||||
var arena = Arena.ofConfined()
|
||||
) {
|
||||
var spans1 = reader.readSpans(arena, offset1);
|
||||
@@ -131,13 +132,13 @@ class ForwardIndexSpansReaderTest {
|
||||
@Test
|
||||
void testCountRangeMatches() throws IOException {
|
||||
long offset1;
|
||||
try (var writer = new ForwardIndexSpansWriter(testFile)) {
|
||||
try (var writer = new IndexSpansWriter(testFile)) {
|
||||
writer.beginRecord(1);
|
||||
writer.writeSpan(HtmlTag.HEADING.code, VarintCodedSequence.generate( 1, 2, 10, 15, 20, 25).buffer());
|
||||
offset1 = writer.endRecord();
|
||||
}
|
||||
|
||||
try (var reader = new ForwardIndexSpansReader(testFile);
|
||||
try (var reader = new IndexSpansReaderPlain(testFile);
|
||||
var arena = Arena.ofConfined()
|
||||
) {
|
||||
var spans1 = reader.readSpans(arena, offset1);
|
54
code/index/index-perftest/build.gradle
Normal file
@@ -0,0 +1,54 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||
}
|
||||
}
|
||||
|
||||
application {
|
||||
mainClass = 'nu.marginalia.index.perftest.PerfTestMain'
|
||||
}
|
||||
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:db')
|
||||
|
||||
implementation project(':code:libraries:array')
|
||||
implementation project(':code:libraries:native')
|
||||
implementation project(':code:libraries:btree')
|
||||
implementation project(':code:libraries:term-frequency-dict')
|
||||
implementation project(':code:common:linkdb')
|
||||
implementation project(':code:index')
|
||||
implementation project(':code:index:query')
|
||||
implementation project(':code:index:index-forward')
|
||||
implementation project(':code:index:index-reverse')
|
||||
implementation project(':third-party:commons-codec')
|
||||
implementation project(':code:functions:search-query')
|
||||
implementation project(':code:functions:search-query:api')
|
||||
|
||||
implementation libs.slop
|
||||
implementation libs.roaringbitmap
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.guava
|
||||
|
||||
libs.bundles.grpc.get().each {
|
||||
implementation dependencies.create(it) {
|
||||
exclude group: 'com.google.guava'
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
implementation libs.notnull
|
||||
implementation libs.trove
|
||||
implementation libs.fastutil
|
||||
implementation libs.bundles.gson
|
||||
implementation libs.bundles.mariadb
|
||||
|
||||
}
|
@@ -0,0 +1,262 @@
|
||||
package nu.marginalia.index.perftest;
|
||||
|
||||
import nu.marginalia.ffi.LinuxSystemCalls;
|
||||
import nu.marginalia.uring.UringFileReader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.lang.foreign.Arena;
|
||||
import java.lang.foreign.MemorySegment;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
import java.util.stream.LongStream;
|
||||
|
||||
public class IoPatternsMain {
|
||||
|
||||
static void testBuffered(int sz, int small, int large, int iters) {
|
||||
try {
|
||||
Path largeFile = Path.of("/home/vlofgren/largefile.dat");
|
||||
long fileSize = Files.size(largeFile);
|
||||
|
||||
Random r = new Random();
|
||||
List<MemorySegment> segments = new ArrayList<>();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
if (small == large) {
|
||||
segments.add(Arena.ofAuto().allocate(small));
|
||||
}
|
||||
else {
|
||||
segments.add(Arena.ofAuto().allocate(r.nextInt(small, large)));
|
||||
}
|
||||
}
|
||||
List<Long> offsets = new ArrayList<>();
|
||||
|
||||
long[] samples = new long[1000];
|
||||
int si = 0;
|
||||
|
||||
try (UringFileReader reader = new UringFileReader(largeFile, false)) {
|
||||
for (int iter = 0; iter < iters; ) {
|
||||
if (si == samples.length) {
|
||||
Arrays.sort(samples);
|
||||
double p1 = samples[10] / 1_000.;
|
||||
double p10 = samples[100] / 1_000.;
|
||||
double p90 = samples[900] / 1_000.;
|
||||
double p99 = samples[990] / 1_000.;
|
||||
double avg = LongStream.of(samples).average().getAsDouble() / 1000.;
|
||||
System.out.println("B"+"\t"+avg+"\t"+p1 + " " + p10 + " " + p90 + " " + p99);
|
||||
si = 0;
|
||||
iter++;
|
||||
}
|
||||
|
||||
offsets.clear();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
offsets.add(r.nextLong(0, fileSize - 256));
|
||||
}
|
||||
|
||||
long st = System.nanoTime();
|
||||
reader.read(segments, offsets);
|
||||
long et = System.nanoTime();
|
||||
|
||||
samples[si++] = et - st;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
static void testBufferedPread(int sz, int iters) {
|
||||
try {
|
||||
Path largeFile = Path.of("/home/vlofgren/largefile.dat");
|
||||
long fileSize = Files.size(largeFile);
|
||||
|
||||
Random r = new Random();
|
||||
List<MemorySegment> segments = new ArrayList<>();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
segments.add(Arena.ofAuto().allocate(r.nextInt(24, 256)));
|
||||
}
|
||||
List<Long> offsets = new ArrayList<>();
|
||||
|
||||
long[] samples = new long[1000];
|
||||
int si = 0;
|
||||
|
||||
int fd = -1;
|
||||
try {
|
||||
fd = LinuxSystemCalls.openBuffered(largeFile);
|
||||
LinuxSystemCalls.fadviseRandom(fd);
|
||||
|
||||
for (int iter = 0; iter < iters; ) {
|
||||
if (si == samples.length) {
|
||||
Arrays.sort(samples);
|
||||
double p1 = samples[10] / 1_000.;
|
||||
double p10 = samples[100] / 1_000.;
|
||||
double p90 = samples[900] / 1_000.;
|
||||
double p99 = samples[990] / 1_000.;
|
||||
double avg = LongStream.of(samples).average().getAsDouble() / 1000.;
|
||||
System.out.println("BP"+"\t"+avg+"\t"+p1 + " " + p10 + " " + p90 + " " + p99);
|
||||
si = 0;
|
||||
iter++;
|
||||
}
|
||||
|
||||
offsets.clear();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
offsets.add(r.nextLong(0, fileSize - 256));
|
||||
}
|
||||
|
||||
long st = System.nanoTime();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
LinuxSystemCalls.readAt(fd, segments.get(i), offsets.get(i));
|
||||
}
|
||||
long et = System.nanoTime();
|
||||
|
||||
samples[si++] = et - st;
|
||||
}
|
||||
}
|
||||
finally {
|
||||
LinuxSystemCalls.closeFd(fd);
|
||||
}
|
||||
}
|
||||
catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void testDirect(int blockSize, int sz, int iters) {
|
||||
try {
|
||||
Path largeFile = Path.of("/home/vlofgren/largefile.dat");
|
||||
int fileSizeBlocks = (int) ((Files.size(largeFile) & -blockSize) / blockSize);
|
||||
|
||||
Random r = new Random();
|
||||
List<MemorySegment> segments = new ArrayList<>();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
segments.add(Arena.ofAuto().allocate(blockSize, blockSize));
|
||||
}
|
||||
List<Long> offsets = new ArrayList<>();
|
||||
|
||||
long[] samples = new long[1000];
|
||||
int si = 0;
|
||||
|
||||
try (UringFileReader reader = new UringFileReader(largeFile, true)) {
|
||||
for (int iter = 0; iter < iters; ) {
|
||||
if (si == samples.length) {
|
||||
Arrays.sort(samples);
|
||||
double p1 = samples[10] / 1_000.;
|
||||
double p10 = samples[100] / 1_000.;
|
||||
double p90 = samples[900] / 1_000.;
|
||||
double p99 = samples[990] / 1_000.;
|
||||
double avg = LongStream.of(samples).average().getAsDouble() / 1000.;
|
||||
System.out.println("DN"+blockSize+"\t"+avg+"\t"+p1 + " " + p10 + " " + p90 + " " + p99);
|
||||
si = 0;
|
||||
iters++;
|
||||
}
|
||||
|
||||
offsets.clear();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
offsets.add(blockSize * r.nextLong(0, fileSizeBlocks));
|
||||
}
|
||||
|
||||
long st = System.nanoTime();
|
||||
reader.read(segments, offsets);
|
||||
long et = System.nanoTime();
|
||||
|
||||
samples[si++] = et - st;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void testDirect1(int blockSize, int iters) {
|
||||
try {
|
||||
Path largeFile = Path.of("/home/vlofgren/largefile.dat");
|
||||
int fileSizeBlocks = (int) ((Files.size(largeFile) & -blockSize) / blockSize);
|
||||
|
||||
Random r = new Random();
|
||||
MemorySegment segment = Arena.global().allocate(blockSize, blockSize);
|
||||
|
||||
long[] samples = new long[1000];
|
||||
int si = 0;
|
||||
|
||||
int fd = LinuxSystemCalls.openDirect(largeFile);
|
||||
if (fd < 0) {
|
||||
throw new IOException("open failed");
|
||||
}
|
||||
try {
|
||||
for (int iter = 0; iter < iters; ) {
|
||||
if (si == samples.length) {
|
||||
Arrays.sort(samples);
|
||||
double p1 = samples[10] / 1_000.;
|
||||
double p10 = samples[100] / 1_000.;
|
||||
double p90 = samples[900] / 1_000.;
|
||||
double p99 = samples[990] / 1_000.;
|
||||
double avg = LongStream.of(samples).average().getAsDouble() / 1000.;
|
||||
System.out.println("D1"+blockSize+"\t"+avg+"\t"+p1 + " " + p10 + " " + p90 + " " + p99);
|
||||
si = 0;
|
||||
iters++;
|
||||
}
|
||||
|
||||
|
||||
long st = System.nanoTime();
|
||||
int ret;
|
||||
long readOffset = blockSize * r.nextLong(0, fileSizeBlocks);
|
||||
if (blockSize != (ret = LinuxSystemCalls.readAt(fd, segment, readOffset))) {
|
||||
throw new IOException("pread failed: " + ret);
|
||||
}
|
||||
long et = System.nanoTime();
|
||||
|
||||
samples[si++] = et - st;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
finally {
|
||||
LinuxSystemCalls.closeFd(fd);
|
||||
}
|
||||
}
|
||||
catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
// Thread.ofPlatform().start(() -> testBuffered(128, 32, 65536,1000));
|
||||
Thread.ofPlatform().start(() -> testDirect(8192*4, 128,1000));
|
||||
// Thread.ofPlatform().start(() -> testBuffered(128, 1000));
|
||||
// Thread.ofPlatform().start(() -> testBuffered(128, 1000));
|
||||
// Thread.ofPlatform().start(() -> testBuffered(128, 1000));
|
||||
// Thread.ofPlatform().start(() -> testBufferedPread(128, 1000));
|
||||
|
||||
// Thread.ofPlatform().start(() -> testDirect1(1024, 1000));
|
||||
// Thread.ofPlatform().start(() -> testDirect1(1024, 1000));
|
||||
// Thread.ofPlatform().start(() -> testDirect1(1024, 1000));
|
||||
// Thread.ofPlatform().start(() -> testDirect1(1024*1024, 1000));
|
||||
// Thread.ofPlatform().start(() -> testDirect1(1024*1024, 1000));
|
||||
// Thread.ofPlatform().start(() -> testDirect(512, 512,1000));
|
||||
// Thread.ofPlatform().start(() -> testDirect(512, 512,1000));
|
||||
// Thread.ofPlatform().start(() -> testDirect(512, 512,1000));
|
||||
// Thread.ofPlatform().start(() -> testDirect(512, 100));
|
||||
// Thread.ofPlatform().start(() -> testDirect(512, 100));
|
||||
// Thread.ofPlatform().start(() -> testDirect(512, 100));
|
||||
// Thread.ofPlatform().start(() -> testDirect(512, 100));
|
||||
// Thread.ofPlatform().start(() -> testBuffered(512, 1000));
|
||||
// Thread.ofPlatform().start(() -> testBuffered(512, 1000));
|
||||
// Thread.ofPlatform().start(() -> testBuffered(512, 1000));
|
||||
// Thread.ofPlatform().start(() -> testBuffered(512, 1000));
|
||||
// Thread.ofPlatform().start(() -> testBuffered(100));
|
||||
// Thread.ofPlatform().start(() -> testBuffered(100));
|
||||
|
||||
for (;;);
|
||||
// testBuffered(100);
|
||||
}
|
||||
}
|
@@ -0,0 +1,313 @@
|
||||
package nu.marginalia.index.perftest;
|
||||
|
||||
import gnu.trove.list.array.TLongArrayList;
|
||||
import nu.marginalia.api.searchquery.RpcQueryLimits;
|
||||
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
|
||||
import nu.marginalia.array.page.LongQueryBuffer;
|
||||
import nu.marginalia.functions.searchquery.QueryFactory;
|
||||
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
|
||||
import nu.marginalia.index.FullReverseIndexReader;
|
||||
import nu.marginalia.index.IndexQueryExecution;
|
||||
import nu.marginalia.index.PrioReverseIndexReader;
|
||||
import nu.marginalia.index.forward.ForwardIndexReader;
|
||||
import nu.marginalia.index.index.CombinedIndexReader;
|
||||
import nu.marginalia.index.index.StatefulIndex;
|
||||
import nu.marginalia.index.model.ResultRankingContext;
|
||||
import nu.marginalia.index.model.SearchParameters;
|
||||
import nu.marginalia.index.model.SearchTerms;
|
||||
import nu.marginalia.index.positions.PositionsFileReader;
|
||||
import nu.marginalia.index.query.IndexQuery;
|
||||
import nu.marginalia.index.query.IndexSearchBudget;
|
||||
import nu.marginalia.index.results.DomainRankingOverrides;
|
||||
import nu.marginalia.index.results.IndexResultRankingService;
|
||||
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
||||
import nu.marginalia.index.searchset.SearchSetAny;
|
||||
import nu.marginalia.linkdb.docs.DocumentDbReader;
|
||||
import nu.marginalia.segmentation.NgramLexicon;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.sql.SQLException;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
|
||||
public class PerfTestMain {
|
||||
static Duration warmupTime = Duration.ofMinutes(1);
|
||||
static Duration runTime = Duration.ofMinutes(10);
|
||||
|
||||
public static void main(String[] args) {
|
||||
if (args.length != 4) {
|
||||
System.err.println("Arguments: home-dir index-dir query");
|
||||
System.exit(255);
|
||||
}
|
||||
|
||||
try {
|
||||
Path indexDir = Paths.get(args[0]);
|
||||
if (!Files.isDirectory(indexDir)) {
|
||||
System.err.println("Index directory is not a directory");
|
||||
System.exit(255);
|
||||
}
|
||||
Path homeDir = Paths.get(args[1]);
|
||||
String scenario = args[2];
|
||||
String query = args[3];
|
||||
|
||||
switch (scenario) {
|
||||
case "valuation" -> runValuation(indexDir, homeDir, query);
|
||||
case "lookup" -> runLookup(indexDir, homeDir, query);
|
||||
case "execution" -> runExecution(indexDir, homeDir, query);
|
||||
}
|
||||
|
||||
System.exit(0);
|
||||
}
|
||||
catch (NumberFormatException e) {
|
||||
System.err.println("Arguments: data-dir index-dir query");
|
||||
System.exit(255);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
System.err.println("Error during testing");
|
||||
ex.printStackTrace();
|
||||
System.exit(255);
|
||||
}
|
||||
System.out.println(Arrays.toString(args));
|
||||
}
|
||||
|
||||
private static CombinedIndexReader createCombinedIndexReader(Path indexDir) throws IOException {
|
||||
|
||||
return new CombinedIndexReader(
|
||||
new ForwardIndexReader(
|
||||
indexDir.resolve("ir/fwd-doc-id.dat"),
|
||||
indexDir.resolve("ir/fwd-doc-data.dat"),
|
||||
indexDir.resolve("ir/fwd-spans.dat")
|
||||
),
|
||||
new FullReverseIndexReader(
|
||||
"full",
|
||||
indexDir.resolve("ir/rev-words.dat"),
|
||||
indexDir.resolve("ir/rev-docs.dat"),
|
||||
new PositionsFileReader(indexDir.resolve("ir/rev-positions.dat"))
|
||||
),
|
||||
new PrioReverseIndexReader(
|
||||
"prio",
|
||||
indexDir.resolve("ir/rev-prio-words.dat"),
|
||||
indexDir.resolve("ir/rev-prio-docs.dat")
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
private static IndexResultRankingService createIndexResultRankingService(Path indexDir, CombinedIndexReader combinedIndexReader) throws IOException, SQLException {
|
||||
return new IndexResultRankingService(
|
||||
new DocumentDbReader(indexDir.resolve("ldbr/documents.db")),
|
||||
new StatefulIndex(combinedIndexReader),
|
||||
new DomainRankingOverrides(null, Path.of("xxxx"))
|
||||
);
|
||||
}
|
||||
|
||||
static QueryFactory createQueryFactory(Path homeDir) throws IOException {
|
||||
return new QueryFactory(
|
||||
new QueryExpansion(
|
||||
new TermFrequencyDict(homeDir.resolve("model/tfreq-new-algo3.bin")),
|
||||
new NgramLexicon()
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
public static void runValuation(Path homeDir,
|
||||
Path indexDir,
|
||||
String rawQuery) throws IOException, SQLException, TimeoutException {
|
||||
|
||||
CombinedIndexReader indexReader = createCombinedIndexReader(indexDir);
|
||||
QueryFactory queryFactory = createQueryFactory(homeDir);
|
||||
IndexResultRankingService rankingService = createIndexResultRankingService(indexDir, indexReader);
|
||||
|
||||
var queryLimits = RpcQueryLimits.newBuilder()
|
||||
.setTimeoutMs(10_000)
|
||||
.setResultsTotal(1000)
|
||||
.setResultsByDomain(10)
|
||||
.setFetchSize(4096)
|
||||
.build();
|
||||
SearchSpecification parsedQuery = queryFactory.createQuery(new QueryParams(rawQuery, queryLimits, "NONE", NsfwFilterTier.OFF), PrototypeRankingParameters.sensibleDefaults()).specs;
|
||||
|
||||
System.out.println("Query compiled to: " + parsedQuery.query.compiledQuery);
|
||||
|
||||
SearchParameters searchParameters = new SearchParameters(parsedQuery, new SearchSetAny());
|
||||
|
||||
List<IndexQuery> queries = indexReader.createQueries(new SearchTerms(searchParameters.query, searchParameters.compiledQueryIds), searchParameters.queryParams, new IndexSearchBudget(10_000));
|
||||
|
||||
TLongArrayList allResults = new TLongArrayList();
|
||||
LongQueryBuffer buffer = new LongQueryBuffer(512);
|
||||
|
||||
for (var query : queries) {
|
||||
while (query.hasMore() && allResults.size() < 512 ) {
|
||||
query.getMoreResults(buffer);
|
||||
allResults.addAll(buffer.copyData());
|
||||
}
|
||||
if (allResults.size() >= 512)
|
||||
break;
|
||||
}
|
||||
allResults.sort();
|
||||
if (allResults.size() > 512) {
|
||||
allResults.subList(512, allResults.size()).clear();
|
||||
}
|
||||
|
||||
var rankingContext = ResultRankingContext.create(indexReader, searchParameters);
|
||||
var rankingData = rankingService.prepareRankingData(rankingContext, new CombinedDocIdList(allResults.toArray()), null);
|
||||
|
||||
int sum = 0;
|
||||
|
||||
Instant runEndTime = Instant.now().plus(runTime);
|
||||
Instant runStartTime = Instant.now();
|
||||
int sum2 = 0;
|
||||
List<Double> times = new ArrayList<>();
|
||||
|
||||
int iter;
|
||||
for (iter = 0;; iter++) {
|
||||
IndexSearchBudget budget = new IndexSearchBudget(10000);
|
||||
long start = System.nanoTime();
|
||||
sum2 += rankingService.rankResults(budget, rankingContext, rankingData, false).size();
|
||||
long end = System.nanoTime();
|
||||
times.add((end - start)/1_000_000.);
|
||||
|
||||
if ((iter % 100) == 0) {
|
||||
if (Instant.now().isAfter(runEndTime)) {
|
||||
break;
|
||||
}
|
||||
if (times.size() > 100) {
|
||||
double[] timesSample = times.stream().mapToDouble(Double::doubleValue).skip(times.size() - 100).sorted().toArray();
|
||||
System.out.format("P1: %f P10: %f, P90: %f, P99: %f\n", timesSample[1], timesSample[10], timesSample[90], timesSample[99]);
|
||||
}
|
||||
System.out.println(Duration.between(runStartTime, Instant.now()).toMillis() / 1000. + " best times: " + (allResults.size() / 512.) * times.stream().mapToDouble(Double::doubleValue).sorted().limit(3).average().orElse(-1));
|
||||
}
|
||||
}
|
||||
System.out.println("Benchmark complete after " + iter + " iters!");
|
||||
|
||||
System.out.println("Best times: " + (allResults.size() / 512.) * times.stream().mapToDouble(Double::doubleValue).sorted().limit(3).average().orElse(-1));
|
||||
System.out.println("Warmup sum: " + sum);
|
||||
System.out.println("Main sum: " + sum2);
|
||||
System.out.println(rankingData.size());
|
||||
}
|
||||
|
||||
public static void runExecution(Path homeDir,
|
||||
Path indexDir,
|
||||
String rawQuery) throws IOException, SQLException, InterruptedException {
|
||||
|
||||
CombinedIndexReader indexReader = createCombinedIndexReader(indexDir);
|
||||
QueryFactory queryFactory = createQueryFactory(homeDir);
|
||||
IndexResultRankingService rankingService = createIndexResultRankingService(indexDir, indexReader);
|
||||
|
||||
var queryLimits = RpcQueryLimits.newBuilder()
|
||||
.setTimeoutMs(50)
|
||||
.setResultsTotal(1000)
|
||||
.setResultsByDomain(10)
|
||||
.setFetchSize(4096)
|
||||
.build();
|
||||
SearchSpecification parsedQuery = queryFactory.createQuery(new QueryParams(rawQuery, queryLimits, "NONE", NsfwFilterTier.OFF), PrototypeRankingParameters.sensibleDefaults()).specs;
|
||||
System.out.println("Query compiled to: " + parsedQuery.query.compiledQuery);
|
||||
|
||||
System.out.println("Running warmup loop!");
|
||||
int sum = 0;
|
||||
|
||||
Instant runEndTime = Instant.now().plus(runTime);
|
||||
Instant runStartTime = Instant.now();
|
||||
int sum2 = 0;
|
||||
List<Double> rates = new ArrayList<>();
|
||||
List<Double> times = new ArrayList<>();
|
||||
int iter;
|
||||
for (iter = 0;; iter++) {
|
||||
SearchParameters searchParameters = new SearchParameters(parsedQuery, new SearchSetAny());
|
||||
var execution = new IndexQueryExecution(searchParameters, rankingService, indexReader);
|
||||
long start = System.nanoTime();
|
||||
execution.run();
|
||||
long end = System.nanoTime();
|
||||
sum2 += execution.itemsProcessed();
|
||||
rates.add(execution.itemsProcessed() / ((end - start)/1_000_000_000.));
|
||||
times.add((end - start)/1_000_000.);
|
||||
indexReader.reset();
|
||||
if ((iter % 100) == 0) {
|
||||
if (Instant.now().isAfter(runEndTime)) {
|
||||
break;
|
||||
}
|
||||
if (times.size() > 100) {
|
||||
double[] timesSample = times.stream().mapToDouble(Double::doubleValue).skip(times.size() - 100).sorted().toArray();
|
||||
System.out.format("P1: %f P10: %f, P90: %f, P99: %f\n", timesSample[1], timesSample[10], timesSample[90], timesSample[99]);
|
||||
}
|
||||
System.out.println(Duration.between(runStartTime, Instant.now()).toMillis() / 1000. + " best rates: " + rates.stream().mapToDouble(Double::doubleValue).map(i -> -i).sorted().map(i -> -i).limit(3).average().orElse(-1));
|
||||
}
|
||||
}
|
||||
|
||||
System.out.println("Benchmark complete after " + iter + " iters!");
|
||||
System.out.println("Best counts: " + rates.stream().mapToDouble(Double::doubleValue).map(i -> -i).sorted().map(i -> -i).limit(3).average().orElse(-1));
|
||||
System.out.println("Warmup sum: " + sum);
|
||||
System.out.println("Main sum: " + sum2);
|
||||
}
|
||||
|
||||
public static void runLookup(Path homeDir,
|
||||
Path indexDir,
|
||||
String rawQuery) throws IOException, SQLException
|
||||
{
|
||||
|
||||
CombinedIndexReader indexReader = createCombinedIndexReader(indexDir);
|
||||
QueryFactory queryFactory = createQueryFactory(homeDir);
|
||||
|
||||
var queryLimits = RpcQueryLimits.newBuilder()
|
||||
.setTimeoutMs(10_000)
|
||||
.setResultsTotal(1000)
|
||||
.setResultsByDomain(10)
|
||||
.setFetchSize(4096)
|
||||
.build();
|
||||
SearchSpecification parsedQuery = queryFactory.createQuery(new QueryParams(rawQuery, queryLimits, "NONE", NsfwFilterTier.OFF), PrototypeRankingParameters.sensibleDefaults()).specs;
|
||||
|
||||
System.out.println("Query compiled to: " + parsedQuery.query.compiledQuery);
|
||||
|
||||
SearchParameters searchParameters = new SearchParameters(parsedQuery, new SearchSetAny());
|
||||
|
||||
|
||||
Instant runEndTime = Instant.now().plus(runTime);
|
||||
|
||||
LongQueryBuffer buffer = new LongQueryBuffer(512);
|
||||
int sum1 = 0;
|
||||
int iter;
|
||||
|
||||
Instant runStartTime = Instant.now();
|
||||
int sum2 = 0;
|
||||
List<Double> times = new ArrayList<>();
|
||||
for (iter = 0;; iter++) {
|
||||
indexReader.reset();
|
||||
List<IndexQuery> queries = indexReader.createQueries(new SearchTerms(searchParameters.query, searchParameters.compiledQueryIds), searchParameters.queryParams, new IndexSearchBudget(150));
|
||||
|
||||
long start = System.nanoTime();
|
||||
for (var query : queries) {
|
||||
while (query.hasMore()) {
|
||||
query.getMoreResults(buffer);
|
||||
sum1 += buffer.end;
|
||||
buffer.reset();
|
||||
}
|
||||
}
|
||||
long end = System.nanoTime();
|
||||
times.add((end - start)/1_000_000_000.);
|
||||
|
||||
if ((iter % 10) == 0) {
|
||||
if (Instant.now().isAfter(runEndTime)) {
|
||||
break;
|
||||
}
|
||||
if (times.size() > 100) {
|
||||
double[] timesSample = times.stream().mapToDouble(Double::doubleValue).skip(times.size() - 100).sorted().toArray();
|
||||
System.out.format("P1: %f P10: %f, P90: %f, P99: %f\n", timesSample[1], timesSample[10], timesSample[90], timesSample[99]);
|
||||
}
|
||||
System.out.println(Duration.between(runStartTime, Instant.now()).toMillis() / 1000. + " best times: " + times.stream().mapToDouble(Double::doubleValue).sorted().limit(3).average().orElse(-1));
|
||||
}
|
||||
}
|
||||
System.out.println("Benchmark complete after " + iter + " iters!");
|
||||
System.out.println("Best times: " + times.stream().mapToDouble(Double::doubleValue).sorted().limit(3).average().orElse(-1));
|
||||
System.out.println("Warmup sum: " + sum1);
|
||||
System.out.println("Main sum: " + sum2);
|
||||
}
|
||||
}
|
@@ -15,6 +15,7 @@ apply from: "$rootProject.projectDir/srcsets.gradle"

dependencies {
implementation project(':code:libraries:array')
implementation project(':code:libraries:native')
implementation project(':code:libraries:btree')
implementation project(':code:libraries:coded-sequence')
implementation project(':code:libraries:random-write-funnel')
@@ -1,32 +1,26 @@
|
||||
package nu.marginalia.index;
|
||||
|
||||
import nu.marginalia.array.page.LongQueryBuffer;
|
||||
import nu.marginalia.btree.BTreeReader;
|
||||
import nu.marginalia.index.query.EntrySource;
|
||||
|
||||
import static java.lang.Math.min;
|
||||
import nu.marginalia.skiplist.SkipListReader;
|
||||
|
||||
public class FullIndexEntrySource implements EntrySource {
|
||||
private final String name;
|
||||
private final BTreeReader reader;
|
||||
|
||||
int pos;
|
||||
int endOffset;
|
||||
|
||||
final int entrySize;
|
||||
private final SkipListReader reader;
|
||||
private final long wordId;
|
||||
|
||||
public FullIndexEntrySource(String name,
|
||||
BTreeReader reader,
|
||||
int entrySize,
|
||||
SkipListReader reader,
|
||||
long wordId) {
|
||||
this.name = name;
|
||||
this.reader = reader;
|
||||
this.entrySize = entrySize;
|
||||
this.wordId = wordId;
|
||||
|
||||
pos = 0;
|
||||
endOffset = pos + entrySize * reader.numEntries();
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -36,32 +30,14 @@ public class FullIndexEntrySource implements EntrySource {
|
||||
|
||||
@Override
|
||||
public void read(LongQueryBuffer buffer) {
|
||||
buffer.reset();
|
||||
buffer.end = min(buffer.end, endOffset - pos);
|
||||
reader.readData(buffer.data, buffer.end, pos);
|
||||
pos += buffer.end;
|
||||
|
||||
destagger(buffer);
|
||||
buffer.uniq();
|
||||
}
|
||||
|
||||
private void destagger(LongQueryBuffer buffer) {
|
||||
if (entrySize == 1)
|
||||
return;
|
||||
|
||||
for (int ri = entrySize, wi=1; ri < buffer.end ; ri+=entrySize, wi++) {
|
||||
buffer.data.set(wi, buffer.data.get(ri));
|
||||
}
|
||||
|
||||
buffer.end /= entrySize;
|
||||
reader.getData(buffer);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasMore() {
|
||||
return pos < endOffset;
|
||||
return !reader.atEnd();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String indexName() {
|
||||
return name + ":" + Long.toHexString(wordId);
|
||||
|
@@ -2,16 +2,17 @@ package nu.marginalia.index;
|
||||
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.array.pool.BufferPool;
|
||||
import nu.marginalia.btree.BTreeReader;
|
||||
import nu.marginalia.index.positions.TermData;
|
||||
import nu.marginalia.ffi.LinuxSystemCalls;
|
||||
import nu.marginalia.index.positions.PositionsFileReader;
|
||||
import nu.marginalia.index.query.EmptyEntrySource;
|
||||
import nu.marginalia.index.query.EntrySource;
|
||||
import nu.marginalia.index.query.ReverseIndexRejectFilter;
|
||||
import nu.marginalia.index.query.ReverseIndexRetainFilter;
|
||||
import nu.marginalia.index.positions.TermData;
|
||||
import nu.marginalia.index.query.*;
|
||||
import nu.marginalia.index.query.filter.QueryFilterLetThrough;
|
||||
import nu.marginalia.index.query.filter.QueryFilterNoPass;
|
||||
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
||||
import nu.marginalia.skiplist.SkipListConstants;
|
||||
import nu.marginalia.skiplist.SkipListReader;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@@ -20,10 +21,12 @@ import java.lang.foreign.Arena;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
public class FullReverseIndexReader {
|
||||
private final LongArray words;
|
||||
private final LongArray documents;
|
||||
|
||||
private final long wordsDataOffset;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final BTreeReader wordsBTreeReader;
|
||||
@@ -31,6 +34,8 @@ public class FullReverseIndexReader {
|
||||
|
||||
private final PositionsFileReader positionsFileReader;
|
||||
|
||||
private final BufferPool dataPool;
|
||||
|
||||
public FullReverseIndexReader(String name,
|
||||
Path words,
|
||||
Path documents,
|
||||
@@ -44,6 +49,7 @@ public class FullReverseIndexReader {
|
||||
this.documents = null;
|
||||
this.wordsBTreeReader = null;
|
||||
this.wordsDataOffset = -1;
|
||||
this.dataPool = null;
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -52,6 +58,11 @@ public class FullReverseIndexReader {
|
||||
this.words = LongArrayFactory.mmapForReadingShared(words);
|
||||
this.documents = LongArrayFactory.mmapForReadingShared(documents);
|
||||
|
||||
LinuxSystemCalls.madviseRandom(this.words.getMemorySegment());
|
||||
LinuxSystemCalls.madviseRandom(this.documents.getMemorySegment());
|
||||
|
||||
dataPool = new BufferPool(documents, SkipListConstants.BLOCK_SIZE, (int) (Long.getLong("index.bufferPoolSize", 512*1024*1024L) / SkipListConstants.BLOCK_SIZE));
|
||||
|
||||
wordsBTreeReader = new BTreeReader(this.words, ReverseIndexParameters.wordsBTreeContext, 0);
|
||||
wordsDataOffset = wordsBTreeReader.getHeader().dataOffsetLongs();
|
||||
|
||||
@@ -62,6 +73,11 @@ public class FullReverseIndexReader {
|
||||
}
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
dataPool.reset();
|
||||
}
|
||||
|
||||
|
||||
private void selfTest() {
|
||||
logger.info("Running self test program");
|
||||
|
||||
@@ -76,6 +92,15 @@ public class FullReverseIndexReader {
|
||||
ReverseIndexSelfTest.runSelfTest6(wordsDataRange, documents);
|
||||
}
|
||||
|
||||
public void eachDocRange(Consumer<LongArray> eachDocRange) {
|
||||
long wordsDataSize = wordsBTreeReader.getHeader().numEntries() * 2L;
|
||||
var wordsDataRange = words.range(wordsDataOffset, wordsDataOffset + wordsDataSize);
|
||||
|
||||
for (long i = 1; i < wordsDataRange.size(); i+=2) {
|
||||
var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i));
|
||||
eachDocRange.accept(docsBTreeReader.data());
|
||||
}
|
||||
}
|
||||
|
||||
/** Calculate the offset of the word in the documents.
|
||||
* If the return-value is negative, the term does not exist
|
||||
@@ -101,27 +126,27 @@ public class FullReverseIndexReader {
|
||||
if (offset < 0) // No documents
|
||||
return new EmptyEntrySource();
|
||||
|
||||
return new FullIndexEntrySource(name, createReaderNew(offset), 2, termId);
|
||||
return new FullIndexEntrySource(name, getReader(offset), termId);
|
||||
}
|
||||
|
||||
/** Create a filter step requiring the specified termId to exist in the documents */
|
||||
public QueryFilterStepIf also(long termId) {
|
||||
public QueryFilterStepIf also(long termId, IndexSearchBudget budget) {
|
||||
long offset = wordOffset(termId);
|
||||
|
||||
if (offset < 0) // No documents
|
||||
return new QueryFilterNoPass();
|
||||
|
||||
return new ReverseIndexRetainFilter(createReaderNew(offset), name, termId);
|
||||
return new ReverseIndexRetainFilter(getReader(offset), name, termId, budget);
|
||||
}
|
||||
|
||||
/** Create a filter step requiring the specified termId to be absent from the documents */
|
||||
public QueryFilterStepIf not(long termId) {
|
||||
public QueryFilterStepIf not(long termId, IndexSearchBudget budget) {
|
||||
long offset = wordOffset(termId);
|
||||
|
||||
if (offset < 0) // No documents
|
||||
return new QueryFilterLetThrough();
|
||||
|
||||
return new ReverseIndexRejectFilter(createReaderNew(offset));
|
||||
return new ReverseIndexRejectFilter(getReader(offset), budget);
|
||||
}
|
||||
|
||||
/** Return the number of documents with the termId in the index */
|
||||
@@ -131,15 +156,39 @@ public class FullReverseIndexReader {
|
||||
if (offset < 0)
|
||||
return 0;
|
||||
|
||||
return createReaderNew(offset).numEntries();
|
||||
return getReader(offset).estimateSize();
|
||||
}
|
||||
|
||||
/** Create a BTreeReader for the document offset associated with a termId */
|
||||
private BTreeReader createReaderNew(long offset) {
|
||||
return new BTreeReader(
|
||||
documents,
|
||||
ReverseIndexParameters.fullDocsBTreeContext,
|
||||
offset);
|
||||
private SkipListReader getReader(long offset) {
|
||||
return new SkipListReader(dataPool, offset);
|
||||
}
|
||||
|
||||
public TermData[] getTermData(Arena arena,
|
||||
long[] termIds,
|
||||
long[] docIds)
|
||||
{
|
||||
|
||||
long[] offsetsAll = new long[termIds.length * docIds.length];
|
||||
|
||||
for (int i = 0; i < termIds.length; i++) {
|
||||
long termId = termIds[i];
|
||||
long offset = wordOffset(termId);
|
||||
|
||||
if (offset < 0) {
|
||||
// This is likely a bug in the code, but we can't throw an exception here
|
||||
logger.debug("Missing offset for word {}", termId);
|
||||
continue;
|
||||
}
|
||||
|
||||
var reader = getReader(offset);
|
||||
|
||||
// Read the size and offset of the position data
|
||||
var offsetsForTerm = reader.getValueOffsets(docIds);
|
||||
System.arraycopy(offsetsForTerm, 0, offsetsAll, i * docIds.length, docIds.length);
|
||||
}
|
||||
|
||||
return positionsFileReader.getTermData(arena, offsetsAll);
|
||||
}
|
||||
|
||||
public TermData[] getTermData(Arena arena,
|
||||
@@ -156,20 +205,22 @@ public class FullReverseIndexReader {
|
||||
return ret;
|
||||
}
|
||||
|
||||
var reader = createReaderNew(offset);
|
||||
var reader = getReader(offset);
|
||||
|
||||
// Read the size and offset of the position data
|
||||
var offsets = reader.queryData(docIds, 1);
|
||||
var offsets = reader.getValueOffsets(docIds);
|
||||
|
||||
for (int i = 0; i < docIds.length; i++) {
|
||||
if (offsets[i] == 0)
|
||||
continue;
|
||||
ret[i] = positionsFileReader.getTermData(arena, offsets[i]);
|
||||
}
|
||||
return ret;
|
||||
return positionsFileReader.getTermData(arena, offsets);
|
||||
}
|
||||
|
||||
public void close() {
|
||||
try {
|
||||
dataPool.close();
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.warn("Error while closing bufferPool", e);
|
||||
}
|
||||
|
||||
if (documents != null)
|
||||
documents.close();
|
||||
|
||||
|
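The batched getTermData(Arena, long[] termIds, long[] docIds) added above flattens its offsets term-major (System.arraycopy writes term i's offsets starting at i * docIds.length), so the returned TermData array appears to follow the same layout. A hedged sketch of how a caller might index into it (names are illustrative):

    TermData[] data = reader.getTermData(arena, termIds, docIds);
    // entry for the pair (termIds[i], docIds[j]), assuming the flattened term-major layout above
    TermData td = data[i * docIds.length + j];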
@@ -13,7 +13,7 @@ import java.nio.channels.FileChannel;
|
||||
public class PrioIndexEntrySource implements EntrySource {
|
||||
private final String name;
|
||||
|
||||
private final ByteBuffer readData = ByteBuffer.allocate(1024);
|
||||
private final ByteBuffer readData = ByteBuffer.allocate(8*1024);
|
||||
private final BitReader bitReader = new BitReader(readData, this::fillReadBuffer);
|
||||
|
||||
private final FileChannel docsFileChannel;
|
||||
|
@@ -3,6 +3,7 @@ package nu.marginalia.index;
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.btree.BTreeReader;
|
||||
import nu.marginalia.ffi.LinuxSystemCalls;
|
||||
import nu.marginalia.index.query.EmptyEntrySource;
|
||||
import nu.marginalia.index.query.EntrySource;
|
||||
import org.slf4j.Logger;
|
||||
@@ -40,6 +41,8 @@ public class PrioReverseIndexReader {
|
||||
|
||||
this.words = LongArrayFactory.mmapForReadingShared(words);
|
||||
|
||||
LinuxSystemCalls.madviseRandom(this.words.getMemorySegment());
|
||||
|
||||
wordsBTreeReader = new BTreeReader(this.words, ReverseIndexParameters.wordsBTreeContext, 0);
|
||||
wordsDataOffset = wordsBTreeReader.getHeader().dataOffsetLongs();
|
||||
|
||||
|
@@ -5,7 +5,7 @@ import nu.marginalia.btree.model.BTreeContext;

public class ReverseIndexParameters
{
public static final BTreeContext prioDocsBTreeContext = new BTreeContext(5, 1, BTreeBlockSize.BS_2048);
public static final BTreeContext fullDocsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048);
public static final BTreeContext wordsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048);
public static final BTreeContext prioDocsBTreeContext = new BTreeContext(5, 1, BTreeBlockSize.BS_512);
public static final BTreeContext fullDocsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_512);
public static final BTreeContext wordsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_512);
}
@@ -14,62 +14,103 @@ import java.nio.file.StandardOpenOption;
 *
 * The positions data is concatenated in the file, with each term's metadata
 * followed by its positions. The metadata is a single byte, and the positions
 * are encoded using the Elias Gamma code, with zero padded bits at the end to
 * get octet alignment.
 *
 * are encoded as varints.
 * <p></p>
 *
 * It is the responsibility of the caller to keep track of the byte offset of
 * each posting in the file.
 */
public class PositionsFileConstructor implements AutoCloseable {
    private final ByteBuffer workBuffer = ByteBuffer.allocate(65536);

    private final Path file;
    private final FileChannel channel;

    private long offset;

    public PositionsFileConstructor(Path file) throws IOException {
        this.file = file;

        channel = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE);
    }
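/* Illustrative layout note (not part of the diff): per the class comment above, each posting
 * written by this constructor occupies 1 + positionsBuffer.remaining() bytes on disk,
 *
 *   [ 1 byte term metadata ][ varint-coded positions ... ]
 *
 * and the file itself stores no offsets. The caller keeps the encoded offset returned by
 * add() (see PositionCodec below) and later hands it back to PositionsFileReader.getTermData(...)
 * to recover a TermData exposing flags() and positions().
 */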
/** Represents a block of positions lists. Each writer thread should hold on to
 * a block object to ensure the locality of its positions lists.
 * When finished, commit() must be run.
 */
public class PositionsFileBlock {
    private final ByteBuffer workBuffer = ByteBuffer.allocate(1024*1024*16);
    private long position;

    public PositionsFileBlock(long position) {
        this.position = position;
    }

    public boolean fitsData(int size) {
        return workBuffer.remaining() >= size;
    }

    public void commit() throws IOException {
        workBuffer.position(0);
        workBuffer.limit(workBuffer.capacity());
        int pos = 0;
        while (workBuffer.hasRemaining()) {
            pos += channel.write(workBuffer, this.position + pos + workBuffer.position());
        }
    }

    private void relocate() throws IOException {
        workBuffer.clear();
        position = channel.position();
        while (workBuffer.hasRemaining()) {
            channel.write(workBuffer);
        }
        workBuffer.clear();
    }

    public long position() {
        return this.position + workBuffer.position();
    }
    public void put(byte b) {
        workBuffer.put(b);
    }
    public void put(ByteBuffer buffer) {
        workBuffer.put(buffer);
    }
}

public PositionsFileBlock getBlock() throws IOException {
    synchronized (this) {
        var block = new PositionsFileBlock(channel.position());
        block.relocate();
        return block;
    }
}

/** Add a term to the positions file
 *
 * @param block a block token to ensure data locality
 * @param termMeta the term metadata
 * @param positionsBuffer the positions of the term
 *
 * @return the offset of the term in the file, with the size of the data in the highest byte
 */
public long add(byte termMeta, ByteBuffer positionsBuffer) throws IOException {
    synchronized (file) {
        int size = 1 + positionsBuffer.remaining();
public long add(PositionsFileBlock block, byte termMeta, ByteBuffer positionsBuffer) throws IOException {
    int size = 1 + positionsBuffer.remaining();

    if (workBuffer.remaining() < size) {
        workBuffer.flip();
        channel.write(workBuffer);
        workBuffer.clear();
    if (!block.fitsData(size)) {
        synchronized (this) {
            block.commit();
            block.relocate();
        }
    }
    synchronized (file) {
        long offset = block.position();

        workBuffer.put(termMeta);
        workBuffer.put(positionsBuffer);
        block.put(termMeta);
        block.put(positionsBuffer);

        long ret = PositionCodec.encode(size, offset);

        offset += size;

        return ret;
        return PositionCodec.encode(size, offset);
    }
}

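/* Illustrative usage sketch (not part of the diff) of the new block-based API, mirroring what
 * PositionsFileReaderTest does further down in this change set. All class and method names
 * below appear elsewhere in the diff; only the local variable names are made up.
 *
 *   try (var constructor = new PositionsFileConstructor(positionsFile)) {
 *       var block = constructor.getBlock();                    // one block per writer thread
 *       long key = constructor.add(block, (byte) 43,
 *               VarintCodedSequence.generate(1, 2, 3).buffer());
 *       block.commit();                                        // flush the block to the channel
 *
 *       // The returned key packs both pieces of information the reader needs:
 *       int  size   = PositionCodec.decodeSize(key);           // byte count, from the high byte
 *       long offset = PositionCodec.decodeOffset(key);         // byte offset into the file
 *   }
 */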
public void close() throws IOException {
    if (workBuffer.hasRemaining()) {
        workBuffer.flip();

        while (workBuffer.hasRemaining())
            channel.write(workBuffer);
    }

    channel.force(false);
    channel.close();
}

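/* Note (not part of the diff): later in this change set, PositionsFileReader refuses to open the
 * positions file in direct I/O mode ("index.directModePositionsFile") unless its size is a
 * multiple of 4096. A hypothetical way to guarantee that from the writer side would be to pad
 * the file in close(), e.g.:
 *
 *   long tail = channel.size() & 4095;
 *   if (tail != 0) {
 *       channel.write(ByteBuffer.allocate(4096 - (int) tail), channel.size());
 *   }
 *
 * Whether the actual code relies on padding here or on some other alignment guarantee is not
 * shown in this excerpt.
 */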
@@ -1,46 +0,0 @@
|
||||
package nu.marginalia.index.construction.full;
|
||||
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.array.algo.LongArrayTransformations;
|
||||
import nu.marginalia.btree.BTreeWriter;
|
||||
import nu.marginalia.btree.model.BTreeContext;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/** Constructs the BTrees in a reverse index */
|
||||
public class FullIndexBTreeTransformer implements LongArrayTransformations.LongIOTransformer {
|
||||
private final BTreeWriter writer;
|
||||
private final int entrySize;
|
||||
private final LongArray documentsArray;
|
||||
|
||||
long start = 0;
|
||||
long writeOffset = 0;
|
||||
|
||||
public FullIndexBTreeTransformer(LongArray urlsFileMap,
|
||||
int entrySize,
|
||||
BTreeContext bTreeContext,
|
||||
LongArray documentsArray) {
|
||||
this.documentsArray = documentsArray;
|
||||
this.writer = new BTreeWriter(urlsFileMap, bTreeContext);
|
||||
this.entrySize = entrySize;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long transform(long pos, long end) throws IOException {
|
||||
|
||||
final int size = (int) ((end - start) / entrySize);
|
||||
|
||||
if (size == 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
final long offsetForBlock = writeOffset;
|
||||
|
||||
writeOffset += writer.write(writeOffset, size,
|
||||
mapRegion -> mapRegion.transferFrom(documentsArray, start, 0, end - start)
|
||||
);
|
||||
|
||||
start = end;
|
||||
return offsetForBlock;
|
||||
}
|
||||
}
|
@@ -0,0 +1,40 @@
|
||||
package nu.marginalia.index.construction.full;
|
||||
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.array.algo.LongArrayTransformations;
|
||||
import nu.marginalia.skiplist.SkipListWriter;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
|
||||
/** Constructs the skip lists in a reverse index */
|
||||
public class FullIndexSkipListTransformer implements LongArrayTransformations.LongIOTransformer, AutoCloseable {
|
||||
private final SkipListWriter writer;
|
||||
private final LongArray documentsArray;
|
||||
|
||||
long start = 0;
|
||||
|
||||
public FullIndexSkipListTransformer(Path docsOutputFile,
|
||||
LongArray documentsArray) throws IOException {
|
||||
this.documentsArray = documentsArray;
|
||||
this.writer = new SkipListWriter(docsOutputFile);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long transform(long pos, long end) throws IOException {
|
||||
|
||||
final int size = (int) ((end - start) / 2);
|
||||
|
||||
if (size == 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
long offset = writer.writeList(documentsArray, start, size);
|
||||
start = end;
|
||||
return offset;
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
writer.close();
|
||||
}
|
||||
}
|
@@ -6,7 +6,6 @@ import nu.marginalia.btree.BTreeWriter;
|
||||
import nu.marginalia.index.ReverseIndexParameters;
|
||||
import nu.marginalia.index.construction.CountToOffsetTransformer;
|
||||
import nu.marginalia.index.construction.DocIdRewriter;
|
||||
import nu.marginalia.index.construction.IndexSizeEstimator;
|
||||
import nu.marginalia.index.construction.PositionsFileConstructor;
|
||||
import nu.marginalia.index.journal.IndexJournalPage;
|
||||
import org.slf4j.Logger;
|
||||
@@ -81,15 +80,11 @@ public class FullPreindex {
|
||||
|
||||
// Estimate the size of the docs index data
|
||||
offsets.transformEach(0, offsets.size(), new CountToOffsetTransformer(2));
|
||||
IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(ReverseIndexParameters.fullDocsBTreeContext, 2);
|
||||
offsets.fold(0, 0, offsets.size(), sizeEstimator);
|
||||
|
||||
// Write the docs file
|
||||
LongArray finalDocs = LongArrayFactory.mmapForWritingConfined(outputFileDocs, sizeEstimator.size);
|
||||
offsets.transformEachIO(0, offsets.size(),
|
||||
new FullIndexBTreeTransformer(finalDocs, 2,
|
||||
ReverseIndexParameters.fullDocsBTreeContext,
|
||||
documents.documents));
|
||||
try (var transformer = new FullIndexSkipListTransformer(outputFileDocs, documents.documents)) {
|
||||
offsets.transformEachIO(0, offsets.size(), transformer);
|
||||
}
|
||||
|
||||
LongArray wordIds = segments.wordIds;
|
||||
|
||||
@@ -102,7 +97,7 @@ public class FullPreindex {
|
||||
// Estimate the size of the words index data
|
||||
long wordsSize = ReverseIndexParameters.wordsBTreeContext.calculateSize((int) offsets.size());
|
||||
|
||||
// Construct the tree
|
||||
// Construct the keywords tree
|
||||
LongArray wordsArray = LongArrayFactory.mmapForWritingConfined(outputFileWords, wordsSize);
|
||||
|
||||
new BTreeWriter(wordsArray, ReverseIndexParameters.wordsBTreeContext)
|
||||
@@ -113,8 +108,6 @@ public class FullPreindex {
|
||||
}
|
||||
});
|
||||
|
||||
finalDocs.force();
|
||||
finalDocs.close();
|
||||
wordsArray.force();
|
||||
wordsArray.close();
|
||||
|
||||
|
@@ -12,10 +12,8 @@ import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.List;
|
||||
|
||||
/** A LongArray with document data, segmented according to
|
||||
@@ -52,11 +50,6 @@ public class FullPreindexDocuments {
|
||||
return new FullPreindexDocuments(docsFileMap, docsFile);
|
||||
}
|
||||
|
||||
public FileChannel createDocumentsFileChannel() throws IOException {
|
||||
return (FileChannel) Files.newByteChannel(file, StandardOpenOption.READ);
|
||||
}
|
||||
|
||||
|
||||
public LongArray slice(long start, long end) {
|
||||
return documents.range(start, end);
|
||||
}
|
||||
@@ -86,6 +79,8 @@ public class FullPreindexDocuments {
|
||||
var offsetMap = segments.asMap(RECORD_SIZE_LONGS);
|
||||
offsetMap.defaultReturnValue(0);
|
||||
|
||||
var positionsBlock = positionsFileConstructor.getBlock();
|
||||
|
||||
while (docIds.hasRemaining()) {
|
||||
long docId = docIds.get();
|
||||
long rankEncodedId = docIdRewriter.rewriteDocId(docId);
|
||||
@@ -101,12 +96,13 @@ public class FullPreindexDocuments {
|
||||
ByteBuffer pos = tPos.get(i);
|
||||
|
||||
long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS);
|
||||
long encodedPosOffset = positionsFileConstructor.add(meta, pos);
|
||||
long encodedPosOffset = positionsFileConstructor.add(positionsBlock, meta, pos);
|
||||
|
||||
assembly.put(offset + 0, rankEncodedId);
|
||||
assembly.put(offset + 1, encodedPosOffset);
|
||||
}
|
||||
}
|
||||
positionsBlock.commit();
|
||||
|
||||
assembly.write(docsFile);
|
||||
}
|
||||
|
@@ -1,43 +1,78 @@
|
||||
package nu.marginalia.index.positions;
|
||||
|
||||
import nu.marginalia.uring.UringFileReader;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.lang.foreign.Arena;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.lang.foreign.MemorySegment;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.List;
|
||||
|
||||
/** Reads positions data from the positions file */
|
||||
public class PositionsFileReader implements AutoCloseable {
|
||||
private final FileChannel positions;
|
||||
|
||||
private final UringFileReader uringFileReader;
|
||||
private static final Logger logger = LoggerFactory.getLogger(PositionsFileReader.class);
|
||||
|
||||
public PositionsFileReader(Path positionsFile) throws IOException {
|
||||
this.positions = FileChannel.open(positionsFile, StandardOpenOption.READ);
|
||||
}
|
||||
if (Boolean.getBoolean("index.directModePositionsFile")) {
|
||||
if ((Files.size(positionsFile) & 4095) != 0) {
|
||||
throw new IllegalArgumentException("Positions file is not block aligned in size: " + Files.size(positionsFile));
|
||||
}
|
||||
|
||||
/** Get the positions for a term in the index, as pointed out by the encoded offset;
|
||||
* intermediate buffers are allocated from the provided arena allocator. */
|
||||
public TermData getTermData(Arena arena, long sizeEncodedOffset) {
|
||||
int length = PositionCodec.decodeSize(sizeEncodedOffset);
|
||||
long offset = PositionCodec.decodeOffset(sizeEncodedOffset);
|
||||
|
||||
var segment = arena.allocate(length);
|
||||
var buffer = segment.asByteBuffer();
|
||||
|
||||
try {
|
||||
positions.read(buffer, offset);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
uringFileReader = new UringFileReader(positionsFile, true);
|
||||
}
|
||||
else {
|
||||
uringFileReader = new UringFileReader(positionsFile, false);
|
||||
}
|
||||
|
||||
return new TermData(buffer);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
positions.close();
|
||||
uringFileReader.close();
|
||||
}
|
||||
|
||||
/** Get the positions for a set of keywords in the index, as pointed out by the encoded offsets;
|
||||
* intermediate buffers are allocated from the provided arena allocator. */
|
||||
public TermData[] getTermData(Arena arena, long[] offsets) {
|
||||
|
||||
int cnt = 0;
|
||||
|
||||
for (int i = 0; i < offsets.length; i++) {
|
||||
long encodedOffset = offsets[i];
|
||||
if (encodedOffset == 0) continue;
|
||||
cnt++;
|
||||
}
|
||||
|
||||
if (cnt == 0) {
|
||||
return new TermData[offsets.length];
|
||||
}
|
||||
|
||||
long[] readOffsets = new long[cnt];
|
||||
int[] readSizes = new int[cnt];
|
||||
|
||||
for (int i = 0, j = 0; i < offsets.length; i++) {
|
||||
long encodedOffset = offsets[i];
|
||||
if (encodedOffset == 0) continue;
|
||||
|
||||
readSizes[j] = PositionCodec.decodeSize(encodedOffset);
|
||||
readOffsets[j] = PositionCodec.decodeOffset(encodedOffset);
|
||||
j++;
|
||||
}
|
||||
|
||||
List<MemorySegment> buffers = uringFileReader.readUnaligned(arena, readOffsets, readSizes, 4096);
|
||||
|
||||
TermData[] ret = new TermData[offsets.length];
|
||||
for (int i = 0, j=0; i < offsets.length; i++) {
|
||||
long encodedOffset = offsets[i];
|
||||
if (encodedOffset == 0) continue;
|
||||
ret[i] = new TermData(buffers.get(j++).asByteBuffer());
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -1,24 +1,22 @@
|
||||
package nu.marginalia.index.query;
|
||||
|
||||
import nu.marginalia.array.page.LongQueryBuffer;
|
||||
import nu.marginalia.btree.BTreeReader;
|
||||
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
||||
import nu.marginalia.skiplist.SkipListReader;
|
||||
|
||||
public record ReverseIndexRejectFilter(BTreeReader range) implements QueryFilterStepIf {
|
||||
public record ReverseIndexRejectFilter(SkipListReader range, IndexSearchBudget budget) implements QueryFilterStepIf {
|
||||
|
||||
@Override
|
||||
public void apply(LongQueryBuffer buffer) {
|
||||
range.rejectEntries(buffer);
|
||||
while (budget.hasTimeLeft() && range.tryRejectData(buffer));
|
||||
|
||||
buffer.finalizeFiltering();
|
||||
}
|
||||
|
||||
public boolean test(long id) {
|
||||
return range.findEntry(id) < 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double cost() {
|
||||
return range.numEntries();
|
||||
return 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@@ -1,24 +1,21 @@
|
||||
package nu.marginalia.index.query;
|
||||
|
||||
import nu.marginalia.array.page.LongQueryBuffer;
|
||||
import nu.marginalia.btree.BTreeReader;
|
||||
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
||||
import nu.marginalia.skiplist.SkipListReader;
|
||||
|
||||
public record ReverseIndexRetainFilter(BTreeReader range, String name, long wordId) implements QueryFilterStepIf {
|
||||
public record ReverseIndexRetainFilter(SkipListReader range, String name, long wordId, IndexSearchBudget budget) implements QueryFilterStepIf {
|
||||
|
||||
@Override
|
||||
public void apply(LongQueryBuffer buffer) {
|
||||
range.retainEntries(buffer);
|
||||
buffer.finalizeFiltering();
|
||||
}
|
||||
while (budget.hasTimeLeft() && range.tryRetainData(buffer));
|
||||
|
||||
public boolean test(long id) {
|
||||
return range.findEntry(id) >= 0;
|
||||
buffer.finalizeFiltering();
|
||||
}
|
||||
|
||||
@Override
|
||||
public double cost() {
|
||||
return range.numEntries();
|
||||
return 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@@ -11,7 +11,6 @@ import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.lang.foreign.Arena;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
@@ -32,32 +31,32 @@ class PositionsFileReaderTest {
|
||||
|
||||
@Test
|
||||
void getTermData() throws IOException {
|
||||
ByteBuffer workArea = ByteBuffer.allocate(8192);
|
||||
long key1, key2, key3;
|
||||
try (PositionsFileConstructor constructor = new PositionsFileConstructor(file)) {
|
||||
key1 = constructor.add((byte) 43, VarintCodedSequence.generate(1, 2, 3).buffer());
|
||||
key2 = constructor.add((byte) 51, VarintCodedSequence.generate(2, 3, 5, 1000, 5000, 20241).buffer());
|
||||
key3 = constructor.add((byte) 61, VarintCodedSequence.generate(3, 5, 7).buffer());
|
||||
var block = constructor.getBlock();
|
||||
key1 = constructor.add(block, (byte) 43, VarintCodedSequence.generate(1, 2, 3).buffer());
|
||||
key2 = constructor.add(block, (byte) 51, VarintCodedSequence.generate(2, 3, 5, 1000, 5000, 20241).buffer());
|
||||
key3 = constructor.add(block, (byte) 61, VarintCodedSequence.generate(3, 5, 7).buffer());
|
||||
block.commit();
|
||||
}
|
||||
|
||||
System.out.println("key1: " + Long.toHexString(key1));
|
||||
System.out.println("key2: " + Long.toHexString(key2));
|
||||
System.out.println("key3: " + Long.toHexString(key3));
|
||||
|
||||
try (Arena arena = Arena.ofConfined();
|
||||
try (Arena arena = Arena.ofShared();
|
||||
PositionsFileReader reader = new PositionsFileReader(file))
|
||||
{
|
||||
TermData data1 = reader.getTermData(arena, key1);
|
||||
assertEquals(43, data1.flags());
|
||||
assertEquals(IntList.of( 1, 2, 3), data1.positions().values());
|
||||
TermData[] data = reader.getTermData(arena, new long[] { key1, key2, key3 });
|
||||
|
||||
TermData data2 = reader.getTermData(arena, key2);
|
||||
assertEquals(51, data2.flags());
|
||||
assertEquals(IntList.of(2, 3, 5, 1000, 5000, 20241), data2.positions().values());
|
||||
assertEquals(43, data[0].flags());
|
||||
assertEquals(IntList.of( 1, 2, 3), data[0].positions().values());
|
||||
|
||||
TermData data3 = reader.getTermData(arena, key3);
|
||||
assertEquals(61, data3.flags());
|
||||
assertEquals(IntList.of(3, 5, 7), data3.positions().values());
|
||||
assertEquals(51, data[1].flags());
|
||||
assertEquals(IntList.of(2, 3, 5, 1000, 5000, 20241), data[1].positions().values());
|
||||
|
||||
assertEquals(61, data[2].flags());
|
||||
assertEquals(IntList.of(3, 5, 7), data[2].positions().values());
|
||||
}
|
||||
}
|
||||
}
|
@@ -1,149 +0,0 @@
|
||||
|
||||
package nu.marginalia.index.construction.full;
|
||||
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.btree.model.BTreeHeader;
|
||||
import nu.marginalia.hash.MurmurHash3_128;
|
||||
import nu.marginalia.index.construction.DocIdRewriter;
|
||||
import nu.marginalia.index.construction.PositionsFileConstructor;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import static nu.marginalia.index.construction.full.TestJournalFactory.EntryDataWithWordMeta;
|
||||
import static nu.marginalia.index.construction.full.TestJournalFactory.wm;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
class FullPreindexFinalizeTest {
|
||||
TestJournalFactory journalFactory;
|
||||
Path positionsFile;
|
||||
Path countsFile;
|
||||
Path wordsIdFile;
|
||||
Path docsFile;
|
||||
Path tempDir;
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws IOException {
|
||||
journalFactory = new TestJournalFactory();
|
||||
|
||||
positionsFile = Files.createTempFile("positions", ".dat");
|
||||
countsFile = Files.createTempFile("counts", ".dat");
|
||||
wordsIdFile = Files.createTempFile("words", ".dat");
|
||||
docsFile = Files.createTempFile("docs", ".dat");
|
||||
tempDir = Files.createTempDirectory("sort");
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
public void tearDown() throws IOException {
|
||||
journalFactory.clear();
|
||||
|
||||
Files.deleteIfExists(countsFile);
|
||||
Files.deleteIfExists(wordsIdFile);
|
||||
List<Path> contents = new ArrayList<>();
|
||||
Files.list(tempDir).forEach(contents::add);
|
||||
for (var tempFile : contents) {
|
||||
Files.delete(tempFile);
|
||||
}
|
||||
Files.delete(tempDir);
|
||||
}
|
||||
|
||||
MurmurHash3_128 hash = new MurmurHash3_128();
|
||||
long termId(String keyword) {
|
||||
return hash.hashKeyword(keyword);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFinalizeSimple() throws IOException {
|
||||
var reader = journalFactory.createReader(new EntryDataWithWordMeta(100, 101, wm(50, 51)));
|
||||
var preindex = FullPreindex.constructPreindex(reader,
|
||||
new PositionsFileConstructor(positionsFile),
|
||||
DocIdRewriter.identity(), tempDir);
|
||||
|
||||
|
||||
preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat"));
|
||||
preindex.delete();
|
||||
|
||||
Path wordsFile = tempDir.resolve("words.dat");
|
||||
Path docsFile = tempDir.resolve("docs.dat");
|
||||
|
||||
assertTrue(Files.exists(wordsFile));
|
||||
assertTrue(Files.exists(docsFile));
|
||||
|
||||
System.out.println(Files.size(wordsFile));
|
||||
System.out.println(Files.size(docsFile));
|
||||
|
||||
var docsArray = LongArrayFactory.mmapForReadingConfined(docsFile);
|
||||
var wordsArray = LongArrayFactory.mmapForReadingConfined(wordsFile);
|
||||
|
||||
var docsHeader = new BTreeHeader(docsArray, 0);
|
||||
var wordsHeader = new BTreeHeader(wordsArray, 0);
|
||||
|
||||
assertEquals(1, docsHeader.numEntries());
|
||||
assertEquals(1, wordsHeader.numEntries());
|
||||
|
||||
assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
|
||||
assertEquals(termId("50"), wordsArray.get(wordsHeader.dataOffsetLongs()));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testFinalizeSimple2x2() throws IOException {
|
||||
var reader = journalFactory.createReader(
|
||||
new EntryDataWithWordMeta(100, 101, wm(50, 51)),
|
||||
new EntryDataWithWordMeta(101, 101, wm(51, 52))
|
||||
);
|
||||
|
||||
var preindex = FullPreindex.constructPreindex(reader,
|
||||
new PositionsFileConstructor(positionsFile),
|
||||
DocIdRewriter.identity(), tempDir);
|
||||
|
||||
preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat"));
|
||||
preindex.delete();
|
||||
|
||||
Path wordsFile = tempDir.resolve("words.dat");
|
||||
Path docsFile = tempDir.resolve("docs.dat");
|
||||
|
||||
assertTrue(Files.exists(wordsFile));
|
||||
assertTrue(Files.exists(docsFile));
|
||||
|
||||
System.out.println(Files.size(wordsFile));
|
||||
System.out.println(Files.size(docsFile));
|
||||
|
||||
var docsArray = LongArrayFactory.mmapForReadingConfined(docsFile);
|
||||
var wordsArray = LongArrayFactory.mmapForReadingConfined(wordsFile);
|
||||
|
||||
|
||||
var wordsHeader = new BTreeHeader(wordsArray, 0);
|
||||
|
||||
System.out.println(wordsHeader);
|
||||
|
||||
assertEquals(2, wordsHeader.numEntries());
|
||||
|
||||
long offset1 = wordsArray.get(wordsHeader.dataOffsetLongs() + 1);
|
||||
long offset2 = wordsArray.get(wordsHeader.dataOffsetLongs() + 3);
|
||||
|
||||
assertEquals(termId("50"), wordsArray.get(wordsHeader.dataOffsetLongs()));
|
||||
assertEquals(termId("50"), wordsArray.get(wordsHeader.dataOffsetLongs()));
|
||||
|
||||
BTreeHeader docsHeader;
|
||||
|
||||
docsHeader = new BTreeHeader(docsArray, offset1);
|
||||
System.out.println(docsHeader);
|
||||
assertEquals(1, docsHeader.numEntries());
|
||||
|
||||
assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
|
||||
|
||||
docsHeader = new BTreeHeader(docsArray, offset2);
|
||||
System.out.println(docsHeader);
|
||||
assertEquals(1, docsHeader.numEntries());
|
||||
|
||||
assertEquals(101, docsArray.get(docsHeader.dataOffsetLongs() + 0));
|
||||
}
|
||||
}
|
@@ -1,10 +1,10 @@
|
||||
package nu.marginalia.ranking.domains;
|
||||
package nu.marginalia.domainranking;
|
||||
|
||||
import gnu.trove.list.TIntList;
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
import nu.marginalia.ranking.domains.accumulator.RankingResultAccumulator;
|
||||
import nu.marginalia.ranking.domains.data.GraphSource;
|
||||
import nu.marginalia.ranking.domains.jgrapht.PersonalizedPageRank;
|
||||
import nu.marginalia.domainranking.accumulator.RankingResultAccumulator;
|
||||
import nu.marginalia.domainranking.data.GraphSource;
|
||||
import nu.marginalia.domainranking.jgrapht.PersonalizedPageRank;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.alg.interfaces.VertexScoringAlgorithm;
|
||||
import org.jgrapht.alg.scoring.PageRank;
|
@@ -1,6 +1,6 @@
|
||||
package nu.marginalia.ranking.domains;
|
||||
package nu.marginalia.domainranking;
|
||||
|
||||
import nu.marginalia.ranking.domains.accumulator.RankingResultAccumulator;
|
||||
import nu.marginalia.domainranking.accumulator.RankingResultAccumulator;
|
||||
|
||||
import java.util.function.Supplier;
|
||||
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.ranking.domains.accumulator;
|
||||
package nu.marginalia.domainranking.accumulator;
|
||||
|
||||
public interface RankingResultAccumulator<T> {
|
||||
void add(int domainId, int rank);
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.ranking.domains.accumulator;
|
||||
package nu.marginalia.domainranking.accumulator;
|
||||
|
||||
import org.roaringbitmap.RoaringBitmap;
|
||||
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.ranking.domains.accumulator;
|
||||
package nu.marginalia.domainranking.accumulator;
|
||||
|
||||
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
|
||||
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.ranking.domains.accumulator;
|
||||
package nu.marginalia.domainranking.accumulator;
|
||||
|
||||
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
|
||||
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.ranking.domains.accumulator;
|
||||
package nu.marginalia.domainranking.accumulator;
|
||||
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.ranking.domains.data;
|
||||
package nu.marginalia.domainranking.data;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import org.jgrapht.Graph;
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.ranking.domains.data;
|
||||
package nu.marginalia.domainranking.data;
|
||||
|
||||
import org.jgrapht.Graph;
|
||||
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.ranking.domains.data;
|
||||
package nu.marginalia.domainranking.data;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.ranking.domains.data;
|
||||
package nu.marginalia.domainranking.data;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.ranking.domains.data;
|
||||
package nu.marginalia.domainranking.data;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.ranking.domains.jgrapht;
|
||||
package nu.marginalia.domainranking.jgrapht;
|
||||
|
||||
/*
|
||||
* (C) Copyright 2016-2023, by Dimitrios Michail and Contributors.
|
||||
@@ -21,8 +21,9 @@ package nu.marginalia.ranking.domains.jgrapht;
|
||||
|
||||
/* (modified by @vlofgren to add personalization) */
|
||||
|
||||
import org.jgrapht.*;
|
||||
import org.jgrapht.alg.interfaces.*;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.Graphs;
|
||||
import org.jgrapht.alg.interfaces.VertexScoringAlgorithm;
|
||||
|
||||
import java.util.*;
|
||||
|
@@ -2,28 +2,18 @@ package nu.marginalia.index;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import io.grpc.Status;
|
||||
import io.grpc.stub.StreamObserver;
|
||||
import io.prometheus.client.Counter;
|
||||
import io.prometheus.client.Gauge;
|
||||
import io.prometheus.client.Histogram;
|
||||
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
||||
import nu.marginalia.api.searchquery.IndexApiGrpc;
|
||||
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
|
||||
import nu.marginalia.api.searchquery.RpcIndexQuery;
|
||||
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||
import nu.marginalia.array.page.LongQueryBuffer;
|
||||
import nu.marginalia.index.index.StatefulIndex;
|
||||
import nu.marginalia.index.model.SearchParameters;
|
||||
import nu.marginalia.index.model.SearchTerms;
|
||||
import nu.marginalia.index.query.IndexQuery;
|
||||
import nu.marginalia.index.query.IndexSearchBudget;
|
||||
import nu.marginalia.index.results.IndexResultRankingService;
|
||||
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
||||
import nu.marginalia.index.searchset.SearchSet;
|
||||
import nu.marginalia.index.searchset.SearchSetsService;
|
||||
import nu.marginalia.index.searchset.SmallSearchSet;
|
||||
@@ -34,14 +24,7 @@ import org.slf4j.LoggerFactory;
|
||||
import org.slf4j.Marker;
|
||||
import org.slf4j.MarkerFactory;
|
||||
|
||||
import java.util.BitSet;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
import java.util.concurrent.Executor;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
@Singleton
|
||||
public class IndexGrpcService
|
||||
@@ -87,23 +70,22 @@ public class IndexGrpcService
|
||||
private final StatefulIndex statefulIndex;
|
||||
private final SearchSetsService searchSetsService;
|
||||
|
||||
private final IndexResultRankingService resultValuator;
|
||||
private final IndexResultRankingService rankingService;
|
||||
|
||||
private final String nodeName;
|
||||
|
||||
private static final int indexValuationThreads = Integer.getInteger("index.valuationThreads", 16);
|
||||
|
||||
@Inject
|
||||
public IndexGrpcService(ServiceConfiguration serviceConfiguration,
|
||||
StatefulIndex statefulIndex,
|
||||
SearchSetsService searchSetsService,
|
||||
IndexResultRankingService resultValuator)
|
||||
IndexResultRankingService rankingService)
|
||||
{
|
||||
var nodeId = serviceConfiguration.node();
|
||||
this.nodeName = Integer.toString(nodeId);
|
||||
this.statefulIndex = statefulIndex;
|
||||
this.searchSetsService = searchSetsService;
|
||||
this.resultValuator = resultValuator;
|
||||
this.rankingService = rankingService;
|
||||
}
|
||||
|
||||
// GRPC endpoint
|
||||
@@ -120,7 +102,13 @@ public class IndexGrpcService
|
||||
.time(() -> {
|
||||
// Perform the search
|
||||
try {
|
||||
return executeSearch(params);
|
||||
|
||||
if (!statefulIndex.isLoaded()) {
|
||||
// Short-circuit if the index is not loaded, as we trivially know that there can be no results
|
||||
return List.of();
|
||||
}
|
||||
|
||||
return new IndexQueryExecution(params, rankingService, statefulIndex.get()).run();
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Error in handling request", ex);
|
||||
@@ -148,7 +136,7 @@ public class IndexGrpcService
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Error in handling request", ex);
|
||||
responseObserver.onError(ex);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(ex).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -156,7 +144,12 @@ public class IndexGrpcService
|
||||
// exists for test access
|
||||
public List<RpcDecoratedResultItem> justQuery(SearchSpecification specsSet) {
|
||||
try {
|
||||
return executeSearch(new SearchParameters(specsSet, getSearchSet(specsSet)));
|
||||
if (!statefulIndex.isLoaded()) {
|
||||
// Short-circuit if the index is not loaded, as we trivially know that there can be no results
|
||||
return List.of();
|
||||
}
|
||||
|
||||
return new IndexQueryExecution(new SearchParameters(specsSet, getSearchSet(specsSet)), rankingService, statefulIndex.get()).run();
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Error in handling request", ex);
|
||||
@@ -182,262 +175,6 @@ public class IndexGrpcService
|
||||
return searchSetsService.getSearchSetByName(request.getSearchSetIdentifier());
|
||||
}
|
||||
|
||||
// accessible for tests
|
||||
public List<RpcDecoratedResultItem> executeSearch(SearchParameters params) throws Exception {
|
||||
|
||||
if (!statefulIndex.isLoaded()) {
|
||||
// Short-circuit if the index is not loaded, as we trivially know that there can be no results
|
||||
return List.of();
|
||||
}
|
||||
|
||||
ResultRankingContext rankingContext = createRankingContext(params.rankingParams,
|
||||
params.compiledQuery,
|
||||
params.compiledQueryIds);
|
||||
|
||||
var queryExecution = new QueryExecution(rankingContext, params.fetchSize);
|
||||
|
||||
List<RpcDecoratedResultItem> ret = queryExecution.run(params);
|
||||
|
||||
wmsa_index_query_exec_block_time
|
||||
.labels(nodeName)
|
||||
.set(queryExecution.getBlockTime() / 1000.);
|
||||
wmsa_index_query_exec_stall_time
|
||||
.labels(nodeName)
|
||||
.set(queryExecution.getStallTime() / 1000.);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/** This class is responsible for ranking the results and adding the best results to the
|
||||
* resultHeap, which depending on the state of the indexLookup threads may or may not block
|
||||
*/
|
||||
private ResultRankingContext createRankingContext(RpcResultRankingParameters rankingParams,
|
||||
CompiledQuery<String> compiledQuery,
|
||||
CompiledQueryLong compiledQueryIds)
|
||||
{
|
||||
|
||||
int[] full = new int[compiledQueryIds.size()];
|
||||
int[] prio = new int[compiledQueryIds.size()];
|
||||
|
||||
BitSet ngramsMask = new BitSet(compiledQuery.size());
|
||||
BitSet regularMask = new BitSet(compiledQuery.size());
|
||||
|
||||
var currentIndex = statefulIndex.get();
|
||||
|
||||
for (int idx = 0; idx < compiledQueryIds.size(); idx++) {
|
||||
long id = compiledQueryIds.at(idx);
|
||||
full[idx] = currentIndex.numHits(id);
|
||||
prio[idx] = currentIndex.numHitsPrio(id);
|
||||
|
||||
if (compiledQuery.at(idx).contains("_")) {
|
||||
ngramsMask.set(idx);
|
||||
}
|
||||
else {
|
||||
regularMask.set(idx);
|
||||
}
|
||||
}
|
||||
|
||||
return new ResultRankingContext(currentIndex.totalDocCount(),
|
||||
rankingParams,
|
||||
ngramsMask,
|
||||
regularMask,
|
||||
new CqDataInt(full),
|
||||
new CqDataInt(prio));
|
||||
}
|
||||
|
||||
/** This class is responsible for executing a search query. It uses a thread pool to
|
||||
* execute the subqueries and their valuation in parallel. The results are then combined
|
||||
* into a bounded priority queue, and finally the best results are returned.
|
||||
*/
|
||||
private class QueryExecution {
|
||||
|
||||
private static final Executor workerPool = Executors.newCachedThreadPool();
|
||||
|
||||
/** The queue where the results from the index lookup threads are placed,
|
||||
* pending ranking by the result ranker threads */
|
||||
private final ArrayBlockingQueue<CombinedDocIdList> resultCandidateQueue
|
||||
= new ArrayBlockingQueue<>(64);
|
||||
private final ResultPriorityQueue resultHeap;
|
||||
|
||||
private final ResultRankingContext resultRankingContext;
|
||||
private final AtomicInteger remainingIndexTasks = new AtomicInteger(0);
|
||||
|
||||
private final AtomicInteger remainingValuationTasks = new AtomicInteger(0);
|
||||
private final AtomicLong blockTime = new AtomicLong(0);
|
||||
|
||||
private final AtomicLong stallTime = new AtomicLong(0);
|
||||
|
||||
public long getStallTime() {
|
||||
return stallTime.get();
|
||||
}
|
||||
|
||||
public long getBlockTime() {
|
||||
return blockTime.get();
|
||||
}
|
||||
|
||||
private QueryExecution(ResultRankingContext resultRankingContext, int maxResults) {
|
||||
this.resultRankingContext = resultRankingContext;
|
||||
this.resultHeap = new ResultPriorityQueue(maxResults);
|
||||
}
|
||||
|
||||
/** Execute a search query */
|
||||
public List<RpcDecoratedResultItem> run(SearchParameters parameters) throws Exception {
|
||||
|
||||
var terms = new SearchTerms(parameters.query, parameters.compiledQueryIds);
|
||||
|
||||
var currentIndex = statefulIndex.get();
|
||||
for (var indexQuery : currentIndex.createQueries(terms, parameters.queryParams)) {
|
||||
workerPool.execute(new IndexLookup(indexQuery, parameters.budget));
|
||||
}
|
||||
|
||||
for (int i = 0; i < indexValuationThreads; i++) {
|
||||
workerPool.execute(new ResultRanker(parameters, resultRankingContext));
|
||||
}
|
||||
|
||||
// Wait for all tasks to complete
|
||||
awaitCompletion();
|
||||
|
||||
// Return the best results
|
||||
return resultValuator.selectBestResults(parameters, resultRankingContext, resultHeap);
|
||||
}
|
||||
|
||||
/** Wait for all tasks to complete */
|
||||
private void awaitCompletion() throws InterruptedException {
|
||||
synchronized (remainingValuationTasks) {
|
||||
while (remainingValuationTasks.get() > 0) {
|
||||
remainingValuationTasks.wait(20);
|
||||
}
|
||||
}
|
||||
}
|
||||
/** This class is responsible for executing a subquery and adding the results to the
|
||||
* resultCandidateQueue, which depending on the state of the valuator threads may
|
||||
* or may not block */
|
||||
class IndexLookup implements Runnable {
|
||||
private final IndexQuery query;
|
||||
|
||||
private final IndexSearchBudget budget;
|
||||
|
||||
IndexLookup(IndexQuery query,
|
||||
IndexSearchBudget budget) {
|
||||
this.query = query;
|
||||
this.budget = budget;
|
||||
|
||||
remainingIndexTasks.incrementAndGet();
|
||||
}
|
||||
|
||||
public void run() {
|
||||
try {
|
||||
executeSearch();
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Error in index lookup", ex);
|
||||
}
|
||||
finally {
|
||||
synchronized (remainingIndexTasks) {
|
||||
if (remainingIndexTasks.decrementAndGet() == 0) {
|
||||
remainingIndexTasks.notifyAll();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void executeSearch() {
|
||||
final LongArrayList results = new LongArrayList(16);
|
||||
|
||||
// These queries are different indices for one subquery
|
||||
final LongQueryBuffer buffer = new LongQueryBuffer(4096);
|
||||
|
||||
while (query.hasMore() && budget.hasTimeLeft())
|
||||
{
|
||||
buffer.reset();
|
||||
query.getMoreResults(buffer);
|
||||
|
||||
for (int i = 0; i < buffer.end; i+=16) {
|
||||
for (int j = 0; j < Math.min(buffer.end - i, 16); j++) {
|
||||
results.add(buffer.data.get(i+j));
|
||||
}
|
||||
enqueueResults(new CombinedDocIdList(results));
|
||||
results.clear();
|
||||
}
|
||||
}
|
||||
|
||||
buffer.dispose();
|
||||
}
|
||||
|
||||
private void enqueueResults(CombinedDocIdList resultIds) {
|
||||
long remainingTime = budget.timeLeft();
|
||||
|
||||
try {
|
||||
if (!resultCandidateQueue.offer(resultIds)) {
|
||||
long start = System.currentTimeMillis();
|
||||
resultCandidateQueue.offer(resultIds, remainingTime, TimeUnit.MILLISECONDS);
|
||||
blockTime.addAndGet(System.currentTimeMillis() - start);
|
||||
}
|
||||
}
|
||||
catch (InterruptedException e) {
|
||||
logger.warn("Interrupted while waiting to offer resultIds to queue", e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
class ResultRanker implements Runnable {
|
||||
private final SearchParameters parameters;
|
||||
|
||||
private final ResultRankingContext rankingContext;
|
||||
|
||||
ResultRanker(SearchParameters parameters, ResultRankingContext rankingContext) {
|
||||
this.parameters = parameters;
|
||||
this.rankingContext = rankingContext;
|
||||
|
||||
remainingValuationTasks.incrementAndGet();
|
||||
}
|
||||
|
||||
public void run() {
|
||||
try {
|
||||
while (parameters.budget.timeLeft() > 0 && execute());
|
||||
}
|
||||
catch (InterruptedException e) {
|
||||
logger.warn("Interrupted while waiting to poll resultIds from queue", e);
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Exception while ranking results", e);
|
||||
}
|
||||
finally {
|
||||
synchronized (remainingValuationTasks) {
|
||||
if (remainingValuationTasks.decrementAndGet() == 0)
|
||||
remainingValuationTasks.notifyAll();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private boolean execute() throws Exception {
|
||||
long start = System.currentTimeMillis();
|
||||
|
||||
// Do a relatively short poll to ensure we terminate in a timely manner
|
||||
// in the event all work is done
|
||||
final long pollTime = Math.clamp(parameters.budget.timeLeft(), 1, 5);
|
||||
CombinedDocIdList resultIds = resultCandidateQueue.poll(pollTime, TimeUnit.MILLISECONDS);
|
||||
|
||||
if (resultIds == null) {
|
||||
// check if we are done and can terminate
|
||||
if (remainingIndexTasks.get() == 0 && resultCandidateQueue.isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else {
|
||||
stallTime.addAndGet(System.currentTimeMillis() - start);
|
||||
|
||||
resultHeap.addAll(
|
||||
resultValuator.rankResults(parameters, false, rankingContext, resultIds)
|
||||
);
|
||||
}
|
||||
|
||||
return true; // keep going
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
228 code/index/java/nu/marginalia/index/IndexQueryExecution.java Normal file
@@ -0,0 +1,228 @@
|
||||
package nu.marginalia.index;
|
||||
|
||||
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
|
||||
import nu.marginalia.array.page.LongQueryBuffer;
|
||||
import nu.marginalia.index.index.CombinedIndexReader;
|
||||
import nu.marginalia.index.model.ResultRankingContext;
|
||||
import nu.marginalia.index.model.SearchParameters;
|
||||
import nu.marginalia.index.model.SearchTerms;
|
||||
import nu.marginalia.index.query.IndexQuery;
|
||||
import nu.marginalia.index.query.IndexSearchBudget;
|
||||
import nu.marginalia.index.results.IndexResultRankingService;
|
||||
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
||||
import nu.marginalia.skiplist.SkipListConstants;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.*;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
/** Performs an index query */
|
||||
public class IndexQueryExecution {
|
||||
|
||||
private static final int indexValuationThreads = Integer.getInteger("index.valuationThreads", 16);
|
||||
private static final int indexPreparationThreads = Integer.getInteger("index.preparationThreads", 4);
|
||||
|
||||
// Since most NVMe drives have a maximum read size of 128 KB, and most small reads are 512B,
|
||||
// this should probably be 128*1024 / 512 = 256 to reduce queue depth and optimize tail latency
|
||||
private static final int evaluationBatchSize = 256;
|
||||
|
||||
// This should probably be SkipListConstants.BLOCK_SIZE / 16 in order to reduce the number of unnecessary read
|
||||
// operations per lookup and again optimize tail latency
|
||||
private static final int lookupBatchSize = SkipListConstants.BLOCK_SIZE / 16;
|
||||
|
||||
private static final AtomicLong lookupTime = new AtomicLong();
|
||||
private static final AtomicLong prepTime = new AtomicLong();
|
||||
private static final AtomicLong valuationTime = new AtomicLong();
|
||||
|
||||
private static final ExecutorService threadPool = new ThreadPoolExecutor(indexValuationThreads, Integer.MAX_VALUE, 60L, TimeUnit.SECONDS, new SynchronousQueue<>());
|
||||
private static final Logger log = LoggerFactory.getLogger(IndexQueryExecution.class);
|
||||
|
||||
private final IndexResultRankingService rankingService;
|
||||
|
||||
private final ResultRankingContext rankingContext;
|
||||
private final List<IndexQuery> queries;
|
||||
private final IndexSearchBudget budget;
|
||||
private final ResultPriorityQueue resultHeap;
|
||||
private final CountDownLatch lookupCountdown;
|
||||
private final CountDownLatch preparationCountdown;
|
||||
private final CountDownLatch rankingCountdown;
|
||||
|
||||
private final ArrayBlockingQueue<CombinedDocIdList> fullPreparationQueue = new ArrayBlockingQueue<>(8, true);
|
||||
private final ArrayBlockingQueue<CombinedDocIdList> priorityPreparationQueue = new ArrayBlockingQueue<>(8, true);
|
||||
private final ArrayBlockingQueue<IndexResultRankingService.RankingData> fullEvaluationQueue = new ArrayBlockingQueue<>(8, true);
|
||||
private final ArrayBlockingQueue<IndexResultRankingService.RankingData> priorityEvaluationQueue = new ArrayBlockingQueue<>(8, true);
|
||||
|
||||
private final int limitTotal;
|
||||
private final int limitByDomain;
|
||||
|
||||
static {
|
||||
Thread.ofPlatform().daemon().start(() -> {
|
||||
for (;;) {
|
||||
try {
|
||||
TimeUnit.SECONDS.sleep(10);
|
||||
}
|
||||
catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
break;
|
||||
}
|
||||
log.info("Lookup: {}, Valuation: {}, Prep Time: {}", lookupTime.get() / 1_000_000_000., valuationTime.get() / 1_000_000_000., prepTime.get() / 1_000_000_000.);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
public IndexQueryExecution(SearchParameters params,
|
||||
IndexResultRankingService rankingService,
|
||||
CombinedIndexReader currentIndex) {
|
||||
this.rankingService = rankingService;
|
||||
|
||||
resultHeap = new ResultPriorityQueue(params.fetchSize);
|
||||
|
||||
budget = params.budget;
|
||||
limitByDomain = params.limitByDomain;
|
||||
limitTotal = params.limitTotal;
|
||||
|
||||
rankingContext = ResultRankingContext.create(currentIndex, params);
|
||||
queries = currentIndex.createQueries(new SearchTerms(params.query, params.compiledQueryIds), params.queryParams, budget);
|
||||
|
||||
lookupCountdown = new CountDownLatch(queries.size());
|
||||
preparationCountdown = new CountDownLatch(indexPreparationThreads * 2);
|
||||
rankingCountdown = new CountDownLatch(indexValuationThreads * 2);
|
||||
}
|
||||
|
||||
public List<RpcDecoratedResultItem> run() throws InterruptedException, SQLException {
|
||||
for (IndexQuery query : queries) {
|
||||
threadPool.submit(() -> lookup(query));
|
||||
}
|
||||
|
||||
for (int i = 0; i < indexPreparationThreads; i++) {
|
||||
threadPool.submit(() -> prepare(priorityPreparationQueue, priorityEvaluationQueue));
|
||||
threadPool.submit(() -> prepare(fullPreparationQueue, fullEvaluationQueue));
|
||||
}
|
||||
|
||||
// Spawn evaluation tasks that rank the prepared batches
|
||||
for (int i = 0; i < indexValuationThreads; i++) {
|
||||
threadPool.submit(() -> evaluate(priorityEvaluationQueue));
|
||||
threadPool.submit(() -> evaluate(fullEvaluationQueue));
|
||||
}
|
||||
|
||||
// Await lookup task termination
|
||||
lookupCountdown.await();
|
||||
preparationCountdown.await();
|
||||
rankingCountdown.await();
|
||||
|
||||
// Deallocate any leftover ranking data buffers
|
||||
for (var data : priorityEvaluationQueue) {
|
||||
data.close();
|
||||
}
|
||||
for (var data : fullEvaluationQueue) {
|
||||
data.close();
|
||||
}
|
||||
|
||||
// Final result selection
|
||||
return rankingService.selectBestResults(limitByDomain, limitTotal, rankingContext, resultHeap.toList());
|
||||
}
|
||||
|
||||
private List<Future<?>> lookup(IndexQuery query) {
|
||||
final LongQueryBuffer buffer = new LongQueryBuffer(lookupBatchSize);
|
||||
List<Future<?>> evaluationJobs = new ArrayList<>();
|
||||
try {
|
||||
while (query.hasMore() && budget.hasTimeLeft()) {
|
||||
|
||||
buffer.zero();
|
||||
|
||||
long st = System.nanoTime();
|
||||
query.getMoreResults(buffer);
|
||||
long et = System.nanoTime();
|
||||
lookupTime.addAndGet(et - st);
|
||||
|
||||
if (buffer.isEmpty())
|
||||
continue;
|
||||
|
||||
var queue = query.isPrioritized() ? priorityPreparationQueue : fullPreparationQueue;
|
||||
|
||||
if (buffer.end <= evaluationBatchSize) {
|
||||
var docIds = new CombinedDocIdList(buffer);
|
||||
|
||||
if (!queue.offer(docIds, Math.max(1, budget.timeLeft()), TimeUnit.MILLISECONDS))
|
||||
break;
|
||||
}
|
||||
else {
|
||||
long[] bufferData = buffer.copyData();
|
||||
for (int start = 0; start < bufferData.length; start+= evaluationBatchSize) {
|
||||
|
||||
long[] slice = Arrays.copyOfRange(bufferData, start,
|
||||
Math.min(start + evaluationBatchSize, bufferData.length));
|
||||
|
||||
var docIds = new CombinedDocIdList(slice);
|
||||
|
||||
if (!queue.offer(docIds, Math.max(1, budget.timeLeft()), TimeUnit.MILLISECONDS))
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (RuntimeException | InterruptedException ex) {
|
||||
log.error("Exception in lookup thread", ex);
|
||||
} finally {
|
||||
buffer.dispose();
|
||||
lookupCountdown.countDown();
|
||||
}
|
||||
|
||||
return evaluationJobs;
|
||||
}
|
||||
|
||||
private void prepare(ArrayBlockingQueue<CombinedDocIdList> inputQueue, ArrayBlockingQueue<IndexResultRankingService.RankingData> outputQueue) {
|
||||
try {
|
||||
while (budget.hasTimeLeft() && (lookupCountdown.getCount() > 0 || !inputQueue.isEmpty())) {
|
||||
var docIds = inputQueue.poll(Math.clamp(budget.timeLeft(), 1, 5), TimeUnit.MILLISECONDS);
|
||||
if (docIds == null) continue;
|
||||
long st = System.nanoTime();
|
||||
var preparedData = rankingService.prepareRankingData(rankingContext, docIds, budget);
|
||||
long et = System.nanoTime();
|
||||
prepTime.addAndGet(et - st);
|
||||
if (!outputQueue.offer(preparedData, Math.max(1, budget.timeLeft()), TimeUnit.MILLISECONDS))
|
||||
preparedData.close();
|
||||
}
|
||||
} catch (TimeoutException ex) {
|
||||
// This is normal
|
||||
} catch (Exception ex) {
|
||||
if (!(ex.getCause() instanceof InterruptedException)) {
|
||||
log.error("Exception in lookup thread", ex);
|
||||
} // suppress logging for interrupted ex
|
||||
} finally {
|
||||
preparationCountdown.countDown();
|
||||
}
|
||||
}
|
||||
|
||||
private void evaluate(ArrayBlockingQueue<IndexResultRankingService.RankingData> queue) {
|
||||
try {
|
||||
while (budget.hasTimeLeft() && (preparationCountdown.getCount() > 0 || !queue.isEmpty())) {
|
||||
var rankingData = queue.poll(Math.clamp(budget.timeLeft(), 1, 5), TimeUnit.MILLISECONDS);
|
||||
if (rankingData == null) continue;
|
||||
|
||||
try (rankingData) {
|
||||
long st = System.nanoTime();
|
||||
resultHeap.addAll(rankingService.rankResults(budget, rankingContext, rankingData, false));
|
||||
long et = System.nanoTime();
|
||||
valuationTime.addAndGet(et - st);
|
||||
}
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
if (!(ex.getCause() instanceof InterruptedException)) {
|
||||
log.error("Exception in lookup thread", ex);
|
||||
} // suppress logging for interrupted ex
|
||||
} finally {
|
||||
rankingCountdown.countDown();
|
||||
}
|
||||
}
|
||||
|
||||
public int itemsProcessed() {
|
||||
return resultHeap.getItemsProcessed();
|
||||
}
|
||||
|
||||
}
|
@@ -1,5 +1,6 @@
|
||||
package nu.marginalia.index;
|
||||
|
||||
import com.google.common.collect.MinMaxPriorityQueue;
|
||||
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
@@ -9,108 +10,52 @@ import java.util.*;
|
||||
/** A priority queue for search results. This class is not thread-safe,
|
||||
* in general, except for concurrent use of the addAll method.
|
||||
* <p></p>
|
||||
* The class implements a subset of the Collection interface, and
|
||||
* is intended to be used as a priority queue for search results,
|
||||
* with a maximum size.
|
||||
* <p></p>
|
||||
* Since the expected use case is to add a large number of items
|
||||
* and then iterate over the items, the class is optimized for
|
||||
* this scenario, and does not implement other mutating methods
|
||||
* than addAll().
|
||||
*/
|
||||
public class ResultPriorityQueue implements Iterable<SearchResultItem>,
|
||||
Collection<SearchResultItem> {
|
||||
private final int limit;
|
||||
private final ArrayList<SearchResultItem> backingList = new ArrayList<>();
|
||||
public class ResultPriorityQueue implements Iterable<SearchResultItem> {
|
||||
private final LongOpenHashSet idsInSet = new LongOpenHashSet();
|
||||
private final MinMaxPriorityQueue<SearchResultItem> queue;
|
||||
|
||||
private int itemsProcessed = 0;
|
||||
|
||||
public ResultPriorityQueue(int limit) {
|
||||
this.limit = limit;
|
||||
this.queue = MinMaxPriorityQueue.<SearchResultItem>orderedBy(Comparator.naturalOrder()).maximumSize(limit).create();
|
||||
}
|
||||
|
||||
public Iterator<SearchResultItem> iterator() {
|
||||
return backingList.iterator();
|
||||
}
|
||||
|
||||
@NotNull
|
||||
@Override
|
||||
public Object[] toArray() {
|
||||
return backingList.toArray();
|
||||
}
|
||||
|
||||
@NotNull
|
||||
@Override
|
||||
public <T> T[] toArray(@NotNull T[] a) {
|
||||
return backingList.toArray(a);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean add(SearchResultItem searchResultItem) {
|
||||
throw new UnsupportedOperationException("Use addAll instead");
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean remove(Object o) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean containsAll(@NotNull Collection<?> c) {
|
||||
return idsInSet.containsAll(c);
|
||||
return queue.iterator();
|
||||
}
|
||||
|
||||
/** Adds all items to the queue, and returns true if any items were added.
|
||||
* This is a thread-safe operation.
|
||||
*/
|
||||
@Override
|
||||
public synchronized boolean addAll(@NotNull Collection<? extends SearchResultItem> items) {
|
||||
boolean itemsAdded = false;
|
||||
for (var item: items) {
|
||||
if (idsInSet.add(item.getDocumentId())) {
|
||||
backingList.add(item);
|
||||
itemsAdded = true;
|
||||
}
|
||||
}
|
||||
if (!itemsAdded) {
|
||||
return false;
|
||||
}
|
||||
itemsProcessed+=items.size();
|
||||
|
||||
backingList.sort(Comparator.naturalOrder());
|
||||
if (backingList.size() > limit) {
|
||||
backingList.subList(limit, backingList.size()).clear();
|
||||
for (var item : items) {
|
||||
if (idsInSet.add(item.getDocumentId())) {
|
||||
queue.add(item);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
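/* Illustrative usage sketch (not part of the diff), matching how IndexQueryExecution above drives
 * this class: the queue is sized once, fed concurrently via addAll(), and drained once at the end
 * via toList(). The variable names here are made up.
 *
 *   var heap = new ResultPriorityQueue(params.fetchSize);
 *   // ... from several evaluation threads:
 *   heap.addAll(rankingService.rankResults(budget, rankingContext, rankingData, false));
 *   // ... once all threads have finished:
 *   List<SearchResultItem> best = heap.toList();
 */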
|
||||
|
||||
@Override
|
||||
public boolean removeAll(@NotNull Collection<?> c) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean retainAll(@NotNull Collection<?> c) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void clear() {
|
||||
backingList.clear();
|
||||
idsInSet.clear();
|
||||
public synchronized List<SearchResultItem> toList() {
|
||||
return new ArrayList<>(queue);
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return backingList.size();
|
||||
return queue.size();
|
||||
}
|
||||
public int getItemsProcessed() {
|
||||
return itemsProcessed;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isEmpty() {
|
||||
return backingList.isEmpty();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean contains(Object o) {
|
||||
return backingList.contains(o);
|
||||
return queue.isEmpty();
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -5,14 +5,17 @@ import it.unimi.dsi.fastutil.longs.LongList;
|
||||
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
|
||||
import it.unimi.dsi.fastutil.longs.LongSet;
|
||||
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
|
||||
import nu.marginalia.array.page.LongQueryBuffer;
|
||||
import nu.marginalia.index.FullReverseIndexReader;
|
||||
import nu.marginalia.index.PrioReverseIndexReader;
|
||||
import nu.marginalia.index.forward.ForwardIndexReader;
|
||||
import nu.marginalia.index.forward.spans.DocumentSpans;
|
||||
import nu.marginalia.index.model.QueryParams;
|
||||
import nu.marginalia.index.model.SearchTerms;
|
||||
import nu.marginalia.index.positions.TermData;
|
||||
import nu.marginalia.index.query.IndexQuery;
|
||||
import nu.marginalia.index.query.IndexQueryBuilder;
|
||||
import nu.marginalia.index.query.IndexSearchBudget;
|
||||
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimitType;
|
||||
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
||||
@@ -25,6 +28,7 @@ import org.slf4j.LoggerFactory;
|
||||
import java.lang.foreign.Arena;
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
@@ -55,20 +59,19 @@ public class CombinedIndexReader {
|
||||
return new IndexQueryBuilderImpl(reverseIndexFullReader, query);
|
||||
}
|
||||
|
||||
public QueryFilterStepIf hasWordFull(long termId) {
|
||||
return reverseIndexFullReader.also(termId);
|
||||
public QueryFilterStepIf hasWordFull(long termId, IndexSearchBudget budget) {
|
||||
return reverseIndexFullReader.also(termId, budget);
|
||||
}
|
||||
|
||||
/** Creates a query builder for terms in the priority index */
|
||||
public IndexQueryBuilder findPriorityWord(long wordId) {
|
||||
return newQueryBuilder(new IndexQuery(reverseIndexPriorityReader.documents(wordId)))
|
||||
return newQueryBuilder(new IndexQuery(reverseIndexPriorityReader.documents(wordId), true))
|
||||
.withSourceTerms(wordId);
|
||||
}
|
||||
|
||||
/** Creates a query builder for terms in the full index */
|
||||
public IndexQueryBuilder findFullWord(long wordId) {
|
||||
return newQueryBuilder(
|
||||
new IndexQuery(reverseIndexFullReader.documents(wordId)))
|
||||
return newQueryBuilder(new IndexQuery(reverseIndexFullReader.documents(wordId), false))
|
||||
.withSourceTerms(wordId);
|
||||
}
|
||||
|
||||
@@ -82,7 +85,12 @@ public class CombinedIndexReader {
|
||||
return reverseIndexFullReader.numDocuments(word);
|
||||
}
|
||||
|
||||
public List<IndexQuery> createQueries(SearchTerms terms, QueryParams params) {
|
||||
/** Reset caches and buffers */
|
||||
public void reset() {
|
||||
reverseIndexFullReader.reset();
|
||||
}
|
||||
|
||||
public List<IndexQuery> createQueries(SearchTerms terms, QueryParams params, IndexSearchBudget budget) {
|
||||
|
||||
if (!isLoaded()) {
|
||||
logger.warn("Index reader not ready");
|
||||
@@ -123,7 +131,7 @@ public class CombinedIndexReader {
|
||||
continue;
|
||||
}
|
||||
|
||||
head.addInclusionFilter(hasWordFull(termId));
|
||||
head.addInclusionFilter(hasWordFull(termId, budget));
|
||||
}
|
||||
queryHeads.add(head);
|
||||
}
|
||||
@@ -132,7 +140,7 @@ public class CombinedIndexReader {
|
||||
if (paths.size() < 4) {
|
||||
var prioHead = findPriorityWord(elements.getLong(0));
|
||||
for (int i = 1; i < elements.size(); i++) {
|
||||
prioHead.addInclusionFilter(hasWordFull(elements.getLong(i)));
|
||||
prioHead.addInclusionFilter(hasWordFull(elements.getLong(i), budget));
|
||||
}
|
||||
queryHeads.add(prioHead);
|
||||
}
|
||||
@@ -143,11 +151,11 @@ public class CombinedIndexReader {
|
||||
|
||||
// Advice terms are a special case, mandatory but not ranked, and exempt from re-writing
|
||||
for (long term : terms.advice()) {
|
||||
query = query.also(term);
|
||||
query = query.also(term, budget);
|
||||
}
|
||||
|
||||
for (long term : terms.excludes()) {
|
||||
query = query.not(term);
|
||||
query = query.not(term, budget);
|
||||
}
|
||||
|
||||
// Run these filter steps last, as they'll worst-case cause as many page faults as there are
|
||||
@@ -178,6 +186,18 @@ public class CombinedIndexReader {
|
||||
}
|
||||
|
||||
/** Retrieves the term metadata for the specified word for the provided documents */
|
||||
public TermMetadataList[] getTermMetadata(Arena arena,
|
||||
long[] wordIds,
|
||||
CombinedDocIdList docIds)
|
||||
{
|
||||
TermData[] combinedTermData = reverseIndexFullReader.getTermData(arena, wordIds, docIds.array());
|
||||
TermMetadataList[] ret = new TermMetadataList[wordIds.length];
|
||||
for (int i = 0; i < wordIds.length; i++) {
|
||||
ret[i] = new TermMetadataList(Arrays.copyOfRange(combinedTermData, i*docIds.size(), (i+1)*docIds.size()));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
public TermMetadataList getTermMetadata(Arena arena,
|
||||
long wordId,
|
||||
CombinedDocIdList docIds)
|
||||
@@ -205,14 +225,19 @@ public class CombinedIndexReader {
|
||||
return forwardIndexReader.getDocumentSize(docId);
|
||||
}
|
||||
|
||||
/** Retrieves the document spans for the specified document */
|
||||
public DocumentSpans getDocumentSpans(Arena arena, long docId) {
|
||||
return forwardIndexReader.getDocumentSpans(arena, docId);
|
||||
/** Retrieves the document spans for the specified documents */
|
||||
public DocumentSpans[] getDocumentSpans(Arena arena, CombinedDocIdList docIds) {
|
||||
long[] decodedIDs = docIds.array();
|
||||
for (int i = 0; i < decodedIDs.length; i++) {
|
||||
decodedIDs[i] = UrlIdCodec.removeRank(decodedIDs[i]);
|
||||
}
|
||||
|
||||
return forwardIndexReader.getDocumentSpans(arena, decodedIDs);
|
||||
}
|
||||
|
||||
/** Close the indexes (this is not done immediately)
|
||||
* */
|
||||
public void close() throws InterruptedException {
|
||||
public void close() {
|
||||
/* Delay the invocation of close method to allow for a clean shutdown of the service.
|
||||
*
|
||||
* This is especially important when using Unsafe-based LongArrays, since we have
|
||||
@@ -227,7 +252,7 @@ public class CombinedIndexReader {
|
||||
}
|
||||
|
||||
|
||||
private void delayedCall(Runnable call, Duration delay) throws InterruptedException {
|
||||
private void delayedCall(Runnable call, Duration delay) {
|
||||
Thread.ofPlatform().start(() -> {
|
||||
try {
|
||||
TimeUnit.SECONDS.sleep(delay.toSeconds());
|
||||
@@ -248,25 +273,47 @@ public class CombinedIndexReader {
|
||||
class ParamMatchingQueryFilter implements QueryFilterStepIf {
|
||||
private final QueryParams params;
|
||||
private final ForwardIndexReader forwardIndexReader;
|
||||
|
||||
private final boolean imposesMetaConstraint;
|
||||
public ParamMatchingQueryFilter(QueryParams params,
|
||||
ForwardIndexReader forwardIndexReader)
|
||||
{
|
||||
this.params = params;
|
||||
this.forwardIndexReader = forwardIndexReader;
|
||||
this.imposesMetaConstraint = params.imposesDomainMetadataConstraint();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void apply(LongQueryBuffer buffer) {
|
||||
if (!imposesMetaConstraint && !params.searchSet().imposesConstraint()) {
|
||||
return;
|
||||
}
|
||||
|
||||
while (buffer.hasMore()) {
|
||||
if (test(buffer.currentValue())) {
|
||||
buffer.retainAndAdvance();
|
||||
}
|
||||
else {
|
||||
buffer.rejectAndAdvance();
|
||||
}
|
||||
}
|
||||
|
||||
buffer.finalizeFiltering();
|
||||
}
|
||||
|
||||
public boolean test(long combinedId) {
|
||||
long docId = UrlIdCodec.removeRank(combinedId);
|
||||
int domainId = UrlIdCodec.getDomainId(docId);
|
||||
|
||||
long meta = forwardIndexReader.getDocMeta(docId);
|
||||
|
||||
if (!validateDomain(domainId, meta)) {
|
||||
if (!validateDomain(domainId)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!imposesMetaConstraint) {
|
||||
return true;
|
||||
}
|
||||
|
||||
long meta = forwardIndexReader.getDocMeta(docId);
|
||||
|
||||
if (!validateQuality(meta)) {
|
||||
return false;
|
||||
}
|
||||
@@ -286,8 +333,8 @@ class ParamMatchingQueryFilter implements QueryFilterStepIf {
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean validateDomain(int domainId, long meta) {
|
||||
return params.searchSet().contains(domainId, meta);
|
||||
private boolean validateDomain(int domainId) {
|
||||
return params.searchSet().contains(domainId);
|
||||
}
|
||||
|
||||
private boolean validateQuality(long meta) {
|
||||
@@ -338,4 +385,5 @@ class ParamMatchingQueryFilter implements QueryFilterStepIf {
|
||||
public String describe() {
|
||||
return getClass().getSimpleName();
|
||||
}
|
||||
|
||||
}
|
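Threading the IndexSearchBudget through createQueries() and the per-term filter steps means the evaluation loop can stop cleanly when time runs out instead of grinding through every filter. A rough sketch of a driver loop under the signatures in this diff; note that IndexQuery.hasMore()/getMoreResults() and the LongQueryBuffer size constructor are assumptions for illustration, not shown in this changeset:

    // Illustrative only: drives the budget-aware query API shown above.
    IndexSearchBudget budget = new IndexSearchBudget(limits.getTimeoutMs());
    LongQueryBuffer buffer = new LongQueryBuffer(512);            // assumed constructor
    for (IndexQuery query : index.createQueries(terms, params, budget)) {
        while (budget.hasTimeLeft() && query.hasMore()) {         // assumed hasMore()
            query.getMoreResults(buffer);                         // assumed getMoreResults()
            // ... collect buffer contents into a CombinedDocIdList ...
        }
    }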
@@ -1,11 +1,10 @@
|
||||
package nu.marginalia.index.index;
|
||||
|
||||
import java.util.List;
|
||||
import gnu.trove.set.hash.TLongHashSet;
|
||||
import nu.marginalia.index.FullReverseIndexReader;
|
||||
import nu.marginalia.index.query.IndexQuery;
|
||||
import nu.marginalia.index.query.IndexQueryBuilder;
|
||||
import nu.marginalia.index.query.filter.QueryFilterAnyOf;
|
||||
import nu.marginalia.index.query.IndexSearchBudget;
|
||||
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
||||
|
||||
public class IndexQueryBuilderImpl implements IndexQueryBuilder {
|
||||
@@ -32,18 +31,18 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder {
|
||||
return this;
|
||||
}
|
||||
|
||||
public IndexQueryBuilder also(long termId) {
|
||||
public IndexQueryBuilder also(long termId, IndexSearchBudget budget) {
|
||||
|
||||
if (alreadyConsideredTerms.add(termId)) {
|
||||
query.addInclusionFilter(reverseIndexFullReader.also(termId));
|
||||
query.addInclusionFilter(reverseIndexFullReader.also(termId, budget));
|
||||
}
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
public IndexQueryBuilder not(long termId) {
|
||||
public IndexQueryBuilder not(long termId, IndexSearchBudget budget) {
|
||||
|
||||
query.addInclusionFilter(reverseIndexFullReader.not(termId));
|
||||
query.addInclusionFilter(reverseIndexFullReader.not(termId, budget));
|
||||
|
||||
return this;
|
||||
}
|
||||
@@ -55,20 +54,6 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder {
|
||||
return this;
|
||||
}
|
||||
|
||||
public IndexQueryBuilder addInclusionFilterAny(List<QueryFilterStepIf> filterSteps) {
|
||||
if (filterSteps.isEmpty())
|
||||
return this;
|
||||
|
||||
if (filterSteps.size() == 1) {
|
||||
query.addInclusionFilter(filterSteps.getFirst());
|
||||
}
|
||||
else {
|
||||
query.addInclusionFilter(new QueryFilterAnyOf(filterSteps));
|
||||
}
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
public IndexQuery build() {
|
||||
return query;
|
||||
}
|
||||
|
@@ -35,6 +35,13 @@ public class StatefulIndex {
        this.eventLog = eventLog;
    }

    /** For use in testing only */
    public StatefulIndex(CombinedIndexReader combinedIndexReader) {
        this.combinedIndexReader = combinedIndexReader;
        this.servicesFactory = null;
        this.eventLog = null;
    }

    public void init() {
        Lock lock = indexReplacementLock.writeLock();

@@ -1,8 +1,9 @@
package nu.marginalia.index.model;

import nu.marginalia.index.searchset.SearchSet;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.index.searchset.SearchSet;

import java.util.Objects;

@@ -41,6 +42,13 @@ public final class QueryParams {
        this.queryStrategy = queryStrategy;
    }

    public boolean imposesDomainMetadataConstraint() {
        return qualityLimit.type() != SpecificationLimitType.NONE
                || year.type() != SpecificationLimitType.NONE
                || size.type() != SpecificationLimitType.NONE
                || rank.type() != SpecificationLimitType.NONE;
    }

    public SpecificationLimit qualityLimit() {
        return qualityLimit;
    }
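The new imposesDomainMetadataConstraint() predicate is what lets ParamMatchingQueryFilter.test(), shown earlier in this diff, skip the forwardIndexReader.getDocMeta() lookup for queries that set no quality/year/size/rank limit; only the cheap searchSet().contains(domainId) check runs in that case. A condensed sketch of that control flow, using only signatures visible in this changeset:

    public boolean test(long combinedId) {
        long docId = UrlIdCodec.removeRank(combinedId);
        int domainId = UrlIdCodec.getDomainId(docId);

        if (!params.searchSet().contains(domainId))
            return false;                              // cheap check, no forward index access
        if (!params.imposesDomainMetadataConstraint())
            return true;                               // unconstrained query: accept without getDocMeta()

        long meta = forwardIndexReader.getDocMeta(docId);
        return validateQuality(meta) /* ... remaining year/size/rank checks ... */;
    }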
@@ -0,0 +1,106 @@
|
||||
package nu.marginalia.index.model;
|
||||
|
||||
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.index.index.CombinedIndexReader;
|
||||
|
||||
import java.util.BitSet;
|
||||
|
||||
public class ResultRankingContext {
|
||||
private final int docCount;
|
||||
public final RpcResultRankingParameters params;
|
||||
public final SearchQuery searchQuery;
|
||||
public final QueryParams queryParams;
|
||||
|
||||
public final CompiledQuery<String> compiledQuery;
|
||||
public final CompiledQueryLong compiledQueryIds;
|
||||
|
||||
public final BitSet regularMask;
|
||||
public final BitSet ngramsMask;
|
||||
|
||||
/** CqDataInt associated with frequency information of the terms in the query
|
||||
* in the full index. The dataset is indexed by the compiled query. */
|
||||
public final CqDataInt fullCounts;
|
||||
|
||||
/** CqDataInt associated with frequency information of the terms in the query
|
||||
* in the full index. The dataset is indexed by the compiled query. */
|
||||
public final CqDataInt priorityCounts;
|
||||
|
||||
public static ResultRankingContext create(CombinedIndexReader currentIndex, SearchParameters searchParameters) {
|
||||
|
||||
var compiledQueryIds = searchParameters.compiledQueryIds;
|
||||
var compiledQuery = searchParameters.compiledQuery;
|
||||
|
||||
int[] full = new int[compiledQueryIds.size()];
|
||||
int[] prio = new int[compiledQueryIds.size()];
|
||||
|
||||
BitSet ngramsMask = new BitSet(compiledQuery.size());
|
||||
BitSet regularMask = new BitSet(compiledQuery.size());
|
||||
|
||||
for (int idx = 0; idx < compiledQueryIds.size(); idx++) {
|
||||
long id = compiledQueryIds.at(idx);
|
||||
full[idx] = currentIndex.numHits(id);
|
||||
prio[idx] = currentIndex.numHitsPrio(id);
|
||||
|
||||
if (compiledQuery.at(idx).contains("_")) {
|
||||
ngramsMask.set(idx);
|
||||
}
|
||||
else {
|
||||
regularMask.set(idx);
|
||||
}
|
||||
}
|
||||
|
||||
return new ResultRankingContext(currentIndex.totalDocCount(),
|
||||
searchParameters,
|
||||
compiledQuery,
|
||||
compiledQueryIds,
|
||||
ngramsMask,
|
||||
regularMask,
|
||||
new CqDataInt(full),
|
||||
new CqDataInt(prio));
|
||||
}
|
||||
|
||||
public ResultRankingContext(int docCount,
|
||||
SearchParameters searchParameters,
|
||||
CompiledQuery<String> compiledQuery,
|
||||
CompiledQueryLong compiledQueryIds,
|
||||
BitSet ngramsMask,
|
||||
BitSet regularMask,
|
||||
CqDataInt fullCounts,
|
||||
CqDataInt prioCounts)
|
||||
{
|
||||
this.docCount = docCount;
|
||||
|
||||
this.searchQuery = searchParameters.query;
|
||||
this.params = searchParameters.rankingParams;
|
||||
this.queryParams = searchParameters.queryParams;
|
||||
|
||||
this.compiledQuery = compiledQuery;
|
||||
this.compiledQueryIds = compiledQueryIds;
|
||||
|
||||
this.ngramsMask = ngramsMask;
|
||||
this.regularMask = regularMask;
|
||||
|
||||
this.fullCounts = fullCounts;
|
||||
this.priorityCounts = prioCounts;
|
||||
}
|
||||
|
||||
public int termFreqDocCount() {
|
||||
return docCount;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "ResultRankingContext{" +
|
||||
"docCount=" + docCount +
|
||||
", params=" + params +
|
||||
", regularMask=" + regularMask +
|
||||
", ngramsMask=" + ngramsMask +
|
||||
", fullCounts=" + fullCounts +
|
||||
", priorityCounts=" + priorityCounts +
|
||||
'}';
|
||||
}
|
||||
}
|
@@ -43,7 +43,7 @@ public class SearchParameters {
        var limits = specsSet.queryLimits;

        this.fetchSize = limits.getFetchSize();
        this.budget = new IndexSearchBudget(limits.getTimeoutMs());
        this.budget = new IndexSearchBudget(Math.max(limits.getTimeoutMs()/2, limits.getTimeoutMs()-50));
        this.query = specsSet.query;
        this.limitByDomain = limits.getResultsByDomain();
        this.limitTotal = limits.getResultsTotal();
@@ -67,9 +67,7 @@

        this.fetchSize = limits.getFetchSize();

        // The time budget is halved because this is the point when we start to
        // wrap up the search and return the results.
        this.budget = new IndexSearchBudget(limits.getTimeoutMs() / 2);
        this.budget = new IndexSearchBudget(Math.max(limits.getTimeoutMs()/2, limits.getTimeoutMs()-50));
        this.query = IndexProtobufCodec.convertRpcQuery(request.getQuery());

        this.limitByDomain = limits.getResultsByDomain();
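The revised budget formula keeps the old halving behaviour only for very small timeouts and otherwise reserves a roughly fixed ~50 ms for wrapping up and returning results. A small worked illustration of the arithmetic:

    // Illustrative: effective budget for a few request timeouts
    long budgetFor(long timeoutMs) { return Math.max(timeoutMs / 2, timeoutMs - 50); }
    // budgetFor(100)  == 50   (same as the old timeoutMs / 2)
    // budgetFor(200)  == 150  (old formula would have given 100)
    // budgetFor(1000) == 950  (old formula would have given 500)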
@@ -2,7 +2,7 @@ package nu.marginalia.index.results;

import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.index.model.ResultRankingContext;

import java.util.BitSet;
import java.util.List;
@@ -12,13 +12,14 @@ import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
|
||||
import nu.marginalia.index.forward.spans.DocumentSpans;
|
||||
import nu.marginalia.index.index.CombinedIndexReader;
|
||||
import nu.marginalia.index.index.StatefulIndex;
|
||||
import nu.marginalia.index.model.SearchParameters;
|
||||
import nu.marginalia.index.model.ResultRankingContext;
|
||||
import nu.marginalia.index.model.SearchTermsUtil;
|
||||
import nu.marginalia.index.query.IndexSearchBudget;
|
||||
import nu.marginalia.index.results.model.PhraseConstraintGroupList;
|
||||
import nu.marginalia.index.results.model.QuerySearchTerms;
|
||||
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
||||
@@ -30,9 +31,15 @@ import nu.marginalia.sequence.CodedSequence;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.lang.foreign.Arena;
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
@Singleton
|
||||
public class IndexResultRankingService {
|
||||
@@ -52,96 +59,142 @@ public class IndexResultRankingService {
|
||||
this.domainRankingOverrides = domainRankingOverrides;
|
||||
}
|
||||
|
||||
public List<SearchResultItem> rankResults(SearchParameters params,
|
||||
boolean exportDebugData,
|
||||
ResultRankingContext rankingContext,
|
||||
CombinedDocIdList resultIds)
|
||||
{
|
||||
if (resultIds.isEmpty())
|
||||
return List.of();
|
||||
public RankingData prepareRankingData(ResultRankingContext rankingContext, CombinedDocIdList resultIds, @Nullable IndexSearchBudget budget) throws TimeoutException {
|
||||
return new RankingData(rankingContext, resultIds, budget);
|
||||
}
|
||||
|
||||
IndexResultScoreCalculator resultRanker = new IndexResultScoreCalculator(statefulIndex, domainRankingOverrides, rankingContext, params);
|
||||
public final class RankingData implements AutoCloseable {
|
||||
final Arena arena;
|
||||
|
||||
List<SearchResultItem> results = new ArrayList<>(resultIds.size());
|
||||
private final TermMetadataList[] termsForDocs;
|
||||
private final DocumentSpans[] documentSpans;
|
||||
private final long[] flags;
|
||||
private final CodedSequence[] positions;
|
||||
private final CombinedDocIdList resultIds;
|
||||
private final QuerySearchTerms searchTerms;
|
||||
private AtomicBoolean closed = new AtomicBoolean(false);
|
||||
int pos = -1;
|
||||
|
||||
// Get the current index reader, which is the one we'll use for this calculation,
|
||||
// this may change during the calculation, but we don't want to switch over mid-calculation
|
||||
final CombinedIndexReader currentIndex = statefulIndex.get();
|
||||
public RankingData(ResultRankingContext rankingContext, CombinedDocIdList resultIds, @Nullable IndexSearchBudget budget) throws TimeoutException {
|
||||
this.resultIds = resultIds;
|
||||
this.arena = Arena.ofShared();
|
||||
|
||||
final QuerySearchTerms searchTerms = getSearchTerms(params.compiledQuery, params.query);
|
||||
final int termCount = searchTerms.termIdsAll.size();
|
||||
this.searchTerms = getSearchTerms(rankingContext.compiledQuery, rankingContext.searchQuery);
|
||||
final int termCount = searchTerms.termIdsAll.size();
|
||||
|
||||
// We use an arena for the position data to avoid gc pressure
|
||||
// from the gamma coded sequences, which can be large and have a lifetime
|
||||
// that matches the try block here
|
||||
try (var arena = Arena.ofConfined()) {
|
||||
this.flags = new long[termCount];
|
||||
this.positions = new CodedSequence[termCount];
|
||||
|
||||
TermMetadataList[] termsForDocs = new TermMetadataList[termCount];
|
||||
for (int ti = 0; ti < termCount; ti++) {
|
||||
termsForDocs[ti] = currentIndex.getTermMetadata(arena, searchTerms.termIdsAll.at(ti), resultIds);
|
||||
}
|
||||
// Get the current index reader, which is the one we'll use for this calculation,
|
||||
// this may change during the calculation, but we don't want to switch over mid-calculation
|
||||
|
||||
// Data for the document. We arrange this in arrays outside the calculation function to avoid
|
||||
// hash lookups in the inner loop, as it's hot code, and we don't want unnecessary cpu cache
|
||||
// thrashing in there; out here we can rely on implicit array ordering to match up the data.
|
||||
final CombinedIndexReader currentIndex = statefulIndex.get();
|
||||
|
||||
long[] flags = new long[termCount];
|
||||
CodedSequence[] positions = new CodedSequence[termCount];
|
||||
// Perform expensive I/O operations
|
||||
|
||||
// Iterate over documents by their index in the combinedDocIds, as we need the index for the
|
||||
// term data arrays as well
|
||||
this.termsForDocs = currentIndex.getTermMetadata(arena, searchTerms.termIdsAll.array, resultIds);
|
||||
if (!budget.hasTimeLeft())
|
||||
throw new TimeoutException();
|
||||
this.documentSpans = currentIndex.getDocumentSpans(arena, resultIds);
|
||||
}
|
||||
|
||||
for (int i = 0; i < resultIds.size(); i++) {
|
||||
public CodedSequence[] positions() {
|
||||
return positions;
|
||||
}
|
||||
public long[] flags() {
|
||||
return flags;
|
||||
}
|
||||
public long resultId() {
|
||||
return resultIds.at(pos);
|
||||
}
|
||||
public DocumentSpans documentSpans() {
|
||||
return documentSpans[pos];
|
||||
}
|
||||
|
||||
// Prepare term-level data for the document
|
||||
public boolean next() {
|
||||
if (++pos < resultIds.size()) {
|
||||
for (int ti = 0; ti < flags.length; ti++) {
|
||||
var tfd = termsForDocs[ti];
|
||||
|
||||
assert tfd != null : "No term data for term " + ti;
|
||||
|
||||
flags[ti] = tfd.flag(i);
|
||||
positions[ti] = tfd.position(i);
|
||||
flags[ti] = tfd.flag(pos);
|
||||
positions[ti] = tfd.position(pos);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Ignore documents that don't match the mandatory constraints
|
||||
if (!searchTerms.phraseConstraints.testMandatory(positions)) {
|
||||
continue;
|
||||
}
|
||||
public int size() {
|
||||
return resultIds.size();
|
||||
}
|
||||
|
||||
if (!exportDebugData) {
|
||||
var score = resultRanker.calculateScore(arena, null, resultIds.at(i), searchTerms, flags, positions);
|
||||
if (score != null) {
|
||||
results.add(score);
|
||||
}
|
||||
}
|
||||
else {
|
||||
var rankingFactors = new DebugRankingFactors();
|
||||
var score = resultRanker.calculateScore(arena, rankingFactors, resultIds.at(i), searchTerms, flags, positions);
|
||||
if (score != null) {
|
||||
score.debugRankingFactors = rankingFactors;
|
||||
results.add(score);
|
||||
}
|
||||
}
|
||||
public void close() {
|
||||
if (closed.compareAndSet(false, true)) {
|
||||
arena.close();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public List<SearchResultItem> rankResults(
|
||||
IndexSearchBudget budget,
|
||||
ResultRankingContext rankingContext,
|
||||
RankingData rankingData,
|
||||
boolean exportDebugData)
|
||||
{
|
||||
IndexResultScoreCalculator resultRanker = new IndexResultScoreCalculator(statefulIndex, domainRankingOverrides, rankingContext);
|
||||
|
||||
List<SearchResultItem> results = new ArrayList<>(rankingData.size());
|
||||
|
||||
// Iterate over documents by their index in the combinedDocIds, as we need the index for the
|
||||
// term data arrays as well
|
||||
|
||||
var searchTerms = rankingData.searchTerms;
|
||||
|
||||
while (rankingData.next() && budget.hasTimeLeft()) {
|
||||
|
||||
// Ignore documents that don't match the mandatory constraints
|
||||
if (!searchTerms.phraseConstraints.testMandatory(rankingData.positions())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
return results;
|
||||
if (!exportDebugData) {
|
||||
var score = resultRanker.calculateScore(null, rankingData.resultId(), searchTerms, rankingData.flags(), rankingData.positions(), rankingData.documentSpans());
|
||||
if (score != null) {
|
||||
results.add(score);
|
||||
}
|
||||
}
|
||||
else {
|
||||
var rankingFactors = new DebugRankingFactors();
|
||||
var score = resultRanker.calculateScore( rankingFactors, rankingData.resultId(), searchTerms, rankingData.flags(), rankingData.positions(), rankingData.documentSpans());
|
||||
|
||||
if (score != null) {
|
||||
score.debugRankingFactors = rankingFactors;
|
||||
results.add(score);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
|
||||
public List<RpcDecoratedResultItem> selectBestResults(SearchParameters params,
|
||||
public List<RpcDecoratedResultItem> selectBestResults(int limitByDomain,
|
||||
int limitTotal,
|
||||
ResultRankingContext resultRankingContext,
|
||||
Collection<SearchResultItem> results) throws SQLException {
|
||||
List<SearchResultItem> results) throws SQLException {
|
||||
|
||||
var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain);
|
||||
var domainCountFilter = new IndexResultDomainDeduplicator(limitByDomain);
|
||||
|
||||
List<SearchResultItem> resultsList = new ArrayList<>(results.size());
|
||||
TLongList idsList = new TLongArrayList(params.limitTotal);
|
||||
TLongList idsList = new TLongArrayList(limitTotal);
|
||||
|
||||
for (var item : results) {
|
||||
if (domainCountFilter.test(item)) {
|
||||
|
||||
if (resultsList.size() < params.limitTotal) {
|
||||
if (resultsList.size() < limitTotal) {
|
||||
resultsList.add(item);
|
||||
idsList.add(item.getDocumentId());
|
||||
}
|
||||
@@ -159,19 +212,26 @@ public class IndexResultRankingService {
|
||||
// for the selected results, as this would be comically expensive to do for all the results we
|
||||
// discard along the way
|
||||
|
||||
if (params.rankingParams.getExportDebugData()) {
|
||||
if (resultRankingContext.params.getExportDebugData()) {
|
||||
var combinedIdsList = new LongArrayList(resultsList.size());
|
||||
for (var item : resultsList) {
|
||||
combinedIdsList.add(item.combinedId);
|
||||
}
|
||||
|
||||
resultsList.clear();
|
||||
resultsList.addAll(this.rankResults(
|
||||
params,
|
||||
true,
|
||||
resultRankingContext,
|
||||
new CombinedDocIdList(combinedIdsList))
|
||||
);
|
||||
IndexSearchBudget budget = new IndexSearchBudget(10000);
|
||||
try (var data = prepareRankingData(resultRankingContext, new CombinedDocIdList(combinedIdsList), null)) {
|
||||
resultsList.addAll(this.rankResults(
|
||||
budget,
|
||||
resultRankingContext,
|
||||
data,
|
||||
true)
|
||||
);
|
||||
}
|
||||
catch (TimeoutException ex) {
|
||||
// this won't happen since we passed null for budget
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Fetch the document details for the selected results in one go, from the local document database
|
||||
@@ -247,7 +307,7 @@ public class IndexResultRankingService {
|
||||
|
||||
var termOutputs = RpcResultTermRankingOutputs.newBuilder();
|
||||
|
||||
CqDataLong termIds = params.compiledQueryIds.data;;
|
||||
CqDataLong termIds = resultRankingContext.compiledQueryIds.data;
|
||||
|
||||
for (var entry : debugFactors.getTermFactors()) {
|
||||
String term = "[ERROR IN LOOKUP]";
|
||||
@@ -255,7 +315,7 @@ public class IndexResultRankingService {
|
||||
// CURSED: This is a linear search, but the number of terms is small, and it's in a debug path
|
||||
for (int i = 0; i < termIds.size(); i++) {
|
||||
if (termIds.get(i) == entry.termId()) {
|
||||
term = params.compiledQuery.at(i);
|
||||
term = resultRankingContext.compiledQuery.at(i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
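With the expensive I/O pulled out into the AutoCloseable RankingData, callers now split ranking into a prepare step and a scoring step. A sketch of the intended call pattern, mirroring the debug path above; rankingService stands in for an injected IndexResultRankingService instance and is an assumed variable name:

    // Illustrative call pattern for the new two-phase API
    ResultRankingContext ctx = ResultRankingContext.create(statefulIndex.get(), searchParameters);
    try (var data = rankingService.prepareRankingData(ctx, resultIds, budget)) {
        List<SearchResultItem> ranked = rankingService.rankResults(budget, ctx, data, false);
        // ... pass 'ranked' on to selectBestResults(...) ...
    }
    catch (TimeoutException ex) {
        // the search budget ran out while fetching term metadata / document spans
    }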
@@ -6,14 +6,13 @@ import nu.marginalia.api.searchquery.RpcResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.RpcTemporalBias;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
|
||||
import nu.marginalia.index.forward.spans.DocumentSpans;
|
||||
import nu.marginalia.index.index.CombinedIndexReader;
|
||||
import nu.marginalia.index.index.StatefulIndex;
|
||||
import nu.marginalia.index.model.QueryParams;
|
||||
import nu.marginalia.index.model.SearchParameters;
|
||||
import nu.marginalia.index.model.ResultRankingContext;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.results.model.PhraseConstraintGroupList;
|
||||
import nu.marginalia.index.results.model.QuerySearchTerms;
|
||||
@@ -28,7 +27,6 @@ import nu.marginalia.sequence.CodedSequence;
|
||||
import nu.marginalia.sequence.SequenceOperations;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.lang.foreign.Arena;
|
||||
import java.util.BitSet;
|
||||
|
||||
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.booleanAggregate;
|
||||
@@ -47,24 +45,23 @@ public class IndexResultScoreCalculator {
|
||||
|
||||
public IndexResultScoreCalculator(StatefulIndex statefulIndex,
|
||||
DomainRankingOverrides domainRankingOverrides,
|
||||
ResultRankingContext rankingContext,
|
||||
SearchParameters params)
|
||||
ResultRankingContext rankingContext)
|
||||
{
|
||||
this.index = statefulIndex.get();
|
||||
this.domainRankingOverrides = domainRankingOverrides;
|
||||
this.rankingContext = rankingContext;
|
||||
|
||||
this.queryParams = params.queryParams;
|
||||
this.compiledQuery = params.compiledQuery;
|
||||
this.queryParams = rankingContext.queryParams;
|
||||
this.compiledQuery = rankingContext.compiledQuery;
|
||||
}
|
||||
|
||||
@Nullable
|
||||
public SearchResultItem calculateScore(Arena arena,
|
||||
@Nullable DebugRankingFactors debugRankingFactors,
|
||||
public SearchResultItem calculateScore(@Nullable DebugRankingFactors debugRankingFactors,
|
||||
long combinedId,
|
||||
QuerySearchTerms searchTerms,
|
||||
long[] wordFlags,
|
||||
CodedSequence[] positions)
|
||||
CodedSequence[] positions,
|
||||
DocumentSpans spans)
|
||||
{
|
||||
|
||||
CompiledQuery<CodedSequence> positionsQuery = compiledQuery.root.newQuery(positions);
|
||||
@@ -92,8 +89,6 @@ public class IndexResultScoreCalculator {
|
||||
int docSize = index.getDocumentSize(docId);
|
||||
if (docSize <= 0) docSize = 5000;
|
||||
|
||||
DocumentSpans spans = index.getDocumentSpans(arena, docId);
|
||||
|
||||
if (debugRankingFactors != null) {
|
||||
debugRankingFactors.addDocumentFactor("doc.docId", Long.toString(combinedId));
|
||||
debugRankingFactors.addDocumentFactor("doc.combinedId", Long.toString(docId));
|
||||
@@ -235,7 +230,7 @@ public class IndexResultScoreCalculator {
|
||||
long result = 0;
|
||||
int bit = 0;
|
||||
|
||||
IntIterator intersection = phraseConstraints.getFullGroup().findIntersections(positions).intIterator();
|
||||
IntIterator intersection = phraseConstraints.getFullGroup().findIntersections(positions, 64).intIterator();
|
||||
|
||||
while (intersection.hasNext() && bit < 64) {
|
||||
bit = (int) (Math.sqrt(intersection.nextInt()));
|
||||
@@ -551,9 +546,18 @@ public class IndexResultScoreCalculator {
|
||||
largeSiteFactor = 2;
|
||||
}
|
||||
|
||||
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING_ADTECH.getFeatureBit()))
|
||||
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.ADVERTISEMENT.getFeatureBit()))
|
||||
penalty += 7.5 * largeSiteFactor;
|
||||
|
||||
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.CONSENT.getFeatureBit()))
|
||||
penalty += 2.5 * largeSiteFactor;
|
||||
|
||||
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.POPOVER.getFeatureBit()))
|
||||
penalty += 2.5 * largeSiteFactor;
|
||||
|
||||
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING_ADTECH.getFeatureBit()))
|
||||
penalty += 5.0 * largeSiteFactor;
|
||||
|
||||
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.AFFILIATE_LINK.getFeatureBit()))
|
||||
penalty += 5.0 * largeSiteFactor;
|
||||
|
||||
@@ -563,6 +567,9 @@ public class IndexResultScoreCalculator {
|
||||
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING.getFeatureBit()))
|
||||
penalty += 2.5 * largeSiteFactor;
|
||||
|
||||
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.SHORT_DOCUMENT.getFeatureBit()))
|
||||
penalty += 2.5 * largeSiteFactor;
|
||||
|
||||
if (isForum || isWiki) {
|
||||
penalty = Math.min(0, penalty - 2);
|
||||
}
|
||||
|
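The expanded penalty table above is additive and scaled by largeSiteFactor. As a worked example using the constants in this diff: a document on a large site (largeSiteFactor = 2) flagged both ADVERTISEMENT and CONSENT accrues (7.5 + 2.5) * 2 = 20 penalty points, while the same flags on a small site (factor 1) accrue 10; forum and wiki documents then pass through the Math.min(0, penalty - 2) clamp shown at the end of the block.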
@@ -3,7 +3,7 @@ package nu.marginalia.index.results;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.index.model.ResultRankingContext;
import nu.marginalia.model.idx.WordFlags;

import java.util.List;
@@ -58,6 +58,7 @@ public class PhraseConstraintGroupList {
|
||||
private final int[] offsets;
|
||||
private final BitSet present;
|
||||
private final BitSet termIdsMask;
|
||||
private final int presentCardinality;
|
||||
|
||||
public final int size;
|
||||
public PhraseConstraintGroup(List<String> terms, TermIdList termIdsAll) {
|
||||
@@ -85,6 +86,8 @@ public class PhraseConstraintGroupList {
|
||||
termIdsMask.set(idx);
|
||||
}
|
||||
}
|
||||
|
||||
presentCardinality = present.cardinality();
|
||||
}
|
||||
|
||||
/** Returns true if the term with index termIdx in the query is in the group */
|
||||
@@ -93,7 +96,7 @@ public class PhraseConstraintGroupList {
|
||||
}
|
||||
|
||||
public boolean test(CodedSequence[] positions) {
|
||||
IntIterator[] sequences = new IntIterator[present.cardinality()];
|
||||
IntIterator[] sequences = new IntIterator[presentCardinality];
|
||||
|
||||
for (int oi = 0, si = 0; oi < offsets.length; oi++) {
|
||||
if (!present.get(oi)) {
|
||||
@@ -120,7 +123,7 @@ public class PhraseConstraintGroupList {
|
||||
|
||||
|
||||
public IntList findIntersections(IntList[] positions) {
|
||||
IntList[] sequences = new IntList[present.cardinality()];
|
||||
IntList[] sequences = new IntList[presentCardinality];
|
||||
int[] iterOffsets = new int[sequences.length];
|
||||
|
||||
for (int oi = 0, si = 0; oi < offsets.length; oi++) {
|
||||
@@ -144,12 +147,41 @@ public class PhraseConstraintGroupList {
|
||||
iterOffsets[si - 1] = -oi;
|
||||
}
|
||||
|
||||
return SequenceOperations.findIntersections(sequences, iterOffsets);
|
||||
return SequenceOperations.findIntersections(sequences, iterOffsets, Integer.MAX_VALUE);
|
||||
}
|
||||
|
||||
|
||||
public IntList findIntersections(IntList[] positions, int n) {
|
||||
IntList[] sequences = new IntList[presentCardinality];
|
||||
int[] iterOffsets = new int[sequences.length];
|
||||
|
||||
for (int oi = 0, si = 0; oi < offsets.length; oi++) {
|
||||
if (!present.get(oi)) {
|
||||
continue;
|
||||
}
|
||||
int offset = offsets[oi];
|
||||
if (offset < 0)
|
||||
return IntList.of();
|
||||
|
||||
// Create iterators that are offset by their relative position in the
|
||||
// sequence. This is done by subtracting the index from the offset,
|
||||
// so that when we intersect them, an overlap means that the terms are
|
||||
// in the correct order. Note the offset is negative!
|
||||
|
||||
var posForTerm = positions[offset];
|
||||
if (posForTerm == null) {
|
||||
return IntList.of();
|
||||
}
|
||||
sequences[si++] = posForTerm;
|
||||
iterOffsets[si - 1] = -oi;
|
||||
}
|
||||
|
||||
return SequenceOperations.findIntersections(sequences, iterOffsets, n);
|
||||
}
|
||||
|
||||
public int minDistance(IntList[] positions) {
|
||||
List<IntList> sequences = new ArrayList<>(present.cardinality());
|
||||
IntList iterOffsets = new IntArrayList(present.cardinality());
|
||||
List<IntList> sequences = new ArrayList<>(presentCardinality);
|
||||
IntList iterOffsets = new IntArrayList(presentCardinality);
|
||||
|
||||
for (int oi = 0; oi < offsets.length; oi++) {
|
||||
if (!present.get(oi)) {
|
||||
|
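The offset trick described in the comments above can be illustrated with a small worked example. For the phrase constraint "foo bar", suppose foo occurs at positions [3, 9] and bar at [4, 17] in a document: shifting bar's list by its relative position (-1) gives [3, 16], and intersecting the shifted lists yields {3}, meaning the two terms occur adjacent and in order starting at position 3. The new findIntersections(positions, n) overload stops once n intersections have been found (the caller above passes 64), bounding the work done per document.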
@@ -1,7 +1,7 @@
|
||||
package nu.marginalia.index.results.model.ids;
|
||||
|
||||
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
||||
import org.roaringbitmap.longlong.Roaring64Bitmap;
|
||||
import nu.marginalia.array.page.LongQueryBuffer;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.stream.LongStream;
|
||||
@@ -17,17 +17,23 @@ public final class CombinedDocIdList {
|
||||
public CombinedDocIdList(long... data) {
|
||||
this.data = Arrays.copyOf(data, data.length);
|
||||
}
|
||||
|
||||
public CombinedDocIdList(LongQueryBuffer buffer) {
|
||||
this.data = buffer.copyData();
|
||||
}
|
||||
public CombinedDocIdList(LongArrayList data) {
|
||||
this.data = data.toLongArray();
|
||||
}
|
||||
public CombinedDocIdList(Roaring64Bitmap data) {
|
||||
this.data = data.toArray();
|
||||
}
|
||||
public CombinedDocIdList() {
|
||||
this.data = new long[0];
|
||||
}
|
||||
|
||||
public static CombinedDocIdList combineLists(CombinedDocIdList one, CombinedDocIdList other) {
|
||||
long[] data = new long[one.size() + other.size()];
|
||||
System.arraycopy(one.data, 0, data, 0, one.data.length);
|
||||
System.arraycopy(other.data, 0, data, one.data.length, other.data.length);
|
||||
return new CombinedDocIdList(data);
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return data.length;
|
||||
}
|
||||
|
@@ -6,7 +6,7 @@ import java.util.Arrays;
import java.util.stream.LongStream;

public final class TermIdList {
    private final long[] array;
    public final long[] array;

    public TermIdList(long[] array) {
        this.array = array;
@@ -59,7 +59,7 @@ public class RankingSearchSet implements SearchSet {
    }

    @Override
    public boolean contains(int domainId, long documentMetadata) {
    public boolean contains(int domainId) {

        // This is the main check
        if (set.contains(domainId) || set.isEmpty()) {
@@ -7,6 +7,10 @@ public interface SearchSet {
     * or if the documentMetadata vibes with the set
     *
     */
    boolean contains(int domainId, long documentMetadata);
    boolean contains(int domainId);

    default boolean imposesConstraint() {
        return true;
    }

}
@@ -2,7 +2,7 @@ package nu.marginalia.index.searchset;

public class SearchSetAny implements SearchSet {
    @Override
    public boolean contains(int domainId, long meta) {
    public boolean contains(int domainId) {
        return true;
    }

@@ -10,4 +10,9 @@ public class SearchSetAny implements SearchSet {
    public String toString() {
        return getClass().getSimpleName();
    }

    @Override
    public boolean imposesConstraint() {
        return false;
    }
}
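Under the slimmed-down interface a search set only needs a domain-id membership test, and imposesConstraint() defaults to true so that ParamMatchingQueryFilter actually consults it. A minimal hypothetical implementation (not part of this changeset) could look like:

    // Hypothetical example of the new SearchSet contract
    class SingleDomainSearchSet implements SearchSet {
        private final int allowedDomainId;

        SingleDomainSearchSet(int allowedDomainId) {
            this.allowedDomainId = allowedDomainId;
        }

        @Override
        public boolean contains(int domainId) {
            return domainId == allowedDomainId;
        }
        // imposesConstraint() keeps its default of true, so the filter will call contains()
    }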
@@ -6,14 +6,14 @@ import gnu.trove.list.TIntList;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import nu.marginalia.db.DomainRankingSetsService;
import nu.marginalia.db.DomainTypes;
import nu.marginalia.domainranking.PageRankDomainRanker;
import nu.marginalia.domainranking.accumulator.RankingResultHashMapAccumulator;
import nu.marginalia.domainranking.accumulator.RankingResultHashSetAccumulator;
import nu.marginalia.domainranking.data.GraphSource;
import nu.marginalia.domainranking.data.LinkGraphSource;
import nu.marginalia.domainranking.data.SimilarityGraphSource;
import nu.marginalia.index.IndexFactory;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.ranking.domains.PageRankDomainRanker;
import nu.marginalia.ranking.domains.accumulator.RankingResultHashMapAccumulator;
import nu.marginalia.ranking.domains.accumulator.RankingResultHashSetAccumulator;
import nu.marginalia.ranking.domains.data.GraphSource;
import nu.marginalia.ranking.domains.data.LinkGraphSource;
import nu.marginalia.ranking.domains.data.SimilarityGraphSource;
import nu.marginalia.service.control.ServiceEventLog;
import nu.marginalia.service.module.ServiceConfiguration;
import org.slf4j.Logger;
@@ -14,7 +14,7 @@ public class SmallSearchSet implements SearchSet {
    }

    @Override
    public boolean contains(int domainId, long meta) {
    public boolean contains(int domainId) {
        return entries.contains(domainId);
    }

@@ -1,7 +1,7 @@
|
||||
package nu.marginalia.index.query;
|
||||
|
||||
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
||||
import nu.marginalia.array.page.LongQueryBuffer;
|
||||
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@@ -18,15 +18,16 @@ import java.util.stream.Collectors;
|
||||
public class IndexQuery {
|
||||
private final List<EntrySource> sources;
|
||||
private final List<QueryFilterStepIf> inclusionFilter = new ArrayList<>(10);
|
||||
private boolean prioritize = false;
|
||||
|
||||
public IndexQuery(List<EntrySource> sources)
|
||||
public IndexQuery(EntrySource source, boolean prioritize)
|
||||
{
|
||||
this.sources = sources;
|
||||
this.sources = List.of(source);
|
||||
this.prioritize = prioritize;
|
||||
}
|
||||
|
||||
public IndexQuery(EntrySource... sources)
|
||||
{
|
||||
this.sources = List.of(sources);
|
||||
public boolean isPrioritized() {
|
||||
return prioritize;
|
||||
}
|
||||
/** Adds a filter to the query. The filter will be applied to the results
|
||||
* after they are read from the sources.
|
||||
@@ -60,6 +61,7 @@ public class IndexQuery {
|
||||
if (!fillBuffer(dest))
|
||||
return;
|
||||
|
||||
|
||||
for (var filter : inclusionFilter) {
|
||||
filter.apply(dest);
|
||||
|
||||
@@ -73,6 +75,8 @@ public class IndexQuery {
|
||||
|
||||
private boolean fillBuffer(LongQueryBuffer dest) {
|
||||
for (;;) {
|
||||
dest.zero();
|
||||
|
||||
EntrySource source = sources.get(si);
|
||||
source.read(dest);
|
||||
|
||||
@@ -102,6 +106,7 @@ public class IndexQuery {
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@@ -2,8 +2,6 @@ package nu.marginalia.index.query;

import nu.marginalia.index.query.filter.QueryFilterStepIf;

import java.util.List;

/** Builds a query.
 * <p />
 * Note: The query builder may omit predicates that are deemed redundant.
@@ -11,14 +9,13 @@ import java.util.List;
public interface IndexQueryBuilder {
    /** Filters documents that also contain termId, within the full index.
     */
    IndexQueryBuilder also(long termId);
    IndexQueryBuilder also(long termId, IndexSearchBudget budget);

    /** Excludes documents that contain termId, within the full index
     */
    IndexQueryBuilder not(long termId);
    IndexQueryBuilder not(long termId, IndexSearchBudget budget);

    IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep);
    IndexQueryBuilder addInclusionFilterAny(List<QueryFilterStepIf> filterStep);

    IndexQuery build();
}
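The builder now needs the search budget wherever it attaches a reverse-index filter step. A short sketch of typical usage against the signatures in this diff; the term-id variables are placeholders:

    // Illustrative: the budget is threaded through every filter-producing call
    IndexQuery query = index.findFullWord(requiredTermId)
            .also(otherRequiredTermId, budget)   // must also contain this term
            .not(excludedTermId, budget)         // must not contain this term
            .build();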
@@ -10,5 +10,5 @@ public class IndexSearchBudget {
    }

    public boolean hasTimeLeft() { return System.currentTimeMillis() < timeout; }
    public long timeLeft() { return timeout - System.currentTimeMillis(); }
    public long timeLeft() { return Math.max(0, timeout - System.currentTimeMillis()); }
}
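Clamping timeLeft() at zero matters when the remaining budget is handed to a blocking wait; with the clamp, an expired budget yields a zero-millisecond wait instead of a negative (and confusing) timeout. A hypothetical example, where the executor is illustrative and not part of this changeset:

    // Wait for in-flight ranking work, but never longer than the remaining search budget
    boolean finished = executor.awaitTermination(budget.timeLeft(), TimeUnit.MILLISECONDS);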
@@ -1,71 +0,0 @@
|
||||
package nu.marginalia.index.query.filter;
|
||||
|
||||
import nu.marginalia.array.page.LongQueryBuffer;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
public class QueryFilterAllOf implements QueryFilterStepIf {
|
||||
private final List<QueryFilterStepIf> steps;
|
||||
|
||||
public QueryFilterAllOf(List<? extends QueryFilterStepIf> steps) {
|
||||
this.steps = new ArrayList<>(steps.size());
|
||||
|
||||
for (var step : steps) {
|
||||
if (step instanceof QueryFilterAllOf allOf) {
|
||||
this.steps.addAll(allOf.steps);
|
||||
}
|
||||
else {
|
||||
this.steps.add(step);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public QueryFilterAllOf(QueryFilterStepIf... steps) {
|
||||
this(List.of(steps));
|
||||
}
|
||||
|
||||
public double cost() {
|
||||
double prod = 1.;
|
||||
|
||||
for (var step : steps) {
|
||||
double cost = step.cost();
|
||||
if (cost > 1.0) {
|
||||
prod *= Math.log(cost);
|
||||
}
|
||||
else {
|
||||
prod += cost;
|
||||
}
|
||||
}
|
||||
|
||||
return prod;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean test(long value) {
|
||||
for (var step : steps) {
|
||||
if (!step.test(value))
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
public void apply(LongQueryBuffer buffer) {
|
||||
if (steps.isEmpty())
|
||||
return;
|
||||
|
||||
for (var step : steps) {
|
||||
step.apply(buffer);
|
||||
}
|
||||
}
|
||||
|
||||
public String describe() {
|
||||
StringJoiner sj = new StringJoiner(",", "[All Of: ", "]");
|
||||
for (var step : steps) {
|
||||
sj.add(step.describe());
|
||||
}
|
||||
return sj.toString();
|
||||
}
|
||||
}
|
Some files were not shown because too many files have changed in this diff.