1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

134 Commits

Author SHA1 Message Date
Viktor Lofgren
c91cf49630 (search) Disable scribe.rip substitution
It does not appear to work well
2025-07-27 19:40:58 +02:00
Viktor Lofgren
8503030f18 (search) Fix rare exception in scribe.rip substitution 2025-07-27 19:38:52 +02:00
Viktor Lofgren
744f7d3ef7 (search) Fix rare exception in scribe.rip substitution 2025-07-27 19:34:03 +02:00
Viktor Lofgren
215e12afe9 (index) Shrink query buffer size 2025-07-27 17:33:46 +02:00
Viktor Lofgren
2716bce918 (index) Adjust timeout logic for evaluation 2025-07-27 17:28:34 +02:00
Viktor Lofgren
caf2e6fbb7 (index) Adjust timeout logic for evaluation 2025-07-27 17:27:07 +02:00
Viktor Lofgren
233f0acfb1 (index) Further reduce query buffer size 2025-07-27 17:13:08 +02:00
Viktor Lofgren
e3a4ff02e9 (index) Abandon ongoing evaluation tasks if time is up 2025-07-27 17:04:01 +02:00
Viktor Lofgren
c786283ae1 (index) Reduce query buffer size 2025-07-27 16:57:55 +02:00
Viktor Lofgren
a3f65ac0e0 (deploy) Trigger index deployment 2025-07-27 16:50:23 +02:00
Viktor
aba1a32af0 Merge pull request #217 from MarginaliaSearch/uncompressed-spans-file
Index optimizations
2025-07-27 16:49:27 +02:00
Viktor Lofgren
c9c442345b (perf) Change execution test to use processing rate instead of count 2025-07-27 16:39:51 +02:00
Viktor Lofgren
2e126ba30e (perf) Change execution test to use processing rate instead of count 2025-07-27 16:37:20 +02:00
Viktor Lofgren
2087985f49 (index) Implement work stealing in IndexQueryExecution as a better approach to backpressure 2025-07-27 16:29:57 +02:00
Viktor Lofgren
2b13ebd18b (index) Tweak evaluation backlog handling 2025-07-27 16:08:16 +02:00
Viktor Lofgren
6d92c125fe (perf) Fix perf test 2025-07-27 15:50:28 +02:00
Viktor Lofgren
f638cfa39a (index) Avoid possibility of negative timeout 2025-07-27 15:39:12 +02:00
Viktor Lofgren
89447c12af (index) Avoid possibility of negative timeout 2025-07-27 15:24:47 +02:00
Viktor Lofgren
c71fc46f04 (perf) Update perf test with execution scenario 2025-07-27 15:22:07 +02:00
Viktor Lofgren
f96874d828 (sequence) Implement a largestValue abort condition for minDistance()
This is something like 3500% faster in certain common scenarios
2025-07-27 15:05:50 +02:00
Viktor Lofgren
583a84d5a0 (index) Clean up of the index query execution logic 2025-07-27 15:05:50 +02:00
Viktor Lofgren
f65b946448 (index) Clean up code 2025-07-27 15:05:50 +02:00
Viktor Lofgren
3682815855 (index) Optimize sequence intersection for the n=1 case 2025-07-26 19:14:32 +02:00
Viktor Lofgren
3a94357660 (index) Perf test tool (WIP!) 2025-07-26 11:49:33 +02:00
Viktor Lofgren
673b0d3de1 (index) Perf test tool (WIP!) 2025-07-26 11:49:31 +02:00
Viktor Lofgren
ea942bc664 (spans) Add signature to the footer of the spans file, including a version byte so we can detect whether to use the old or new decoding logic 2025-07-25 12:07:18 +02:00
Viktor Lofgren
7ed5083c54 (index) Don't split results into chunks 2025-07-25 11:45:07 +02:00
Viktor Lofgren
08bb2c097b (refac) Clean up the data model used in the index service 2025-07-25 10:54:07 +02:00
Viktor Lofgren
495fb325be (sequence) Correct sequence intersection bug introduced in optimizations 2025-07-25 10:48:33 +02:00
Viktor Lofgren
05c25bbaec (chore) Clean up 2025-07-24 23:43:27 +02:00
Viktor Lofgren
2a028b84f3 (chore) Clean up 2025-07-24 20:12:56 +02:00
Viktor Lofgren
a091a23623 (ranking) Remove unnecessary metadata retrievals 2025-07-24 20:08:09 +02:00
Viktor Lofgren
e8897acb45 (ranking) Remove unnecessary metadata retrievals 2025-07-24 20:05:39 +02:00
Viktor Lofgren
b89ffcf2be (index) Evaluate hash based idx mapping in ForwardIndexReader 2025-07-24 19:47:27 +02:00
Viktor Lofgren
dbcc9055b0 (index) Evaluate using MinMaxPriorityQueue as guts of ResultPriorityQueue 2025-07-24 19:31:51 +02:00
Viktor Lofgren
d9740557f4 (sequence) Optimize intersection logic with a fast abort condition 2025-07-24 19:04:10 +02:00
Viktor Lofgren
0d6cd015fd (index) Evaluate reading all spans at once 2025-07-24 18:34:11 +02:00
Viktor Lofgren
c6034efcc8 (index) Cache value of bitset cardinality for speed 2025-07-24 17:24:55 +02:00
Viktor Lofgren
76068014ad (index) More spans optimizations 2025-07-24 15:03:43 +02:00
Viktor Lofgren
1c3ed67127 (index) Byte align document spans 2025-07-24 14:06:14 +02:00
Viktor Lofgren
fc0cb6bd9a (index) Reserve a larger size for IntArrayList in SequenceOperations.findIntersections 2025-07-24 14:03:44 +02:00
Viktor Lofgren
c2601bac78 (converter) Remove unnecessary allocation of a 16 KB byte buffer 2025-07-24 13:25:37 +02:00
Viktor Lofgren
f5641b72e9 (index) Fix broken test 2025-07-24 13:21:05 +02:00
Viktor Lofgren
36efe2e219 (index) Optimize PositionsFileReader for concurrent reads
In benchmarks this is roughly twice as fast as the previous approach.  The main caveat is that we need multiple file descriptors to avoid read instruction serialization by the kernel.  This is undesirable since the reads are completely scattershot and can't be reordered by the kernel in a way that optimizes anything.
2025-07-24 13:20:54 +02:00
Viktor Lofgren
983fe3829e (spans) Evaluate uncompressed spans files
Span decompression appears to be somewhat of a performance bottleneck.  This change removes compression of the spans file.  The spans are still compressed in transit between the converter and index constructor at this stage.  The change is intentionally kept small to just evaluate the performance implications, change in file sizes, etc.
2025-07-23 18:10:41 +02:00
Viktor Lofgren
668c87aa86 (ssr) Drop Executor from SSR as it no longer exists 2025-07-23 13:55:41 +02:00
Viktor Lofgren
9d3f9adb05 Force redeploy of everything 2025-07-23 13:36:02 +02:00
Viktor
a43a1773f1 Merge pull request #216 from MarginaliaSearch/deprecate-executor
Architecture: Remove the separate executor service and roll it into the index service.
2025-07-23 13:32:42 +02:00
Viktor Lofgren
1e7a3a3c4f (docs) Update docs to reflect the change 2025-07-23 13:18:23 +02:00
Viktor Lofgren
62b696b1c3 (architecture) Remove the separate executor service and merge it into the index service
The primary motivation for this is that in production, the large number of partitioned services has led to an intermittent exhaustion of available database connections, as each service has a connection pool.

The decision to have a separate executor service dates back to when the index service was very slow to start, and the executor didn't always spin off its memory-hungry tasks into separate processes, which meant the executor would sometimes OOM and crash, and it was undesirable to bring the index down with it.
2025-07-23 12:57:13 +02:00
Viktor Lofgren
f1a900f383 (search) Clean up front page mobile design a bit 2025-07-23 12:20:40 +02:00
Viktor Lofgren
700364b86d (sample) Remove debug logging
The problem sat in the desk chair all along
2025-07-21 15:08:20 +02:00
Viktor Lofgren
7e725ddaed (sample) Remove debug logging
The problem sat in the desk chair all along
2025-07-21 14:41:59 +02:00
Viktor Lofgren
120209e138 (sample) Diagnosing compression errors 2025-07-21 14:34:08 +02:00
Viktor Lofgren
a771a5b6ce (sample) Test different approach to decoding 2025-07-21 14:19:01 +02:00
Viktor Lofgren
dac5b54128 (sample) Better logging for sample errors 2025-07-21 14:03:58 +02:00
Viktor Lofgren
6cfb143c15 (sample) Compress sample HTML data and introduce new API for only getting requests 2025-07-21 13:55:25 +02:00
Viktor Lofgren
23c818281b (converter) Reduce DomSample logging for NOT_FOUND 2025-07-21 13:37:55 +02:00
Viktor Lofgren
8aad253cf6 (converter) Add more logging around dom sample data retrieval errors 2025-07-21 13:26:38 +02:00
Viktor Lofgren
556d7af9dc Reapply "(grpc) Use grpc-netty instead of grpc-netty-shaded"
This reverts commit b7a5219ed3.
2025-07-21 13:23:32 +02:00
Viktor Lofgren
b7a5219ed3 Revert "(grpc) Use grpc-netty instead of grpc-netty-shaded"
Reverting this change to see if it's the cause of some instability issues observed.
2025-07-21 13:10:41 +02:00
Viktor Lofgren
a23ec521fe (converter) Ensure features is mutable on DetailsWithWords as this is assumed later 2025-07-21 12:50:04 +02:00
Viktor Lofgren
fff3babc6d (classifier) Add rule for */pixel.gif as likely tracking pixels 2025-07-21 12:35:57 +02:00
Viktor Lofgren
b2bfb8217c (special) Trigger CD run 2025-07-21 12:28:24 +02:00
Viktor
3b2ac414dc Merge pull request #210 from MarginaliaSearch/ads-fingerprinting
Implement advertisement and popover identification based on DOM sample data
2025-07-21 12:25:31 +02:00
Viktor Lofgren
0ba6515a01 (converter) Ensure converter works well even when dom sample data is unavailable 2025-07-21 12:11:17 +02:00
Viktor Lofgren
16c6b0f151 (search) Add link to new discord community 2025-07-20 20:54:42 +02:00
Viktor Lofgren
e998692900 (converter) Ensure converter works well even when dom sample data is unavailable 2025-07-20 19:24:40 +02:00
Viktor Lofgren
eeb1695a87 (search) Clean up dead code 2025-07-20 19:15:01 +02:00
Viktor Lofgren
a0ab910940 (search) Clean up code 2025-07-20 19:14:13 +02:00
Viktor Lofgren
b9f31048d7 (search) Clean up overlong class names 2025-07-20 19:13:04 +02:00
Viktor Lofgren
12c304289a (grpc) Use grpc-netty instead of grpc-netty-shaded
This will help reduce runaway thread pool sizes
2025-07-20 17:36:25 +02:00
Viktor Lofgren
6ee01dabea (search) Drastically reduce worker thread count in search-service 2025-07-20 17:16:58 +02:00
Viktor Lofgren
1b80e282a7 (search) Drastically reduce worker thread count in search-service 2025-07-20 16:58:33 +02:00
Viktor Lofgren
a65d18f1d1 (client) Use virtual threads in a few more clients 2025-07-20 14:10:02 +02:00
Viktor Lofgren
90a1ff220b (ui) Clean up UI 2025-07-19 18:41:36 +02:00
Viktor Lofgren
d6c7092335 (classifier) More rules 2025-07-19 18:41:36 +02:00
Viktor Lofgren
b716333856 (classifier) Match regexes against the path + query only, as well as the full URL 2025-07-19 18:41:36 +02:00
Viktor Lofgren
b504b8482c (classifier) Add new tracker 2025-07-19 18:41:36 +02:00
Viktor Lofgren
80da1e9ad1 (ui) UI cleanup 2025-07-19 18:41:36 +02:00
Viktor Lofgren
d3f744a441 (ui) Add traffic report to overview menu 2025-07-19 18:41:36 +02:00
Viktor Lofgren
60fb539875 (ui) Add explanatory blurb 2025-07-19 18:41:35 +02:00
Viktor Lofgren
7f5094fedf (ui) Clean up UI 2025-07-19 18:41:35 +02:00
Viktor Lofgren
45066636a5 (classifier) Add classification for domains that make 3rd party requests 2025-07-19 18:41:35 +02:00
Viktor Lofgren
e2d6898c51 (search) Change tag colors to more pleasant ones 2025-07-19 18:41:35 +02:00
Viktor Lofgren
58ef767b94 (search) Improve traffic report UI 2025-07-19 18:41:35 +02:00
Viktor Lofgren
f9f268c67a (grpc) Improve error handling 2025-07-19 18:41:35 +02:00
Viktor Lofgren
f44c2bdee9 (chore) Cleanup 2025-07-19 18:41:35 +02:00
Viktor Lofgren
6fdf477c18 (refac) Move DomSampleClassification to top level 2025-07-19 18:41:35 +02:00
Viktor Lofgren
6b6e455e3f (classifier) Clean up xml 2025-07-19 18:41:35 +02:00
Viktor Lofgren
a3a126540c (classifier) Add README.md 2025-07-19 18:41:35 +02:00
Viktor Lofgren
842b19da40 (search) Mobile layout + phrasing 2025-07-19 18:41:35 +02:00
Viktor Lofgren
2a30e93bf0 (classifier) 2025-07-19 18:41:34 +02:00
Viktor Lofgren
3d998f12c0 (search) Use display name where possible 2025-07-19 18:41:34 +02:00
Viktor Lofgren
cbccc2ac23 (classification) Add /ccm/collect as an ads-related request 2025-07-19 18:41:34 +02:00
Viktor Lofgren
2cfc23f9b7 (search) Fix layout for mobile 2025-07-18 19:06:23 +02:00
Viktor Lofgren
88fe394cdb (request-classifier) Add rule for /pagead/ 2025-07-18 19:01:33 +02:00
Viktor Lofgren
f30fcebd4f Remove dead code 2025-07-18 18:56:42 +02:00
Viktor Lofgren
5d885927b4 (search) Fix layout and presentation 2025-07-18 17:54:47 +02:00
Viktor Lofgren
7622c8358e (request-classifier) Adjust flagging of a few hosts 2025-07-18 17:54:46 +02:00
Viktor Lofgren
69ed9aef47 (ddgt) Load global tracker data 2025-07-18 17:02:50 +02:00
Viktor Lofgren
4c78c223da (search) Fix endpoint collection 2025-07-18 16:59:05 +02:00
Viktor Lofgren
71b9935dd6 (search) Add warmup to programmatic tailwind classes, fix word break 2025-07-18 16:49:31 +02:00
Viktor Lofgren
ad38f2fd83 (search) Hide classification tag on unclassified requests 2025-07-18 15:45:40 +02:00
Viktor Lofgren
9c47388846 (search) Improve display ordering 2025-07-18 15:44:55 +02:00
Viktor Lofgren
d9ab10e33f (search) Fix tracker data for the correct domain 2025-07-18 15:29:15 +02:00
Viktor Lofgren
e13ea7f42b (search) Sort results by classifications 2025-07-18 14:51:35 +02:00
Viktor Lofgren
f38daeb036 (WIP) First stab at a GUI for viewing network traffic
The change also moves the dom classifier to a separate package so that it can be accessed from both the search service and converter.

The change also adds a parser for DDG's tracker radar data.
2025-07-18 13:58:57 +02:00
Viktor Lofgren
6e214293e5 (ping) Fix backoff value overflow 2025-07-16 19:50:12 +02:00
Viktor Lofgren
52582a6d7d (experiment) Also add clients to loom experiment 2025-07-16 18:08:00 +02:00
Viktor Lofgren
ec0e39ad32 (experiment) Also add clients to loom experiment 2025-07-16 17:28:57 +02:00
Viktor Lofgren
6a15aee4b0 (ping) Fix arithmetic errors in backoff strategy due to long overflow 2025-07-16 17:23:36 +02:00
Viktor Lofgren
bd5111e8a2 (experimental) Add flag for using loom/virtual threads in gRPC executor 2025-07-16 17:12:07 +02:00
Viktor Lofgren
1ecbeb0272 (doc) Update ROADMAP.md 2025-07-14 13:38:34 +02:00
Viktor Lofgren
b91354925d (converter) Index documents even when they are short
... but assign short documents a special flag and penalize them in index lookups
2025-07-14 12:24:25 +02:00
Viktor Lofgren
3f85c9c154 (refac) Clean up code 2025-07-14 11:55:21 +02:00
Viktor Lofgren
390f053406 (api) Add query parameter 'dc' for specifying the max number of results per domain 2025-07-14 10:09:30 +02:00
Viktor Lofgren
89e03d6914 (chore) Idiomatic error handling in gRPC clients
responseObserver.onError(...) should be passed Status.WHATEVER.foo().asRuntimeException() and not random throwables as was done before.
2025-07-13 02:59:22 +02:00
Viktor Lofgren
14e0bc9f26 (index) Add comment about encoding caveat 2025-07-13 02:47:00 +02:00
Viktor Lofgren
7065b46c6f (index) Add penalties for new feature flags from dom sample 2025-07-13 02:37:30 +02:00
Viktor Lofgren
0372190c90 (index, refac) Move domain ranking to a better named package 2025-07-13 02:37:29 +02:00
Viktor Lofgren
ceaf32fb90 (converter) Integrate dom sample features into the converter 2025-07-13 01:38:28 +02:00
Viktor Lofgren
b03c43224c (search) Fix redirects in new search UI 2025-07-11 23:44:45 +02:00
Viktor Lofgren
9b4ce9e9eb (search) Fix !w redirect 2025-07-11 23:28:09 +02:00
Viktor
81ac02a695 Merge pull request #209 from us3r1d/master
added converter.insertFoundDomains property
2025-07-11 21:34:04 +02:00
krystal
47f624fb3b changed converter.insertFoundDomains to loader.insertFoundDomains 2025-07-11 12:13:45 -07:00
Viktor Lofgren
b57db01415 (converter) Clean out some old and redundant advertisement and tracking detection code 2025-07-11 19:32:25 +02:00
Viktor Lofgren
ce7d522608 (converter) First basic hook-in of the new dom sample classifier into the converter workflow 2025-07-11 16:57:37 +02:00
Viktor Lofgren
18649b6ee9 (converter) Move DomSampleClassifier to converter's code tree 2025-07-11 16:12:48 +02:00
Viktor Lofgren
f6417aef1a (converter) Additional code cleanup 2025-07-11 15:58:48 +02:00
Viktor Lofgren
2aa7e376b0 (converter) Clean up code around document deduplication 2025-07-11 15:54:28 +02:00
Viktor Lofgren
f33bc44860 (dom-sample) Create API for fetching DOM sample data across services 2025-07-11 15:41:10 +02:00
Viktor Lofgren
a2826efd44 (dom-sample) First stab at classifying outgoing requests from DOM sample data 2025-07-11 15:41:10 +02:00
krystal
c866f19cbb added converter.insertFoundDomains property 2025-07-10 15:36:59 -07:00
161 changed files with 3265 additions and 1274 deletions

View File

@@ -48,10 +48,6 @@ filter for any API consumer.
I've talked to the stract dev and he does not think it's a good idea to mimic their optics language, which is quite ad-hoc, but instead to work together to find some new common description language for this.
## Show favicons next to search results
This is expected from search engines. Basic proof of concept sketch of fetching this data has been done, but the feature is some way from being reality.
## Specialized crawler for github
One of the search engine's biggest limitations right now is that it does not index github at all. A specialized crawler that fetches at least the readme.md would go a long way toward providing search capabilities in this domain.
@@ -66,6 +62,10 @@ The documents database probably should have some sort of flag indicating it's a
PDF parsing is known to be a bit of a security liability so some thought needs to be put in
that direction as well.
## Show favicons next to search results (COMPLETED 2025-03)
This is expected from search engines. Basic proof of concept sketch of fetching this data has been done, but the feature is some way from being reality.
## Web Design Overhaul (COMPLETED 2025-01)
The design is kinda clunky and hard to maintain, and needlessly outdated-looking.

View File

@@ -5,13 +5,15 @@ import java.util.Collection;
public enum HtmlFeature {
// Note, the first 32 of these features are bit encoded in the database
// so be sure to keep anything that's potentially important toward the top
// of the list
// of the list; but adding new values will shift the encoded values and break
// binary compatibility! Scroll down for a marker where you should add new values
// if they need to be accessible from IndexResultScoreCalculator!
MEDIA( "special:media"),
JS("special:scripts"),
AFFILIATE_LINK( "special:affiliate"),
TRACKING("special:tracking"),
TRACKING_ADTECH("special:ads"), // We'll call this ads for now
TRACKING_ADTECH("special:adtech"),
KEBAB_CASE_URL("special:kcurl"), // https://www.example.com/urls-that-look-like-this/
LONG_URL("special:longurl"),
@@ -30,6 +32,15 @@ public enum HtmlFeature {
PDF("format:pdf"),
POPOVER("special:popover"),
CONSENT("special:consent"),
SHORT_DOCUMENT("special:shorty"),
THIRD_PARTY_REQUESTS("special:3pr"),
// Here! It is generally safe to add additional values here without
// disrupting the encoded values used by the DocumentValuator
// class in the index!
/** For fingerprinting and ranking */
OPENGRAPH("special:opengraph"),
OPENGRAPH_IMAGE("special:opengraph:image"),
@@ -67,6 +78,7 @@ public enum HtmlFeature {
S3_FEATURE("special:s3"),
MISSING_DOM_SAMPLE("special:nosample"),
UNKNOWN("special:uncategorized");

View File

@@ -7,7 +7,6 @@ public enum ServiceId {
Search("search-service"),
Index("index-service"),
Query("query-service"),
Executor("executor-service"),
Control("control-service"),

View File

@@ -13,6 +13,7 @@ import nu.marginalia.service.discovery.property.ServicePartition;
import nu.marginalia.util.NamedExecutorFactory;
import java.util.concurrent.Executor;
import java.util.concurrent.Executors;
import java.util.function.Function;
@Singleton
@@ -20,10 +21,15 @@ public class GrpcChannelPoolFactory {
private final NodeConfigurationWatcher nodeConfigurationWatcher;
private final ServiceRegistryIf serviceRegistryIf;
private static final Executor executor = NamedExecutorFactory.createFixed("gRPC-Channel-Pool",
Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
private static final Executor offloadExecutor = NamedExecutorFactory.createFixed("gRPC-Offload-Pool",
Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
private static final Executor executor = useLoom
? Executors.newVirtualThreadPerTaskExecutor()
: NamedExecutorFactory.createFixed("gRPC-Channel-Pool", Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
private static final Executor offloadExecutor = useLoom
? Executors.newVirtualThreadPerTaskExecutor()
: NamedExecutorFactory.createFixed("gRPC-Offload-Pool", Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
@Inject
public GrpcChannelPoolFactory(NodeConfigurationWatcher nodeConfigurationWatcher,

View File

@@ -2,6 +2,7 @@ package nu.marginalia.service.client;
import com.google.common.collect.Sets;
import io.grpc.ManagedChannel;
import io.grpc.StatusRuntimeException;
import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.discovery.monitor.ServiceChangeMonitor;
import nu.marginalia.service.discovery.property.PartitionTraits;
@@ -206,6 +207,11 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
}
for (var e : exceptions) {
if (e instanceof StatusRuntimeException se) {
throw se; // Re-throw SRE as-is
}
// If there are other exceptions, log them
logger.error(grpcMarker, "Failed to call service {}", serviceKey, e);
}

View File

@@ -1,9 +1,9 @@
package nu.marginalia.service.server;
import io.grpc.Server;
import io.grpc.netty.shaded.io.grpc.netty.NettyServerBuilder;
import io.grpc.netty.shaded.io.netty.channel.nio.NioEventLoopGroup;
import io.grpc.netty.shaded.io.netty.channel.socket.nio.NioServerSocketChannel;
import io.grpc.netty.NettyServerBuilder;
import io.netty.channel.nio.NioEventLoopGroup;
import io.netty.channel.socket.nio.NioServerSocketChannel;
import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.discovery.property.ServiceKey;
import nu.marginalia.service.discovery.property.ServicePartition;
@@ -13,9 +13,14 @@ import nu.marginalia.util.NamedExecutorFactory;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
public class GrpcServer {
private final Server server;
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
public GrpcServer(ServiceConfiguration config,
ServiceRegistryIf serviceRegistry,
ServicePartition partition,
@@ -26,13 +31,19 @@ public class GrpcServer {
int nThreads = Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 16);
// Start the gRPC server
ExecutorService workExecutor = useLoom ?
Executors.newVirtualThreadPerTaskExecutor() :
NamedExecutorFactory.createFixed("nettyExecutor", nThreads);
var grpcServerBuilder = NettyServerBuilder.forAddress(new InetSocketAddress(config.bindAddress(), port))
.executor(NamedExecutorFactory.createFixed("nettyExecutor", nThreads))
.executor(workExecutor)
.workerEventLoopGroup(new NioEventLoopGroup(nThreads, NamedExecutorFactory.createFixed("Worker-ELG", nThreads)))
.bossEventLoopGroup(new NioEventLoopGroup(nThreads, NamedExecutorFactory.createFixed("Boss-ELG", nThreads)))
.channelType(NioServerSocketChannel.class);
for (var grpcService : grpcServices) {
if (!grpcService.shouldRegisterService()) {
continue;
}

View File

@@ -125,8 +125,7 @@ public class JoobyService {
// Set a cap on the number of worker threads, as Jooby's default value does not seem to consider
// multi-tenant servers with high thread counts, and spins up an exorbitant number of threads in that
// scenario
options.setWorkerThreads(Math.min(128, options.getWorkerThreads()));
options.setWorkerThreads(Math.min(16, options.getWorkerThreads()));
jooby.setServerOptions(options);

View File

@@ -189,7 +189,7 @@ public class ExecutorClient {
String uriPath = "/transfer/file/" + fileStorage.id();
String uriQuery = "path=" + URLEncoder.encode(path, StandardCharsets.UTF_8);
var endpoints = registry.getEndpoints(ServiceKey.forRest(ServiceId.Executor, fileStorage.node()));
var endpoints = registry.getEndpoints(ServiceKey.forRest(ServiceId.Index, fileStorage.node()));
if (endpoints.isEmpty()) {
throw new RuntimeException("No endpoints for node " + fileStorage.node());
}

View File

@@ -47,6 +47,8 @@ public class ScrapeFeedsActor extends RecordActorPrototype {
private final Path feedPath = WmsaHome.getHomePath().resolve("data/scrape-urls.txt");
private static boolean insertFoundDomains = Boolean.getBoolean("loader.insertFoundDomains");
public record Initial() implements ActorStep {}
@Resume(behavior = ActorResumeBehavior.RETRY)
public record Wait(String ts) implements ActorStep {}
@@ -57,6 +59,8 @@ public class ScrapeFeedsActor extends RecordActorPrototype {
public ActorStep transition(ActorStep self) throws Exception {
return switch(self) {
case Initial() -> {
if (!insertFoundDomains) yield new Error("Domain insertion prohibited, aborting");
if (nodeConfigurationService.get(nodeId).profile() != NodeProfile.REALTIME) {
yield new Error("Invalid node profile for RSS update");
}

View File

@@ -1,6 +1,7 @@
package nu.marginalia.execution;
import com.google.inject.Inject;
import io.grpc.Status;
import io.grpc.stub.StreamObserver;
import nu.marginalia.actor.ExecutorActor;
import nu.marginalia.actor.ExecutorActorControlService;
@@ -36,7 +37,7 @@ public class ExecutorCrawlGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -52,7 +53,7 @@ public class ExecutorCrawlGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -66,7 +67,7 @@ public class ExecutorCrawlGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -80,7 +81,7 @@ public class ExecutorCrawlGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -98,7 +99,7 @@ public class ExecutorCrawlGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

View File

@@ -2,6 +2,7 @@ package nu.marginalia.execution;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import io.grpc.Status;
import io.grpc.stub.StreamObserver;
import nu.marginalia.actor.ExecutorActor;
import nu.marginalia.actor.ExecutorActorControlService;
@@ -38,7 +39,7 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -57,7 +58,7 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -73,7 +74,7 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -87,7 +88,7 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -99,7 +100,7 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -114,14 +115,14 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@Override
public void exportAllAtags(Empty request, StreamObserver<Empty> responseObserver) {
if (serviceConfiguration.node() != 1) {
responseObserver.onError(new IllegalArgumentException("Export all atags is only available on node 1"));
responseObserver.onError(Status.UNAVAILABLE.withDescription("Export all atags is only available on node 1").asRuntimeException());
}
try {
actorControlService.startFrom(ExecutorActor.PREC_EXPORT_ALL,
@@ -131,7 +132,7 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -145,7 +146,7 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -159,7 +160,7 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
}

View File

@@ -1,6 +1,7 @@
package nu.marginalia.execution;
import com.google.inject.Inject;
import io.grpc.Status;
import io.grpc.stub.StreamObserver;
import nu.marginalia.WmsaHome;
import nu.marginalia.actor.ActorApi;
@@ -58,7 +59,7 @@ public class ExecutorGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -70,7 +71,7 @@ public class ExecutorGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -82,7 +83,7 @@ public class ExecutorGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -96,7 +97,7 @@ public class ExecutorGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -112,7 +113,7 @@ public class ExecutorGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -128,7 +129,7 @@ public class ExecutorGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -203,7 +204,7 @@ public class ExecutorGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -229,7 +230,7 @@ public class ExecutorGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -276,7 +277,7 @@ public class ExecutorGrpcService
}
catch (Exception e) {
logger.error("Failed to update nsfw filters", e);
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
}

View File

@@ -1,6 +1,7 @@
package nu.marginalia.execution;
import com.google.inject.Inject;
import io.grpc.Status;
import io.grpc.stub.StreamObserver;
import nu.marginalia.actor.ExecutorActor;
import nu.marginalia.actor.ExecutorActorControlService;
@@ -33,7 +34,7 @@ public class ExecutorSideloadGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -48,7 +49,7 @@ public class ExecutorSideloadGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -63,7 +64,7 @@ public class ExecutorSideloadGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -78,7 +79,7 @@ public class ExecutorSideloadGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -93,7 +94,7 @@ public class ExecutorSideloadGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

View File

@@ -1,4 +1,4 @@
package nu.marginalia.executor;
package nu.marginalia.svc;
import com.google.inject.Inject;
import nu.marginalia.storage.FileStorageService;

View File

@@ -1,5 +1,5 @@
The execution subsystem is responsible for the execution of long-running tasks on each
index node. It lives in the [executor-service](../services-core/executor-service) module.
index node. It lives in the [index-service](../services-core/index-service) module.
It accomplishes this using the [message queue and actor library](../libraries/message-queue/),
which permits program state to survive crashes and reboots.

View File

@@ -1,4 +1,4 @@
package nu.marginalia.executor;
package nu.marginalia.svc;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;

View File

@@ -2,6 +2,8 @@ package nu.marginalia.api.domains;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.api.domains.model.DomainInformation;
import nu.marginalia.api.domains.model.SimilarDomain;
import nu.marginalia.service.client.GrpcChannelPoolFactory;
import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
import nu.marginalia.service.discovery.property.ServiceKey;
@@ -10,16 +12,19 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
import java.util.concurrent.*;
import nu.marginalia.api.domains.model.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
@Singleton
public class DomainInfoClient {
private static final Logger logger = LoggerFactory.getLogger(DomainInfoClient.class);
private final GrpcSingleNodeChannelPool<DomainInfoAPIGrpc.DomainInfoAPIBlockingStub> channelPool;
private final ExecutorService executor = Executors.newWorkStealingPool(8);
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
private static final ExecutorService executor = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newWorkStealingPool(8);
@Inject
public DomainInfoClient(GrpcChannelPoolFactory factory) {

View File

@@ -0,0 +1,114 @@
package nu.marginalia.api.domsample;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import io.grpc.Status;
import io.grpc.StatusRuntimeException;
import nu.marginalia.service.client.GrpcChannelPoolFactory;
import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
import nu.marginalia.service.discovery.property.ServiceKey;
import nu.marginalia.service.discovery.property.ServicePartition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
/**
 * Client for the DOM sample gRPC API.
 * <p>
 * The synchronous lookups translate gRPC failures into empty results
 * ({@link Optional#empty()}, {@code false} or an empty list) rather than
 * propagating {@link StatusRuntimeException} to the caller.
 */
@Singleton
public class DomSampleClient {
    private final GrpcSingleNodeChannelPool<DomSampleApiGrpc.DomSampleApiBlockingStub> channelPool;
    private static final Logger logger = LoggerFactory.getLogger(DomSampleClient.class);

    @Inject
    public DomSampleClient(GrpcChannelPoolFactory factory) {
        // The client is only interested in the primary node
        var key = ServiceKey.forGrpcApi(DomSampleApiGrpc.class, ServicePartition.any());
        this.channelPool = factory.createSingle(key, DomSampleApiGrpc::newBlockingStub);
    }

    /**
     * Fetches a sample for the given domain.
     *
     * @param domainName the domain to look up
     * @return the sample, or empty if none exists or the call failed
     */
    public Optional<RpcDomainSample> getSample(String domainName) {
        try {
            var val = channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::getSample)
                    .run(RpcDomainName.newBuilder().setDomainName(domainName).build());

            return Optional.of(val);
        }
        catch (StatusRuntimeException sre) {
            if (!isNotFound(sre)) {
                logger.error("Failed to fetch DOM sample", sre);
            }

            return Optional.empty();
        }
    }

    /**
     * Fetches only the outgoing-request data of a sample for the given domain.
     *
     * @param domainName the domain to look up
     * @return the request data, or empty if none exists or the call failed
     */
    public Optional<RpcDomainSampleRequests> getSampleRequests(String domainName) {
        try {
            var val = channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::getSampleRequests)
                    .run(RpcDomainName.newBuilder().setDomainName(domainName).build());

            return Optional.of(val);
        }
        catch (StatusRuntimeException sre) {
            if (!isNotFound(sre)) {
                logger.error("Failed to fetch DOM sample", sre);
            }

            return Optional.empty();
        }
    }

    /**
     * Checks whether a sample exists for the given domain.
     *
     * @param domainName the domain to look up
     * @return the server's answer, or {@code false} if the call failed
     */
    public boolean hasSample(String domainName) {
        try {
            return channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::hasSample)
                    .run(RpcDomainName.newBuilder().setDomainName(domainName).build())
                    .getAnswer();
        }
        catch (StatusRuntimeException sre) {
            // Best effort: a failed lookup is reported as "no sample"
            return false;
        }
    }

    /**
     * Asynchronous variant of {@link #hasSample(String)}.
     *
     * @param domainName the domain to look up
     * @param executor   executor the call is dispatched on
     * @return a future holding the server's answer
     */
    public CompletableFuture<Boolean> hasSample(String domainName, ExecutorService executor) {
        try {
            return channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::hasSample)
                    .async(executor)
                    .run(RpcDomainName.newBuilder().setDomainName(domainName).build())
                    .thenApply(RpcBooleanRsp::getAnswer);
        }
        catch (StatusRuntimeException sre) {
            // NOTE(review): this only covers failures thrown while setting up the call;
            // presumably errors raised on the executor complete the future exceptionally
            // instead of yielding false -- confirm that callers handle that case.
            return CompletableFuture.completedFuture(false);
        }
    }

    /**
     * Asynchronous variant of {@link #getSample(String)} that does not translate errors.
     *
     * @param domainName      the domain to look up
     * @param executorService executor the call is dispatched on
     * @return a future holding the sample
     */
    public CompletableFuture<RpcDomainSample> getSampleAsync(String domainName, ExecutorService executorService) {
        return channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::getSample)
                .async(executorService)
                .run(RpcDomainName.newBuilder().setDomainName(domainName).build());
    }

    /**
     * Fetches every sample stored for the given domain.
     *
     * @param domainName the domain to look up
     * @return all samples, or an empty list if the call failed
     */
    public List<RpcDomainSample> getAllSamples(String domainName) {
        try {
            Iterator<RpcDomainSample> val = channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::getAllSamples)
                    .run(RpcDomainName.newBuilder().setDomainName(domainName).build());

            List<RpcDomainSample> ret = new ArrayList<>();
            val.forEachRemaining(ret::add);
            return ret;
        }
        catch (StatusRuntimeException sre) {
            // Pass the exception along so the cause of the failure is not lost
            logger.error("Failed to fetch DOM sample", sre);
            return List.of();
        }
    }

    /**
     * Waits for a channel to become available.
     *
     * @param duration maximum time to wait
     * @return the result of {@code channelPool.awaitChannel(duration)}
     * @throws InterruptedException if interrupted while waiting
     */
    public boolean waitReady(Duration duration) throws InterruptedException {
        return channelPool.awaitChannel(duration);
    }

    /**
     * Tells whether the failure is a NOT_FOUND response.
     * <p>
     * The status code must be compared: a Status carrying a description is a
     * different instance from the {@code Status.NOT_FOUND} constant, so a
     * reference comparison would never match.
     */
    private static boolean isNotFound(StatusRuntimeException sre) {
        return sre.getStatus().getCode() == Status.Code.NOT_FOUND;
    }
}

View File

@@ -24,7 +24,9 @@ import java.util.function.BiConsumer;
@Singleton
public class FeedsClient {
private final ExecutorService executorService = Executors.newCachedThreadPool();
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
private static final ExecutorService executorService = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newCachedThreadPool();
private final GrpcSingleNodeChannelPool<FeedApiGrpc.FeedApiBlockingStub> channelPool;
private final MqOutbox updateFeedsOutbox;

View File

@@ -0,0 +1,47 @@
syntax="proto3";
package nu.marginalia.api.domsample;
option java_package="nu.marginalia.api.domsample";
option java_multiple_files=true;
// API for looking up DOM samples by domain name.
service DomSampleApi {
// Returns a single sample for the domain
rpc getSample(RpcDomainName) returns (RpcDomainSample) {}
// Returns only the outgoing requests recorded for a sample of the domain
rpc getSampleRequests(RpcDomainName) returns (RpcDomainSampleRequests) {}
// Answers whether any sample exists for the domain
rpc hasSample(RpcDomainName) returns (RpcBooleanRsp) {}
// Streams every sample available for the domain
rpc getAllSamples(RpcDomainName) returns (stream RpcDomainSample) {}
}
// Request message identifying the domain to look up.
message RpcDomainName {
string domainName = 1;
}
// Generic yes/no response.
message RpcBooleanRsp {
bool answer = 1;
}
// The outgoing requests of a sample, without the HTML payload.
// Field numbers match the corresponding fields of RpcDomainSample.
message RpcDomainSampleRequests {
string domainName = 1;
string url = 2;
repeated RpcOutgoingRequest outgoingRequests = 5;
}
// A full DOM sample for a domain.
message RpcDomainSample {
string domainName = 1;
// presumably the URL the sample was captured from -- TODO confirm
string url = 2;
// Zstd-compressed bytes of the UTF-8 encoded HTML sample
bytes htmlSampleZstd = 3;
bool accepted_popover = 4;
repeated RpcOutgoingRequest outgoingRequests = 5;
}
// A single request recorded as part of a sample.
message RpcOutgoingRequest {
RequestMethod method = 1;
// NOTE(review): unit and epoch of the timestamp are not documented here -- confirm
int64 timestamp = 2;
string url = 3;
// HTTP method of the request; anything other than GET and POST is reported as OTHER
enum RequestMethod {
GET = 0;
POST = 1;
OTHER = 2;
};
}

View File

@@ -31,6 +31,7 @@ dependencies {
implementation libs.jsoup
implementation libs.opencsv
implementation libs.slop
implementation libs.zstd
implementation libs.sqlite
implementation libs.bundles.slf4j
implementation libs.commons.lang3

View File

@@ -0,0 +1,176 @@
package nu.marginalia.domsample;
import com.github.luben.zstd.Zstd;
import com.google.inject.Inject;
import com.google.protobuf.ByteString;
import io.grpc.Status;
import io.grpc.stub.StreamObserver;
import nu.marginalia.api.domsample.*;
import nu.marginalia.domsample.db.DomSampleDb;
import nu.marginalia.service.server.DiscoverableService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.charset.StandardCharsets;
import java.util.List;
public class DomSampleGrpcService
extends DomSampleApiGrpc.DomSampleApiImplBase
implements DiscoverableService
{
private static final Logger logger = LoggerFactory.getLogger(DomSampleGrpcService.class);
private final DomSampleDb domSampleDb;
@Inject
public DomSampleGrpcService(DomSampleDb domSampleDb) {
this.domSampleDb = domSampleDb;
}
@Override
public void getSample(RpcDomainName request, StreamObserver<RpcDomainSample> responseObserver) {
String domainName = request.getDomainName();
if (domainName.isBlank()) {
responseObserver.onError(Status.INVALID_ARGUMENT
.withDescription("Invalid domain name")
.asRuntimeException());
return;
}
try {
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
if (dbRecords.isEmpty()) {
responseObserver.onError(Status.NOT_FOUND.withDescription("No sample found").asRuntimeException());
return;
}
// Grab the first sample
RpcDomainSample.Builder response = convertFullSample(dbRecords.getFirst());
responseObserver.onNext(response.build());
responseObserver.onCompleted();
}
catch (Exception e) {
logger.error("Error in getSample()", e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@Override
public void getSampleRequests(RpcDomainName request, StreamObserver<RpcDomainSampleRequests> responseObserver) {
String domainName = request.getDomainName();
if (domainName.isBlank()) {
responseObserver.onError(Status.INVALID_ARGUMENT
.withDescription("Invalid domain name")
.asRuntimeException());
return;
}
try {
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
if (dbRecords.isEmpty()) {
responseObserver.onError(Status.NOT_FOUND.withDescription("No sample found").asRuntimeException());
return;
}
// Grab the first sample
RpcDomainSampleRequests.Builder response = convertRequestData(dbRecords.getFirst());
responseObserver.onNext(response.build());
responseObserver.onCompleted();
}
catch (Exception e) {
logger.error("Error in getSample()", e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@Override
public void hasSample(RpcDomainName request, StreamObserver<RpcBooleanRsp> responseObserver) {
String domainName = request.getDomainName();
if (domainName.isBlank()) {
responseObserver.onError(Status.INVALID_ARGUMENT
.withDescription("Invalid domain name")
.asRuntimeException());
return;
}
try {
responseObserver.onNext(RpcBooleanRsp.newBuilder()
.setAnswer(domSampleDb.hasSample(domainName)).build());
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@Override
public void getAllSamples(RpcDomainName request, StreamObserver<RpcDomainSample> responseObserver) {
String domainName = request.getDomainName();
if (domainName.isBlank()) {
responseObserver.onError(Status.INVALID_ARGUMENT
.withDescription("Invalid domain name")
.asRuntimeException());
return;
}
try {
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
for (var record : dbRecords) {
responseObserver.onNext(convertFullSample(record).build());
}
responseObserver.onCompleted();
}
catch (Exception e) {
logger.error("Error in getSample()", e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
private RpcDomainSample.Builder convertFullSample(DomSampleDb.Sample dbSample) {
ByteString htmlZstd = ByteString.copyFrom(Zstd.compress(dbSample.sample().getBytes(StandardCharsets.UTF_8)));
var sampleBuilder = RpcDomainSample.newBuilder()
.setDomainName(dbSample.domain())
.setAcceptedPopover(dbSample.acceptedPopover())
.setHtmlSampleZstd(htmlZstd);
for (var req : dbSample.parseRequests()) {
sampleBuilder.addOutgoingRequestsBuilder()
.setUrl(req.uri().toString())
.setMethod(switch (req.method().toUpperCase())
{
case "GET" -> RpcOutgoingRequest.RequestMethod.GET;
case "POST" -> RpcOutgoingRequest.RequestMethod.POST;
default -> RpcOutgoingRequest.RequestMethod.OTHER;
})
.setTimestamp(req.timestamp());
}
return sampleBuilder;
}
private RpcDomainSampleRequests.Builder convertRequestData(DomSampleDb.Sample dbSample) {
var sampleBuilder = RpcDomainSampleRequests.newBuilder()
.setDomainName(dbSample.domain());
for (var req : dbSample.parseRequests()) {
sampleBuilder.addOutgoingRequestsBuilder()
.setUrl(req.uri().toString())
.setMethod(switch (req.method().toUpperCase())
{
case "GET" -> RpcOutgoingRequest.RequestMethod.GET;
case "POST" -> RpcOutgoingRequest.RequestMethod.POST;
default -> RpcOutgoingRequest.RequestMethod.OTHER;
})
.setTimestamp(req.timestamp());
}
return sampleBuilder;
}
}

View File

@@ -1,17 +1,28 @@
package nu.marginalia.domsample.db;
import nu.marginalia.WmsaHome;
import nu.marginalia.model.EdgeUrl;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.*;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.function.Predicate;
public class DomSampleDb implements AutoCloseable {
private static final String dbFileName = "dom-sample.db";
private final Connection connection;
private static final Logger logger = LoggerFactory.getLogger(DomSampleDb.class);
public DomSampleDb() throws SQLException{
this(WmsaHome.getDataPath().resolve(dbFileName));
@@ -88,7 +99,71 @@ public class DomSampleDb implements AutoCloseable {
}
public record Sample(String url, String domain, String sample, String requests, boolean acceptedPopover) {}
/**
 * A stored DOM sample.
 *
 * @param url             URL associated with the sample
 * @param domain          domain the sample belongs to
 * @param sample          the sampled document content
 * @param requests        recorded requests, one per line in the form METHOD\tTIMESTAMP\tURI
 * @param acceptedPopover value of the accepted_popover column
 */
public record Sample(String url, String domain, String sample, String requests, boolean acceptedPopover) {

    /**
     * Parses the {@code requests} text into structured records.
     * <p>
     * Lines that do not have exactly three tab-separated fields are skipped
     * silently; lines whose timestamp or URI cannot be parsed are skipped
     * with a warning.
     *
     * @return the successfully parsed requests, possibly empty, never null
     */
    public List<SampleRequest> parseRequests() {
        List<SampleRequest> parsed = new ArrayList<>();

        // StringUtils.split returns null for null input, which would make the
        // loop below throw; a sample without request data simply has no requests
        if (this.requests == null) {
            return parsed;
        }

        // Request format is METHOD\tTIMESTAMP\tURI\n
        for (var line : StringUtils.split(this.requests, '\n')) {
            String[] parts = StringUtils.split(line, "\t", 3);
            if (parts.length != 3) continue;

            try {
                String method = parts[0];
                long ts = Long.parseLong(parts[1]);
                String linkUrl = parts[2];

                URI uri = parseURI(linkUrl);

                parsed.add(new SampleRequest(method, ts, uri));
            }
            catch (Exception e) {
                logger.warn("Failed to parse requests", e);
            }
        }

        return parsed;
    }

    /**
     * Parses a URI strictly first, falling back to {@link EdgeUrl} parsing
     * when the input is not a syntactically valid URI.
     */
    private static URI parseURI(String uri) throws URISyntaxException {
        try {
            return new URI(uri);
        }
        catch (URISyntaxException ex) {
            return new EdgeUrl(uri).asURI();
        }
    }
}
public record SampleRequest(String method, long timestamp, URI uri) {}
/**
 * Feeds every row of the samples table to the given consumer, stopping
 * as soon as the consumer returns false.
 *
 * @param consumer receives each sample; returns true to keep iterating, false to stop
 * @throws SQLException if the query fails or a row cannot be read
 */
public void forEachSample(Predicate<Sample> consumer) throws SQLException {
    final String query = """
            SELECT url, domain, sample, requests, accepted_popover
            FROM samples
            """;

    try (var stmt = connection.prepareStatement(query)) {
        var rows = stmt.executeQuery();

        boolean keepGoing = true;
        // Short-circuit so that no further row is fetched once the consumer declines
        while (keepGoing && rows.next()) {
            Sample current = new Sample(
                    rows.getString("url"),
                    rows.getString("domain"),
                    rows.getString("sample"),
                    rows.getString("requests"),
                    rows.getBoolean("accepted_popover")
            );

            keepGoing = consumer.test(current);
        }
    }
}
public List<Sample> getSamples(String domain) throws SQLException {
List<Sample> samples = new ArrayList<>();
@@ -116,6 +191,21 @@ public class DomSampleDb implements AutoCloseable {
return samples;
}
/**
 * Checks whether the samples table has at least one row for the domain.
 *
 * @param domain the domain to look for
 * @return true if the query produced a row
 * @throws SQLException if the query fails
 */
public boolean hasSample(String domain) throws SQLException {
    final String query = """
            SELECT 1
            FROM samples
            WHERE domain = ?
            """;

    try (var stmt = connection.prepareStatement(query)) {
        stmt.setString(1, domain);

        // Any row at all means a sample exists
        return stmt.executeQuery().next();
    }
}
public void saveSample(String domain, String url, String rawContent) throws SQLException {
var doc = Jsoup.parse(rawContent);

View File

@@ -1,6 +1,7 @@
package nu.marginalia.rss.svc;
import com.google.inject.Inject;
import io.grpc.Status;
import io.grpc.stub.StreamObserver;
import nu.marginalia.api.feeds.*;
import nu.marginalia.db.DbDomainQueries;
@@ -69,7 +70,7 @@ public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements Dis
@Override
public void getFeedDataHash(Empty request, StreamObserver<RpcFeedDataHash> responseObserver) {
if (!feedDb.isEnabled()) {
responseObserver.onError(new IllegalStateException("Feed database is disabled on this node"));
responseObserver.onError(Status.INTERNAL.withDescription("Feed database is disabled on this node").asRuntimeException());
return;
}
@@ -80,7 +81,7 @@ public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements Dis
}
catch (Exception e) {
logger.error("Error getting feed data hash", e);
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -101,7 +102,7 @@ public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements Dis
}
catch (Exception e) {
logger.error("Error getting updated links", e);
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -109,13 +110,13 @@ public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements Dis
public void getFeed(RpcDomainId request,
StreamObserver<RpcFeed> responseObserver) {
if (!feedDb.isEnabled()) {
responseObserver.onError(new IllegalStateException("Feed database is disabled on this node"));
responseObserver.onError(Status.INTERNAL.withDescription("Feed database is disabled on this node").asRuntimeException());
return;
}
Optional<EdgeDomain> domainName = domainQueries.getDomain(request.getDomainId());
if (domainName.isEmpty()) {
responseObserver.onError(new IllegalArgumentException("Domain not found"));
responseObserver.onError(Status.NOT_FOUND.withDescription("Domain not found").asRuntimeException());
return;
}

View File

@@ -87,7 +87,7 @@ class FeedFetcherServiceTest extends AbstractModule {
bind(DomainCoordinator.class).to(LocalDomainCoordinator.class);
bind(HikariDataSource.class).toInstance(dataSource);
bind(ServiceRegistryIf.class).toInstance(Mockito.mock(ServiceRegistryIf.class));
bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration(ServiceId.Executor, 1, "", "", 0, UUID.randomUUID()));
bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration(ServiceId.Index, 1, "", "", 0, UUID.randomUUID()));
bind(Integer.class).annotatedWith(Names.named("wmsa-system-node")).toInstance(1);
}

View File

@@ -26,7 +26,9 @@ public class MathClient {
private static final Logger logger = LoggerFactory.getLogger(MathClient.class);
private final GrpcSingleNodeChannelPool<MathApiGrpc.MathApiBlockingStub> channelPool;
private final ExecutorService executor = Executors.newWorkStealingPool(8);
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
private static final ExecutorService executor = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newWorkStealingPool(8);
@Inject
public MathClient(GrpcChannelPoolFactory factory) {

View File

@@ -304,7 +304,6 @@ public class QueryProtobufCodec {
IndexProtobufCodec.convertRpcQuery(specs.getQuery()),
specs.getDomainsList(),
specs.getSearchSetIdentifier(),
specs.getHumanQuery(),
IndexProtobufCodec.convertSpecLimit(specs.getQuality()),
IndexProtobufCodec.convertSpecLimit(specs.getYear()),
IndexProtobufCodec.convertSpecLimit(specs.getSize()),

View File

@@ -18,8 +18,6 @@ public class SearchSpecification {
public String searchSetIdentifier;
public final String humanQuery;
public SpecificationLimit quality;
public SpecificationLimit year;
public SpecificationLimit size;
@@ -35,7 +33,6 @@ public class SearchSpecification {
public SearchSpecification(SearchQuery query,
List<Integer> domains,
String searchSetIdentifier,
String humanQuery,
SpecificationLimit quality,
SpecificationLimit year,
SpecificationLimit size,
@@ -47,7 +44,6 @@ public class SearchSpecification {
this.query = query;
this.domains = domains;
this.searchSetIdentifier = searchSetIdentifier;
this.humanQuery = humanQuery;
this.quality = quality;
this.year = year;
this.size = size;
@@ -73,10 +69,6 @@ public class SearchSpecification {
return this.searchSetIdentifier;
}
public String getHumanQuery() {
return this.humanQuery;
}
public SpecificationLimit getQuality() {
return this.quality;
}
@@ -106,14 +98,13 @@ public class SearchSpecification {
}
public String toString() {
return "SearchSpecification(query=" + this.getQuery() + ", domains=" + this.getDomains() + ", searchSetIdentifier=" + this.getSearchSetIdentifier() + ", humanQuery=" + this.getHumanQuery() + ", quality=" + this.getQuality() + ", year=" + this.getYear() + ", size=" + this.getSize() + ", rank=" + this.getRank() + ", queryLimits=" + this.getQueryLimits() + ", queryStrategy=" + this.getQueryStrategy() + ", rankingParams=" + this.getRankingParams() + ")";
return "SearchSpecification(query=" + this.getQuery() + ", domains=" + this.getDomains() + ", searchSetIdentifier=" + this.getSearchSetIdentifier() + ", quality=" + this.getQuality() + ", year=" + this.getYear() + ", size=" + this.getSize() + ", rank=" + this.getRank() + ", queryLimits=" + this.getQueryLimits() + ", queryStrategy=" + this.getQueryStrategy() + ", rankingParams=" + this.getRankingParams() + ")";
}
public static class SearchSpecificationBuilder {
private SearchQuery query;
private List<Integer> domains;
private String searchSetIdentifier;
private String humanQuery;
private SpecificationLimit quality$value;
private boolean quality$set;
private SpecificationLimit year$value;
@@ -144,11 +135,6 @@ public class SearchSpecification {
return this;
}
public SearchSpecificationBuilder humanQuery(String humanQuery) {
this.humanQuery = humanQuery;
return this;
}
public SearchSpecificationBuilder quality(SpecificationLimit quality) {
this.quality$value = quality;
this.quality$set = true;
@@ -205,11 +191,7 @@ public class SearchSpecification {
if (!this.rank$set) {
rank$value = SpecificationLimit.none();
}
return new SearchSpecification(this.query, this.domains, this.searchSetIdentifier, this.humanQuery, quality$value, year$value, size$value, rank$value, this.queryLimits, this.queryStrategy, this.rankingParams);
}
public String toString() {
return "SearchSpecification.SearchSpecificationBuilder(query=" + this.query + ", domains=" + this.domains + ", searchSetIdentifier=" + this.searchSetIdentifier + ", humanQuery=" + this.humanQuery + ", quality$value=" + this.quality$value + ", year$value=" + this.year$value + ", size$value=" + this.size$value + ", rank$value=" + this.rank$value + ", queryLimits=" + this.queryLimits + ", queryStrategy=" + this.queryStrategy + ", rankingParams=" + this.rankingParams + ")";
return new SearchSpecification(this.query, this.domains, this.searchSetIdentifier, quality$value, year$value, size$value, rank$value, this.queryLimits, this.queryStrategy, this.rankingParams);
}
}
}

View File

@@ -1,56 +0,0 @@
package nu.marginalia.api.searchquery.model.results;
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import java.util.BitSet;
public class ResultRankingContext {
private final int docCount;
public final RpcResultRankingParameters params;
public final BitSet regularMask;
public final BitSet ngramsMask;
/** CqDataInt associated with frequency information of the terms in the query
* in the full index. The dataset is indexed by the compiled query. */
public final CqDataInt fullCounts;
/** CqDataInt associated with frequency information of the terms in the query
* in the full index. The dataset is indexed by the compiled query. */
public final CqDataInt priorityCounts;
public ResultRankingContext(int docCount,
RpcResultRankingParameters params,
BitSet ngramsMask,
BitSet regularMask,
CqDataInt fullCounts,
CqDataInt prioCounts)
{
this.docCount = docCount;
this.params = params;
this.ngramsMask = ngramsMask;
this.regularMask = regularMask;
this.fullCounts = fullCounts;
this.priorityCounts = prioCounts;
}
public int termFreqDocCount() {
return docCount;
}
@Override
public String toString() {
return "ResultRankingContext{" +
"docCount=" + docCount +
", params=" + params +
", regularMask=" + regularMask +
", ngramsMask=" + ngramsMask +
", fullCounts=" + fullCounts +
", priorityCounts=" + priorityCounts +
'}';
}
}

View File

@@ -34,8 +34,6 @@ public class QueryFactory {
this.queryExpansion = queryExpansion;
}
public ProcessedQuery createQuery(QueryParams params,
@Nullable RpcResultRankingParameters rankingParams) {
final var query = params.humanQuery();
@@ -153,7 +151,6 @@ public class QueryFactory {
var specsBuilder = SearchSpecification.builder()
.query(queryBuilder.build())
.humanQuery(query)
.quality(qualityLimit)
.year(year)
.size(size)

View File

@@ -3,6 +3,7 @@ package nu.marginalia.functions.searchquery;
import com.google.common.collect.Lists;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import io.grpc.Status;
import io.grpc.stub.StreamObserver;
import io.prometheus.client.Histogram;
import nu.marginalia.api.searchquery.*;
@@ -93,7 +94,7 @@ public class QueryGRPCService
});
} catch (Exception e) {
logger.error("Exception", e);
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

View File

@@ -241,7 +241,6 @@ public class QueryFactoryTest {
Assertions.assertTrue(subquery.query.compiledQuery.contains(" bob "));
Assertions.assertFalse(subquery.query.compiledQuery.contains(" bob's "));
Assertions.assertEquals("\"bob's cars\"", subquery.humanQuery);
}
@Test

View File

@@ -38,7 +38,9 @@ public class IndexClient {
.help("Count of results filtered by NSFW tier")
.register();
private static final ExecutorService executor = Executors.newCachedThreadPool();
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
private static final ExecutorService executor = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newCachedThreadPool();
@Inject
public IndexClient(GrpcChannelPoolFactory channelPoolFactory,

View File

@@ -1,9 +1,10 @@
package nu.marginalia.index.forward;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.forward.spans.DocumentSpans;
import nu.marginalia.index.forward.spans.ForwardIndexSpansReader;
import nu.marginalia.index.forward.spans.IndexSpansReader;
import nu.marginalia.model.id.UrlIdCodec;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -22,16 +23,15 @@ import static nu.marginalia.index.forward.ForwardIndexParameters.*;
* and a mapping between document identifiers to the index into the
* data array.
* <p/>
* Since the total data is relatively small, this is kept in memory to
* reduce the amount of disk thrashing.
* <p/>
* The metadata is a binary encoding of {@see nu.marginalia.idx.DocumentMetadata}
*/
public class ForwardIndexReader {
private final LongArray ids;
private final LongArray data;
private final ForwardIndexSpansReader spansReader;
private volatile Long2IntOpenHashMap idsMap;
private final IndexSpansReader spansReader;
private final Logger logger = LoggerFactory.getLogger(getClass());
@@ -64,7 +64,18 @@ public class ForwardIndexReader {
ids = loadIds(idsFile);
data = loadData(dataFile);
spansReader = new ForwardIndexSpansReader(spansFile);
spansReader = IndexSpansReader.open(spansFile);
Thread.ofPlatform().start(this::createIdsMap);
}
/**
 * Builds a hash map from document id to its position in {@code ids}
 * and stores it in the {@code idsMap} field once fully populated.
 */
private void createIdsMap() {
    // Populate a local map first; the field is only assigned when the map is complete
    Long2IntOpenHashMap lookup = new Long2IntOpenHashMap((int) ids.size());

    for (int pos = 0; pos < ids.size(); pos++) {
        lookup.put(ids.get(pos), pos);
    }

    this.idsMap = lookup;
}
private static LongArray loadIds(Path idsFile) throws IOException {
@@ -106,6 +117,10 @@ public class ForwardIndexReader {
private int idxForDoc(long docId) {
assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
if (idsMap != null) {
return idsMap.getOrDefault(docId, -1);
}
long offset = ids.binarySearch(docId, 0, ids.size());
if (offset >= ids.size() || offset < 0 || ids.get(offset) != docId) {
@@ -134,6 +149,27 @@ public class ForwardIndexReader {
}
/**
 * Reads the spans for a batch of documents.
 *
 * @param arena  arena handed to the spans reader
 * @param docIds document ids to look up
 * @return one entry per document id as produced by the spans reader;
 *         if reading fails, an array of the same length with every entry null
 */
public DocumentSpans[] getDocumentSpans(Arena arena, long[] docIds) {
    final int count = docIds.length;
    long[] spanOffsets = new long[count];

    for (int i = 0; i < count; i++) {
        long idx = idxForDoc(docIds[i]);

        // A negative index means the document is unknown; flag it with -1
        spanOffsets[i] = (idx >= 0)
                ? data.get(ENTRY_SIZE * idx + SPANS_OFFSET)
                : -1;
    }

    try {
        return spansReader.readSpans(arena, spanOffsets);
    }
    catch (IOException ex) {
        logger.error("Failed to read spans for docIds", ex);
        return new DocumentSpans[count];
    }
}
public int totalDocCount() {
return (int) ids.size();
}
@@ -141,6 +177,8 @@ public class ForwardIndexReader {
public void close() {
if (data != null)
data.close();
if (ids != null)
ids.close();
}
public boolean isLoaded() {

View File

@@ -5,7 +5,7 @@ import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexParameters;
import nu.marginalia.index.forward.spans.ForwardIndexSpansWriter;
import nu.marginalia.index.forward.spans.IndexSpansWriter;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata;
@@ -65,7 +65,7 @@ public class ForwardIndexConverter {
logger.info("Domain Rankings size = {}", domainRankings.size());
try (var progress = heartbeat.createProcessTaskHeartbeat(TaskSteps.class, "forwardIndexConverter");
var spansWriter = new ForwardIndexSpansWriter(outputFileSpansData)
var spansWriter = new IndexSpansWriter(outputFileSpansData)
) {
progress.progress(TaskSteps.GET_DOC_IDS);

View File

@@ -11,6 +11,9 @@ public class DocumentSpan {
/** A list of the interlaced start and end positions of each span in the document of this type */
private final IntList startsEnds;
/** Creates a span from an already decoded list of interlaced start and end positions.
 * The list is stored as-is, without copying. */
public DocumentSpan(IntList startsEnds) {
this.startsEnds = startsEnds;
}
public DocumentSpan(CodedSequence startsEnds) {
this.startsEnds = startsEnds.values();
}

View File

@@ -1,5 +1,6 @@
package nu.marginalia.index.forward.spans;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.sequence.CodedSequence;
@@ -39,6 +40,23 @@ public class DocumentSpans {
return EMPTY_SPAN;
}
/**
 * Stores the given positions as the span for the tag identified by {@code code}.
 * Codes that match none of the handled tags are ignored.
 *
 * @param code      the tag code, compared against the {@code HtmlTag} codes
 * @param positions the positions for that tag
 */
void accept(byte code, IntList positions) {
    final DocumentSpan span = new DocumentSpan(positions);

    if (code == HtmlTag.HEADING.code) {
        this.heading = span;
    }
    else if (code == HtmlTag.TITLE.code) {
        this.title = span;
    }
    else if (code == HtmlTag.NAV.code) {
        this.nav = span;
    }
    else if (code == HtmlTag.CODE.code) {
        // the parameter shadows the field of the same name, hence the explicit this
        this.code = span;
    }
    else if (code == HtmlTag.ANCHOR.code) {
        this.anchor = span;
    }
    else if (code == HtmlTag.EXTERNAL_LINKTEXT.code) {
        this.externalLinkText = span;
    }
    else if (code == HtmlTag.BODY.code) {
        this.body = span;
    }
}
void accept(byte code, CodedSequence positions) {
if (code == HtmlTag.HEADING.code)
this.heading = new DocumentSpan(positions);

View File

@@ -0,0 +1,25 @@
package nu.marginalia.index.forward.spans;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.file.Path;
/**
 * Reader for a spans file. Use {@link #open(Path)} to get the implementation
 * matching the version recorded in the file.
 */
public interface IndexSpansReader extends AutoCloseable {

    /** Reads the spans stored at a single encoded offset. */
    DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException;

    /** Reads the spans for a batch of encoded offsets, one result slot per offset. */
    DocumentSpans[] readSpans(Arena arena, long[] encodedOffsets) throws IOException;

    /**
     * Opens a spans file with the reader that matches its version.
     *
     * @param fileName the spans file
     * @return a reader for the file
     * @throws IOException if the file cannot be read
     * @throws IllegalArgumentException if the file's version is not supported
     */
    static IndexSpansReader open(Path fileName) throws IOException {
        final int version = SpansCodec.parseSpanFilesFooter(fileName);

        if (version == SpansCodec.SpansCodecVersion.COMPRESSED.ordinal()) {
            return new IndexSpansReaderCompressed(fileName);
        }

        if (version == SpansCodec.SpansCodecVersion.PLAIN.ordinal()) {
            return new IndexSpansReaderPlain(fileName);
        }

        throw new IllegalArgumentException("Unsupported spans file version: " + version);
    }

    /** Closes the reader; declared here to narrow the thrown exception to IOException. */
    @Override
    void close() throws IOException;
}

View File

@@ -10,11 +10,11 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
@SuppressWarnings("preview")
public class ForwardIndexSpansReader implements AutoCloseable {
@Deprecated
public class IndexSpansReaderCompressed implements AutoCloseable, IndexSpansReader {
private final FileChannel spansFileChannel;
public ForwardIndexSpansReader(Path spansFile) throws IOException {
public IndexSpansReaderCompressed(Path spansFile) throws IOException {
this.spansFileChannel = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ);
}
@@ -51,6 +51,17 @@ public class ForwardIndexSpansReader implements AutoCloseable {
return ret;
}
/** Reads the spans for each non-negative encoded offset, one record at a time.
 * Positions whose encoded offset is negative are left as {@code null} in the result.
 */
@Override
public DocumentSpans[] readSpans(Arena arena, long[] encodedOffsets) throws IOException {
    final DocumentSpans[] results = new DocumentSpans[encodedOffsets.length];

    int cursor = 0;
    for (long encodedOffset : encodedOffsets) {
        if (encodedOffset >= 0) {
            results[cursor] = readSpans(arena, encodedOffset);
        }
        cursor++;
    }

    return results;
}
@Override
public void close() throws IOException {
spansFileChannel.close();

View File

@@ -0,0 +1,122 @@
package nu.marginalia.index.forward.spans;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ForkJoinPool;
/** Reader for spans files stored in the uncompressed ("plain") layout.
 * <p>
 * A record, as decoded by {@link #decode(MemorySegment)}, consists of a 4 byte span count
 * followed by that many spans; each span is a 1 byte tag code, 1 byte of padding,
 * a 2 byte value count, and then that many 4 byte integer values.
 * <p>
 * Several file channels are opened on the same file, and batch reads are spread
 * across them using a dedicated thread pool.
 */
public class IndexSpansReaderPlain implements IndexSpansReader {
    private final FileChannel[] spansFileChannels;
    private final ForkJoinPool forkJoinPool;

    /** Opens the spans file for reading.
     *
     * @throws IOException if any of the channels could not be opened; channels
     *                     that were already opened are closed before rethrowing
     */
    public IndexSpansReaderPlain(Path spansFile) throws IOException {
        this.spansFileChannels = new FileChannel[8];
        try {
            for (int i = 0; i < spansFileChannels.length; i++) {
                spansFileChannels[i] = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ);
            }
        }
        catch (IOException | RuntimeException ex) {
            // Don't leak the channels that were opened before the failure
            try {
                closeAll(spansFileChannels);
            }
            catch (IOException closeEx) {
                ex.addSuppressed(closeEx);
            }
            throw ex;
        }
        forkJoinPool = new ForkJoinPool(spansFileChannels.length);
    }

    /** Reads and decodes a single record.
     *
     * @param arena         allocator for the intermediate read buffer
     * @param encodedOffset size and start offset of the record, as encoded by {@link SpansCodec}
     * @throws IOException on read failure, or if the file ends before the record does
     */
    @Override
    public DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException {
        // Decode the size and offset from the encoded offset
        long size = SpansCodec.decodeSize(encodedOffset);
        long offset = SpansCodec.decodeStartOffset(encodedOffset);

        // Allocate a buffer from the arena, 4 byte aligned so decode() can do aligned int reads
        var ms = arena.allocate(size, 4);

        readFully(spansFileChannels[0], ms, offset);

        return decode(ms);
    }

    /** Decodes a record held in the given memory segment.
     *
     * @param ms segment containing exactly one record, starting at offset 0
     * @return the decoded spans
     */
    public DocumentSpans decode(MemorySegment ms) {
        int count = ms.get(ValueLayout.JAVA_INT, 0);
        int pos = 4;

        DocumentSpans ret = new DocumentSpans();

        // Decode each span
        for (int spanIdx = 0; spanIdx < count; spanIdx++) {
            byte code = ms.get(ValueLayout.JAVA_BYTE, pos);
            // pos+1 is a padding byte; the value count follows at pos+2
            short len = ms.get(ValueLayout.JAVA_SHORT, pos + 2);

            IntArrayList values = new IntArrayList(len);

            pos += 4;
            for (int i = 0; i < len; i++) {
                values.add(ms.get(ValueLayout.JAVA_INT, pos + 4 * i));
            }
            ret.accept(code, values);
            pos += 4 * len;
        }

        return ret;
    }

    /** Reads and decodes several records concurrently.
     * <p>
     * Negative encoded offsets are skipped, and their positions in the returned array
     * are left {@code null}. Since the reads are performed on pool threads, the arena
     * needs to permit access from threads other than the caller's.
     *
     * @param arena          allocator for the shared read buffer
     * @param encodedOffsets encoded record locations; negative values mean "no record"
     * @return an array parallel to {@code encodedOffsets}
     * @throws IOException if any of the reads or decodes failed
     */
    @Override
    public DocumentSpans[] readSpans(Arena arena, long[] encodedOffsets) throws IOException {
        long totalSize = 0;
        int numJobs = 0;
        for (long encodedOffset : encodedOffsets) {
            if (encodedOffset < 0)
                continue;
            totalSize += SpansCodec.decodeSize(encodedOffset);
            numJobs++;
        }

        DocumentSpans[] ret = new DocumentSpans[encodedOffsets.length];
        if (numJobs == 0) return ret;

        CountDownLatch latch = new CountDownLatch(numJobs);
        // Written by the worker before countDown() and read after await(),
        // so the latch provides the necessary visibility
        Exception[] failure = new Exception[1];

        MemorySegment segment = arena.allocate(totalSize, 8);

        long bufferOffset = 0;
        for (int idx = 0; idx < encodedOffsets.length; idx++) {
            long encodedOffset = encodedOffsets[idx];

            // Must mirror the filter in the sizing loop above, otherwise the latch count
            // and the slice offsets no longer line up with the allocated segment
            if (encodedOffset < 0)
                continue;

            long size = SpansCodec.decodeSize(encodedOffset);
            long start = SpansCodec.decodeStartOffset(encodedOffset);

            MemorySegment slice = segment.asSlice(bufferOffset, size);
            bufferOffset += size;

            int i = idx;
            FileChannel channel = spansFileChannels[i % spansFileChannels.length];

            forkJoinPool.execute(() -> {
                try {
                    readFully(channel, slice, start);
                    ret[i] = decode(slice);
                }
                catch (IOException | RuntimeException ex) {
                    failure[0] = ex;
                }
                finally {
                    latch.countDown();
                }
            });
        }

        try {
            latch.await();
        }
        catch (InterruptedException ex) {
            // NOTE(review): read tasks may still be running against the arena at this point
            Thread.currentThread().interrupt();
            return ret;
        }

        if (failure[0] != null) {
            throw new IOException("Failed to read spans", failure[0]);
        }

        return ret;
    }

    /** Shuts down the reader pool and closes all file channels. */
    @Override
    public void close() throws IOException {
        try {
            forkJoinPool.close();
        }
        finally {
            closeAll(spansFileChannels);
        }
    }

    /** Fills the target segment with bytes from the channel starting at fileOffset,
     * repeating the read until the segment is full since a single read may return fewer
     * bytes than requested. */
    private static void readFully(FileChannel channel, MemorySegment target, long fileOffset) throws IOException {
        var buffer = target.asByteBuffer();
        while (buffer.hasRemaining()) {
            if (channel.read(buffer, fileOffset + buffer.position()) < 0) {
                throw new IOException("Unexpected end of spans file at offset " + (fileOffset + buffer.position()));
            }
        }
    }

    /** Closes every non-null channel, throwing the first failure with later ones suppressed. */
    private static void closeAll(FileChannel[] channels) throws IOException {
        IOException failure = null;
        for (FileChannel channel : channels) {
            if (channel == null)
                continue;
            try {
                channel.close();
            }
            catch (IOException ex) {
                if (failure == null) failure = ex;
                else failure.addSuppressed(ex);
            }
        }
        if (failure != null) throw failure;
    }
}

View File

@@ -1,20 +1,23 @@
package nu.marginalia.index.forward.spans;
import nu.marginalia.sequence.VarintCodedSequence;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
public class ForwardIndexSpansWriter implements AutoCloseable {
public class IndexSpansWriter implements AutoCloseable {
private final FileChannel outputChannel;
private final ByteBuffer work = ByteBuffer.allocate(32);
private final ByteBuffer work = ByteBuffer.allocate(65536).order(ByteOrder.nativeOrder());
private long stateStartOffset = -1;
private int stateLength = -1;
public ForwardIndexSpansWriter(Path outputFileSpansData) throws IOException {
public IndexSpansWriter(Path outputFileSpansData) throws IOException {
this.outputChannel = (FileChannel) Files.newByteChannel(outputFileSpansData, StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE);
}
@@ -23,7 +26,7 @@ public class ForwardIndexSpansWriter implements AutoCloseable {
stateLength = 0;
work.clear();
work.put((byte) count);
work.putInt(count);
work.flip();
while (work.hasRemaining())
@@ -33,12 +36,17 @@ public class ForwardIndexSpansWriter implements AutoCloseable {
public void writeSpan(byte spanCode, ByteBuffer sequenceData) throws IOException {
work.clear();
work.put(spanCode);
work.putShort((short) sequenceData.remaining());
work.put((byte) 0); // Ensure we're byte aligned
var sequence = new VarintCodedSequence(sequenceData);
work.putShort((short) sequence.valueCount());
var iter = sequence.iterator();
while (iter.hasNext()) {
work.putInt(iter.nextInt());
}
work.flip();
while (work.hasRemaining() || sequenceData.hasRemaining()) {
stateLength += (int) outputChannel.write(new ByteBuffer[]{work, sequenceData});
}
stateLength += outputChannel.write(work);
}
public long endRecord() {
@@ -47,6 +55,11 @@ public class ForwardIndexSpansWriter implements AutoCloseable {
/** Appends the format footer (marking the file as PLAIN) to the end of the file
 * and closes the channel. The channel is closed even if writing the footer fails.
 */
@Override
public void close() throws IOException {
    try {
        ByteBuffer footer = SpansCodec.createSpanFilesFooter(SpansCodec.SpansCodecVersion.PLAIN);
        outputChannel.position(outputChannel.size());
        while (footer.hasRemaining()) {
            // Each write lands at the current end of file, so partial writes continue where they left off
            outputChannel.write(footer, outputChannel.size());
        }
    }
    finally {
        outputChannel.close();
    }
}
}

View File

@@ -1,6 +1,21 @@
package nu.marginalia.index.forward.spans;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
public class SpansCodec {
public static int MAGIC_INT = 0xF000F000;
public static int FOOTER_SIZE = 8;
enum SpansCodecVersion {
@Deprecated
COMPRESSED,
PLAIN
}
public static long encode(long startOffset, long size) {
assert size < 0x1000_0000L : "Size must be less than 2^28";
@@ -14,4 +29,31 @@ public class SpansCodec {
public static long decodeSize(long encoded) {
return encoded & 0x0FFF_FFFFL;
}
/** Builds the footer that identifies the format of a spans file:
 * the magic integer, one byte holding the version's ordinal, and three zero bytes.
 *
 * @param version the format version to record
 * @return a buffer of FOOTER_SIZE capacity, flipped and ready to be written
 */
public static ByteBuffer createSpanFilesFooter(SpansCodecVersion version) {
    final byte versionByte = (byte) version.ordinal();
    final byte[] padding = new byte[3];

    return ByteBuffer.allocate(FOOTER_SIZE)
            .putInt(SpansCodec.MAGIC_INT)
            .put(versionByte)
            .put(padding)
            .flip();
}
/** Reads the format version from the footer at the end of a spans file.
 * <p>
 * Returns 0 when the file is shorter than a footer or does not end with the
 * magic integer; 0 is also the ordinal of the first version constant, so files
 * without a footer are reported as that version.
 *
 * @param spansFile the spans file to inspect
 * @return the version byte stored in the footer, or 0 if no footer is present
 * @throws IOException on read failure, or if the file ends before the footer has been read
 */
public static int parseSpanFilesFooter(Path spansFile) throws IOException {
    ByteBuffer buffer = ByteBuffer.allocate(FOOTER_SIZE);

    try (var fc = FileChannel.open(spansFile, StandardOpenOption.READ)) {
        long fileSize = fc.size();
        if (fileSize < FOOTER_SIZE) return 0;

        long footerStart = fileSize - buffer.capacity();

        // A positional read may return fewer bytes than requested, so read until the footer is complete
        while (buffer.hasRemaining()) {
            if (fc.read(buffer, footerStart + buffer.position()) < 0) {
                throw new IOException("Spans file ended while reading footer: " + spansFile);
            }
        }
        buffer.flip();

        int magic = buffer.getInt();
        if (magic != MAGIC_INT) {
            return 0;
        }

        return buffer.get();
    }
}
}

View File

@@ -1,8 +1,9 @@
package nu.marginalia.index.forward;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.index.forward.spans.ForwardIndexSpansReader;
import nu.marginalia.index.forward.spans.ForwardIndexSpansWriter;
import nu.marginalia.index.forward.spans.IndexSpansReader;
import nu.marginalia.index.forward.spans.IndexSpansReaderPlain;
import nu.marginalia.index.forward.spans.IndexSpansWriter;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.sequence.VarintCodedSequence;
import org.junit.jupiter.api.AfterEach;
@@ -17,10 +18,10 @@ import java.nio.file.Path;
import static org.junit.jupiter.api.Assertions.*;
class ForwardIndexSpansReaderTest {
class IndexSpansReaderTest {
Path testFile = Files.createTempFile("test", ".idx");
ForwardIndexSpansReaderTest() throws IOException {
IndexSpansReaderTest() throws IOException {
}
@AfterEach
@@ -34,7 +35,7 @@ class ForwardIndexSpansReaderTest {
long offset1;
long offset2;
try (var writer = new ForwardIndexSpansWriter(testFile)) {
try (var writer = new IndexSpansWriter(testFile)) {
writer.beginRecord(1);
writer.writeSpan(HtmlTag.HEADING.code, VarintCodedSequence.generate(1, 3, 5, 8).buffer());
offset1 = writer.endRecord();
@@ -46,7 +47,7 @@ class ForwardIndexSpansReaderTest {
offset2 = writer.endRecord();
}
try (var reader = new ForwardIndexSpansReader(testFile);
try (var reader = IndexSpansReader.open(testFile);
var arena = Arena.ofConfined()
) {
var spans1 = reader.readSpans(arena, offset1);
@@ -77,13 +78,13 @@ class ForwardIndexSpansReaderTest {
@Test
void testContainsRange() throws IOException {
long offset1;
try (var writer = new ForwardIndexSpansWriter(testFile)) {
try (var writer = new IndexSpansWriter(testFile)) {
writer.beginRecord(1);
writer.writeSpan(HtmlTag.HEADING.code, VarintCodedSequence.generate( 1, 2, 10, 15, 20, 25).buffer());
offset1 = writer.endRecord();
}
try (var reader = new ForwardIndexSpansReader(testFile);
try (var reader = new IndexSpansReaderPlain(testFile);
var arena = Arena.ofConfined()
) {
var spans1 = reader.readSpans(arena, offset1);
@@ -104,13 +105,13 @@ class ForwardIndexSpansReaderTest {
@Test
void testContainsRangeExact() throws IOException {
long offset1;
try (var writer = new ForwardIndexSpansWriter(testFile)) {
try (var writer = new IndexSpansWriter(testFile)) {
writer.beginRecord(1);
writer.writeSpan(HtmlTag.HEADING.code, VarintCodedSequence.generate( 1, 2, 10, 15, 20, 25).buffer());
offset1 = writer.endRecord();
}
try (var reader = new ForwardIndexSpansReader(testFile);
try (var reader = new IndexSpansReaderPlain(testFile);
var arena = Arena.ofConfined()
) {
var spans1 = reader.readSpans(arena, offset1);
@@ -131,13 +132,13 @@ class ForwardIndexSpansReaderTest {
@Test
void testCountRangeMatches() throws IOException {
long offset1;
try (var writer = new ForwardIndexSpansWriter(testFile)) {
try (var writer = new IndexSpansWriter(testFile)) {
writer.beginRecord(1);
writer.writeSpan(HtmlTag.HEADING.code, VarintCodedSequence.generate( 1, 2, 10, 15, 20, 25).buffer());
offset1 = writer.endRecord();
}
try (var reader = new ForwardIndexSpansReader(testFile);
try (var reader = new IndexSpansReaderPlain(testFile);
var arena = Arena.ofConfined()
) {
var spans1 = reader.readSpans(arena, offset1);

View File

@@ -0,0 +1,53 @@
// Build script for the index performance test application
plugins {
id 'java'
id 'application'
id 'jvm-test-suite'
}
// Compile with the JVM version configured on the root project
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
// Entry point used by the 'application' plugin
application {
mainClass = 'nu.marginalia.index.perftest.PerfTestMain'
}
// Source set layout is shared across the project
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
// Project modules
implementation project(':code:common:config')
implementation project(':code:common:db')
implementation project(':code:libraries:array')
implementation project(':code:libraries:btree')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:common:linkdb')
implementation project(':code:index')
implementation project(':code:index:query')
implementation project(':code:index:index-forward')
implementation project(':code:index:index-reverse')
implementation project(':third-party:commons-codec')
implementation project(':code:functions:search-query')
implementation project(':code:functions:search-query:api')
// Libraries from the version catalog
implementation libs.slop
implementation libs.roaringbitmap
implementation libs.bundles.slf4j
implementation libs.guava
// Add each gRPC dependency with its transitive guava excluded
libs.bundles.grpc.get().each {
implementation dependencies.create(it) {
exclude group: 'com.google.guava'
}
}
implementation libs.notnull
implementation libs.trove
implementation libs.fastutil
implementation libs.bundles.gson
implementation libs.bundles.mariadb
}

View File

@@ -0,0 +1,334 @@
package nu.marginalia.index.perftest;
import gnu.trove.list.array.TLongArrayList;
import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.functions.searchquery.QueryFactory;
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
import nu.marginalia.index.FullReverseIndexReader;
import nu.marginalia.index.IndexQueryExecution;
import nu.marginalia.index.PrioReverseIndexReader;
import nu.marginalia.index.forward.ForwardIndexReader;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.ResultRankingContext;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.model.SearchTerms;
import nu.marginalia.index.positions.PositionsFileReader;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.results.DomainRankingOverrides;
import nu.marginalia.index.results.IndexResultRankingService;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.searchset.SearchSetAny;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.SQLException;
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/** Command line harness for benchmarking parts of the index against a real index directory.
 * <p>
 * Usage: {@code home-dir index-dir scenario query}, where scenario is one of
 * <ul>
 *     <li>{@code valuation} - repeatedly ranks a fixed set of documents retrieved for the query</li>
 *     <li>{@code lookup} - repeatedly runs the index lookups for the query</li>
 *     <li>{@code execution} - repeatedly runs a complete {@link IndexQueryExecution}</li>
 * </ul>
 * Each scenario runs a warmup phase of {@link #warmupTime} followed by a measurement
 * phase of {@link #runTime}, printing intermediate results every 100 iterations.
 */
public class PerfTestMain {
    static Duration warmupTime = Duration.ofMinutes(1);
    static Duration runTime = Duration.ofMinutes(10);

    private static final String USAGE = "Arguments: home-dir index-dir scenario query";

    public static void main(String[] args) {
        if (args.length != 4) {
            System.err.println(USAGE);
            System.exit(255);
        }

        try {
            // The first argument is the home directory and the second the index directory,
            // matching the (homeDir, indexDir, query) parameter order of the run methods
            Path homeDir = Paths.get(args[0]);
            if (!Files.isDirectory(homeDir)) {
                System.err.println("Home directory is not a directory");
                System.exit(255);
            }

            Path indexDir = Paths.get(args[1]);
            if (!Files.isDirectory(indexDir)) {
                System.err.println("Index directory is not a directory");
                System.exit(255);
            }

            String scenario = args[2];
            String query = args[3];

            switch (scenario) {
                case "valuation" -> runValuation(homeDir, indexDir, query);
                case "lookup" -> runLookup(homeDir, indexDir, query);
                case "execution" -> runExecution(homeDir, indexDir, query);
                default -> {
                    System.err.println("Unknown scenario '" + scenario + "', expected one of: valuation, lookup, execution");
                    System.err.println(USAGE);
                    System.exit(255);
                }
            }
        }
        catch (Exception ex) {
            System.err.println("Error during testing");
            ex.printStackTrace();
            System.exit(255);
        }
        System.out.println(Arrays.toString(args));
    }

    /** Opens the forward, full reverse and priority reverse indexes found under {@code indexDir/ir}. */
    private static CombinedIndexReader createCombinedIndexReader(Path indexDir) throws IOException {
        return new CombinedIndexReader(
                new ForwardIndexReader(
                        indexDir.resolve("ir/fwd-doc-id.dat"),
                        indexDir.resolve("ir/fwd-doc-data.dat"),
                        indexDir.resolve("ir/fwd-spans.dat")
                ),
                new FullReverseIndexReader(
                        "full",
                        indexDir.resolve("ir/rev-words.dat"),
                        indexDir.resolve("ir/rev-docs.dat"),
                        new PositionsFileReader(indexDir.resolve("ir/rev-positions.dat"))
                ),
                new PrioReverseIndexReader(
                        "prio",
                        indexDir.resolve("ir/rev-prio-words.dat"),
                        indexDir.resolve("ir/rev-prio-docs.dat")
                )
        );
    }

    /** Creates a ranking service backed by the document database under {@code indexDir/ldbr}. */
    private static IndexResultRankingService createIndexResultRankingService(Path indexDir, CombinedIndexReader combinedIndexReader) throws IOException, SQLException {
        return new IndexResultRankingService(
                new DocumentDbReader(indexDir.resolve("ldbr/documents.db")),
                new StatefulIndex(combinedIndexReader),
                // No data source and a placeholder path are passed for the ranking overrides
                new DomainRankingOverrides(null, Path.of("xxxx"))
        );
    }

    /** Creates a query factory using the term frequency model under {@code homeDir/model}. */
    static QueryFactory createQueryFactory(Path homeDir) throws IOException {
        return new QueryFactory(
                new QueryExpansion(
                        new TermFrequencyDict(homeDir.resolve("model/tfreq-new-algo3.bin")),
                        new NgramLexicon()
                )
        );
    }

    /** Benchmarks result ranking: retrieves up to 4096 document ids for the query once,
     * then ranks that same set repeatedly and reports the mean of the three fastest runs.
     */
    public static void runValuation(Path homeDir,
                                    Path indexDir,
                                    String rawQuery) throws IOException, SQLException
    {
        CombinedIndexReader indexReader = createCombinedIndexReader(indexDir);
        QueryFactory queryFactory = createQueryFactory(homeDir);
        IndexResultRankingService rankingService = createIndexResultRankingService(indexDir, indexReader);

        var queryLimits = RpcQueryLimits.newBuilder()
                .setTimeoutMs(10_000)
                .setResultsTotal(1000)
                .setResultsByDomain(10)
                .setFetchSize(4096)
                .build();
        SearchSpecification parsedQuery = queryFactory.createQuery(new QueryParams(rawQuery, queryLimits, "NONE", NsfwFilterTier.OFF), PrototypeRankingParameters.sensibleDefaults()).specs;

        System.out.println("Query compiled to: " + parsedQuery.query.compiledQuery);

        SearchParameters searchParameters = new SearchParameters(parsedQuery, new SearchSetAny());

        List<IndexQuery> queries = indexReader.createQueries(new SearchTerms(searchParameters.query, searchParameters.compiledQueryIds), searchParameters.queryParams);

        TLongArrayList allResults = new TLongArrayList();
        LongQueryBuffer buffer = new LongQueryBuffer(4096);

        for (var query : queries) {
            while (query.hasMore() && allResults.size() < 4096) {
                query.getMoreResults(buffer);
                allResults.addAll(buffer.copyData());
            }
            if (allResults.size() >= 4096)
                break;
        }
        allResults.sort();
        if (allResults.size() > 4096) {
            // Truncate in place; clearing a subList() would not shrink this list
            // if the sub-list is a copy rather than a view
            allResults.remove(4096, allResults.size() - 4096);
        }

        var docIds = new CombinedDocIdList(allResults.toArray());
        var rankingContext = ResultRankingContext.create(indexReader, searchParameters);

        System.out.println("Running warmup loop!");
        int sum = 0;

        Instant runEndTime = Instant.now().plus(warmupTime);

        int iter;
        // NOTE(review): a single budget is shared by every iteration of both loops;
        // confirm it does not run out part way through the benchmark
        IndexSearchBudget budget = new IndexSearchBudget(10000);
        for (iter = 0;; iter++) {
            sum += rankingService.rankResults(rankingContext, budget, docIds, false).size();
            // Only consult the clock every 100th iteration
            if ((iter % 100) == 0 && Instant.now().isAfter(runEndTime)) {
                break;
            }
        }
        System.out.println("Warmup complete after " + iter + " iters!");

        runEndTime = Instant.now().plus(runTime);
        Instant runStartTime = Instant.now();
        int sum2 = 0;
        List<Double> times = new ArrayList<>();
        for (iter = 0;; iter++) {
            long start = System.nanoTime();
            sum2 += rankingService.rankResults(rankingContext, budget, docIds, false).size();
            long end = System.nanoTime();
            times.add((end - start)/1_000_000.);

            if ((iter % 100) == 0) {
                if (Instant.now().isAfter(runEndTime)) {
                    break;
                }
                System.out.println(Duration.between(runStartTime, Instant.now()).toMillis() / 1000. + " best times: " + (allResults.size() / 4096.) * meanOfThreeLowest(times));
            }
        }
        System.out.println("Benchmark complete after " + iter + " iters!");
        System.out.println("Best times: " + (allResults.size() / 4096.) * meanOfThreeLowest(times));
        System.out.println("Warmup sum: " + sum);
        System.out.println("Main sum: " + sum2);
        System.out.println(docIds.size());
    }

    /** Benchmarks complete query executions with a 50 ms timeout, reporting the mean
     * of the three highest processing rates (items processed per second).
     */
    public static void runExecution(Path homeDir,
                                    Path indexDir,
                                    String rawQuery) throws IOException, SQLException, InterruptedException {

        CombinedIndexReader indexReader = createCombinedIndexReader(indexDir);
        QueryFactory queryFactory = createQueryFactory(homeDir);
        IndexResultRankingService rankingService = createIndexResultRankingService(indexDir, indexReader);

        var queryLimits = RpcQueryLimits.newBuilder()
                .setTimeoutMs(50)
                .setResultsTotal(1000)
                .setResultsByDomain(10)
                .setFetchSize(4096)
                .build();
        SearchSpecification parsedQuery = queryFactory.createQuery(new QueryParams(rawQuery, queryLimits, "NONE", NsfwFilterTier.OFF), PrototypeRankingParameters.sensibleDefaults()).specs;
        System.out.println("Query compiled to: " + parsedQuery.query.compiledQuery);

        System.out.println("Running warmup loop!");
        int sum = 0;

        Instant runEndTime = Instant.now().plus(warmupTime);

        int iter;
        for (iter = 0;; iter++) {
            SearchParameters searchParameters = new SearchParameters(parsedQuery, new SearchSetAny());
            var execution = new IndexQueryExecution(searchParameters, rankingService, indexReader);
            execution.run();
            sum += execution.itemsProcessed();
            if ((iter % 100) == 0 && Instant.now().isAfter(runEndTime)) {
                break;
            }
        }
        System.out.println("Warmup complete after " + iter + " iters!");

        runEndTime = Instant.now().plus(runTime);
        Instant runStartTime = Instant.now();
        int sum2 = 0;
        List<Double> rates = new ArrayList<>();
        for (iter = 0;; iter++) {
            SearchParameters searchParameters = new SearchParameters(parsedQuery, new SearchSetAny());
            var execution = new IndexQueryExecution(searchParameters, rankingService, indexReader);
            long start = System.nanoTime();
            execution.run();
            long end = System.nanoTime();
            sum2 += execution.itemsProcessed();
            rates.add(execution.itemsProcessed() / ((end - start)/1_000_000_000.));

            if ((iter % 100) == 0) {
                if (Instant.now().isAfter(runEndTime)) {
                    break;
                }
                System.out.println(Duration.between(runStartTime, Instant.now()).toMillis() / 1000. + " best rates: " + meanOfThreeHighest(rates));
            }
        }
        System.out.println("Benchmark complete after " + iter + " iters!");
        System.out.println("Best rates: " + meanOfThreeHighest(rates));
        System.out.println("Warmup sum: " + sum);
        System.out.println("Main sum: " + sum2);
    }

    /** Benchmarks index lookups: drains every query created for the search terms,
     * reporting the mean of the three fastest iterations.
     */
    public static void runLookup(Path homeDir,
                                 Path indexDir,
                                 String rawQuery) throws IOException, SQLException
    {
        CombinedIndexReader indexReader = createCombinedIndexReader(indexDir);
        QueryFactory queryFactory = createQueryFactory(homeDir);

        var queryLimits = RpcQueryLimits.newBuilder()
                .setTimeoutMs(10_000)
                .setResultsTotal(1000)
                .setResultsByDomain(10)
                .setFetchSize(4096)
                .build();
        SearchSpecification parsedQuery = queryFactory.createQuery(new QueryParams(rawQuery, queryLimits, "NONE", NsfwFilterTier.OFF), PrototypeRankingParameters.sensibleDefaults()).specs;

        System.out.println("Query compiled to: " + parsedQuery.query.compiledQuery);

        SearchParameters searchParameters = new SearchParameters(parsedQuery, new SearchSetAny());

        Instant runEndTime = Instant.now().plus(warmupTime);

        LongQueryBuffer buffer = new LongQueryBuffer(4096);
        int sum1 = 0;
        int iter;
        for (iter = 0;; iter++) {
            List<IndexQuery> queries = indexReader.createQueries(new SearchTerms(searchParameters.query, searchParameters.compiledQueryIds), searchParameters.queryParams);

            for (var query : queries) {
                while (query.hasMore()) {
                    query.getMoreResults(buffer);
                    sum1 += buffer.end;
                    buffer.reset();
                }
            }

            if ((iter % 100) == 0 && Instant.now().isAfter(runEndTime)) {
                break;
            }
        }
        System.out.println("Warmup complete after " + iter + " iters with sum1 = " + sum1);

        runEndTime = Instant.now().plus(runTime);
        Instant runStartTime = Instant.now();
        int sum2 = 0;
        List<Double> times = new ArrayList<>();
        for (iter = 0;; iter++) {
            // Query construction is deliberately kept outside the timed section
            List<IndexQuery> queries = indexReader.createQueries(new SearchTerms(searchParameters.query, searchParameters.compiledQueryIds), searchParameters.queryParams);

            long start = System.nanoTime();
            for (var query : queries) {
                while (query.hasMore()) {
                    query.getMoreResults(buffer);
                    // Accumulate into the measurement-phase sum, keeping the warmup sum untouched
                    sum2 += buffer.end;
                    buffer.reset();
                }
            }
            long end = System.nanoTime();
            times.add((end - start)/1_000_000.);

            if ((iter % 100) == 0) {
                if (Instant.now().isAfter(runEndTime)) {
                    break;
                }
                System.out.println(Duration.between(runStartTime, Instant.now()).toMillis() / 1000. + " best times: " + meanOfThreeLowest(times));
            }
        }
        System.out.println("Benchmark complete after " + iter + " iters!");
        System.out.println("Best times: " + meanOfThreeLowest(times));
        System.out.println("Warmup sum: " + sum1);
        System.out.println("Main sum: " + sum2);
    }

    /** Returns the mean of the (up to) three smallest samples, or -1 if there are none. */
    private static double meanOfThreeLowest(List<Double> samples) {
        return samples.stream().mapToDouble(Double::doubleValue).sorted().limit(3).average().orElse(-1);
    }

    /** Returns the mean of the (up to) three largest samples, or -1 if there are none. */
    private static double meanOfThreeHighest(List<Double> samples) {
        // Sorting the negated values yields descending order of the originals
        return samples.stream().mapToDouble(Double::doubleValue).map(i -> -i).sorted().map(i -> -i).limit(3).average().orElse(-1);
    }
}

View File

@@ -3,8 +3,8 @@ package nu.marginalia.index;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.positions.TermData;
import nu.marginalia.index.positions.PositionsFileReader;
import nu.marginalia.index.positions.TermData;
import nu.marginalia.index.query.EmptyEntrySource;
import nu.marginalia.index.query.EntrySource;
import nu.marginalia.index.query.ReverseIndexRejectFilter;
@@ -161,12 +161,7 @@ public class FullReverseIndexReader {
// Read the size and offset of the position data
var offsets = reader.queryData(docIds, 1);
for (int i = 0; i < docIds.length; i++) {
if (offsets[i] == 0)
continue;
ret[i] = positionsFileReader.getTermData(arena, offsets[i]);
}
return ret;
return positionsFileReader.getTermData(arena, offsets);
}
public void close() {

View File

@@ -5,39 +5,84 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ForkJoinPool;
/** Reads positions data from the positions file */
public class PositionsFileReader implements AutoCloseable {
private final FileChannel positions;
// We use multiple file channels to avoid reads becoming serialized by the kernel.
// If we don't do this, multi-threaded reads become strictly slower than single-threaded reads
// (which is why AsynchronousFileChannel sucks).
// This is likely the best option apart from O_DIRECT or FFI:ing in libaio or io_uring.
private final FileChannel[] positions;
private final ForkJoinPool forkJoinPool;
private static final Logger logger = LoggerFactory.getLogger(PositionsFileReader.class);
public PositionsFileReader(Path positionsFile) throws IOException {
this.positions = FileChannel.open(positionsFile, StandardOpenOption.READ);
this(positionsFile, 8);
}
/** Get the positions for a term in the index, as pointed out by the encoded offset;
* intermediate buffers are allocated from the provided arena allocator. */
public TermData getTermData(Arena arena, long sizeEncodedOffset) {
int length = PositionCodec.decodeSize(sizeEncodedOffset);
long offset = PositionCodec.decodeOffset(sizeEncodedOffset);
var segment = arena.allocate(length);
var buffer = segment.asByteBuffer();
try {
positions.read(buffer, offset);
} catch (IOException e) {
throw new RuntimeException(e);
public PositionsFileReader(Path positionsFile, int nreaders) throws IOException {
positions = new FileChannel[nreaders];
for (int i = 0; i < positions.length; i++) {
positions[i] = FileChannel.open(positionsFile, StandardOpenOption.READ);
}
return new TermData(buffer);
forkJoinPool = new ForkJoinPool(nreaders);
}
@Override
public void close() throws IOException {
positions.close();
for (FileChannel fc : positions) {
fc.close();
}
forkJoinPool.close();
}
/** Get the positions for a set of keywords in the index, as pointed out by the encoded offsets;
 * intermediate buffers are allocated from the provided arena allocator.
 * <p>
 * The reads are performed concurrently on the reader's thread pool, and this call
 * blocks until all of them have finished. Since the buffers are filled by pool
 * threads, the arena needs to permit access from threads other than the caller's.
 *
 * @param arena   allocator for the read buffers
 * @param offsets encoded offsets; a value of 0 means there is no data to read
 * @return an array parallel to {@code offsets}; entries are {@code null} where the
 *         offset was 0 or where the read failed (failures are logged)
 */
public TermData[] getTermData(Arena arena, long[] offsets) {
    TermData[] ret = new TermData[offsets.length];

    int tasks = 0;
    for (long l : offsets) if (l != 0) tasks++;

    CountDownLatch cl = new CountDownLatch(tasks);

    for (int i = 0; i < offsets.length; i++) {
        long encodedOffset = offsets[i];
        if (encodedOffset == 0) continue;

        int idx = i;
        int length = PositionCodec.decodeSize(encodedOffset);
        long offset = PositionCodec.decodeOffset(encodedOffset);
        ByteBuffer buffer = arena.allocate(length).asByteBuffer();

        forkJoinPool.execute(() -> {
            try {
                FileChannel channel = positions[idx % positions.length];

                // A single read may return fewer bytes than requested
                while (buffer.hasRemaining()) {
                    if (channel.read(buffer, offset + buffer.position()) < 0) {
                        throw new IOException("Unexpected end of positions file at offset " + (offset + buffer.position()));
                    }
                }
                ret[idx] = new TermData(buffer);
            }
            catch (IOException ex) {
                logger.error("Failed to read positions file", ex);
            }
            finally {
                // Always count down, otherwise a failed task leaves the caller waiting forever
                cl.countDown();
            }
        });
    }

    try {
        cl.await();
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        throw new RuntimeException(e);
    }

    return ret;
}
}

View File

@@ -11,7 +11,6 @@ import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -32,7 +31,6 @@ class PositionsFileReaderTest {
@Test
void getTermData() throws IOException {
ByteBuffer workArea = ByteBuffer.allocate(8192);
long key1, key2, key3;
try (PositionsFileConstructor constructor = new PositionsFileConstructor(file)) {
key1 = constructor.add((byte) 43, VarintCodedSequence.generate(1, 2, 3).buffer());
@@ -44,20 +42,19 @@ class PositionsFileReaderTest {
System.out.println("key2: " + Long.toHexString(key2));
System.out.println("key3: " + Long.toHexString(key3));
try (Arena arena = Arena.ofConfined();
try (Arena arena = Arena.ofShared();
PositionsFileReader reader = new PositionsFileReader(file))
{
TermData data1 = reader.getTermData(arena, key1);
assertEquals(43, data1.flags());
assertEquals(IntList.of( 1, 2, 3), data1.positions().values());
TermData[] data = reader.getTermData(arena, new long[] { key1, key2, key3 });
TermData data2 = reader.getTermData(arena, key2);
assertEquals(51, data2.flags());
assertEquals(IntList.of(2, 3, 5, 1000, 5000, 20241), data2.positions().values());
assertEquals(43, data[0].flags());
assertEquals(IntList.of( 1, 2, 3), data[0].positions().values());
TermData data3 = reader.getTermData(arena, key3);
assertEquals(61, data3.flags());
assertEquals(IntList.of(3, 5, 7), data3.positions().values());
assertEquals(51, data[1].flags());
assertEquals(IntList.of(2, 3, 5, 1000, 5000, 20241), data[1].positions().values());
assertEquals(61, data[2].flags());
assertEquals(IntList.of(3, 5, 7), data[2].positions().values());
}
}
}

View File

@@ -1,10 +1,10 @@
package nu.marginalia.ranking.domains;
package nu.marginalia.domainranking;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.ranking.domains.accumulator.RankingResultAccumulator;
import nu.marginalia.ranking.domains.data.GraphSource;
import nu.marginalia.ranking.domains.jgrapht.PersonalizedPageRank;
import nu.marginalia.domainranking.accumulator.RankingResultAccumulator;
import nu.marginalia.domainranking.data.GraphSource;
import nu.marginalia.domainranking.jgrapht.PersonalizedPageRank;
import org.jgrapht.Graph;
import org.jgrapht.alg.interfaces.VertexScoringAlgorithm;
import org.jgrapht.alg.scoring.PageRank;

View File

@@ -1,6 +1,6 @@
package nu.marginalia.ranking.domains;
package nu.marginalia.domainranking;
import nu.marginalia.ranking.domains.accumulator.RankingResultAccumulator;
import nu.marginalia.domainranking.accumulator.RankingResultAccumulator;
import java.util.function.Supplier;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.ranking.domains.accumulator;
package nu.marginalia.domainranking.accumulator;
public interface RankingResultAccumulator<T> {
void add(int domainId, int rank);

View File

@@ -1,4 +1,4 @@
package nu.marginalia.ranking.domains.accumulator;
package nu.marginalia.domainranking.accumulator;
import org.roaringbitmap.RoaringBitmap;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.ranking.domains.accumulator;
package nu.marginalia.domainranking.accumulator;
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.ranking.domains.accumulator;
package nu.marginalia.domainranking.accumulator;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.ranking.domains.accumulator;
package nu.marginalia.domainranking.accumulator;
import gnu.trove.list.array.TIntArrayList;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.ranking.domains.data;
package nu.marginalia.domainranking.data;
import com.zaxxer.hikari.HikariDataSource;
import org.jgrapht.Graph;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.ranking.domains.data;
package nu.marginalia.domainranking.data;
import org.jgrapht.Graph;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.ranking.domains.data;
package nu.marginalia.domainranking.data;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.ranking.domains.data;
package nu.marginalia.domainranking.data;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.ranking.domains.data;
package nu.marginalia.domainranking.data;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.ranking.domains.jgrapht;
package nu.marginalia.domainranking.jgrapht;
/*
* (C) Copyright 2016-2023, by Dimitrios Michail and Contributors.
@@ -21,8 +21,9 @@ package nu.marginalia.ranking.domains.jgrapht;
/* (modified by @vlofgren to add personalization) */
import org.jgrapht.*;
import org.jgrapht.alg.interfaces.*;
import org.jgrapht.Graph;
import org.jgrapht.Graphs;
import org.jgrapht.alg.interfaces.VertexScoringAlgorithm;
import java.util.*;

View File

@@ -2,28 +2,18 @@ package nu.marginalia.index;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import io.grpc.Status;
import io.grpc.stub.StreamObserver;
import io.prometheus.client.Counter;
import io.prometheus.client.Gauge;
import io.prometheus.client.Histogram;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import nu.marginalia.api.searchquery.IndexApiGrpc;
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
import nu.marginalia.api.searchquery.RpcIndexQuery;
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.model.SearchTerms;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.results.IndexResultRankingService;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.searchset.SearchSet;
import nu.marginalia.index.searchset.SearchSetsService;
import nu.marginalia.index.searchset.SmallSearchSet;
@@ -34,14 +24,7 @@ import org.slf4j.LoggerFactory;
import org.slf4j.Marker;
import org.slf4j.MarkerFactory;
import java.util.BitSet;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.Executor;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
@Singleton
public class IndexGrpcService
@@ -87,23 +70,22 @@ public class IndexGrpcService
private final StatefulIndex statefulIndex;
private final SearchSetsService searchSetsService;
private final IndexResultRankingService resultValuator;
private final IndexResultRankingService rankingService;
private final String nodeName;
private static final int indexValuationThreads = Integer.getInteger("index.valuationThreads", 16);
@Inject
public IndexGrpcService(ServiceConfiguration serviceConfiguration,
StatefulIndex statefulIndex,
SearchSetsService searchSetsService,
IndexResultRankingService resultValuator)
IndexResultRankingService rankingService)
{
var nodeId = serviceConfiguration.node();
this.nodeName = Integer.toString(nodeId);
this.statefulIndex = statefulIndex;
this.searchSetsService = searchSetsService;
this.resultValuator = resultValuator;
this.rankingService = rankingService;
}
// GRPC endpoint
@@ -120,7 +102,13 @@ public class IndexGrpcService
.time(() -> {
// Perform the search
try {
return executeSearch(params);
if (!statefulIndex.isLoaded()) {
// Short-circuit if the index is not loaded, as we trivially know that there can be no results
return List.of();
}
return new IndexQueryExecution(params, rankingService, statefulIndex.get()).run();
}
catch (Exception ex) {
logger.error("Error in handling request", ex);
@@ -148,7 +136,7 @@ public class IndexGrpcService
}
catch (Exception ex) {
logger.error("Error in handling request", ex);
responseObserver.onError(ex);
responseObserver.onError(Status.INTERNAL.withCause(ex).asRuntimeException());
}
}
@@ -156,7 +144,12 @@ public class IndexGrpcService
// exists for test access
public List<RpcDecoratedResultItem> justQuery(SearchSpecification specsSet) {
try {
return executeSearch(new SearchParameters(specsSet, getSearchSet(specsSet)));
if (!statefulIndex.isLoaded()) {
// Short-circuit if the index is not loaded, as we trivially know that there can be no results
return List.of();
}
return new IndexQueryExecution(new SearchParameters(specsSet, getSearchSet(specsSet)), rankingService, statefulIndex.get()).run();
}
catch (Exception ex) {
logger.error("Error in handling request", ex);
@@ -182,262 +175,6 @@ public class IndexGrpcService
return searchSetsService.getSearchSetByName(request.getSearchSetIdentifier());
}
// accessible for tests
public List<RpcDecoratedResultItem> executeSearch(SearchParameters params) throws Exception {
if (!statefulIndex.isLoaded()) {
// Short-circuit if the index is not loaded, as we trivially know that there can be no results
return List.of();
}
ResultRankingContext rankingContext = createRankingContext(params.rankingParams,
params.compiledQuery,
params.compiledQueryIds);
var queryExecution = new QueryExecution(rankingContext, params.fetchSize);
List<RpcDecoratedResultItem> ret = queryExecution.run(params);
wmsa_index_query_exec_block_time
.labels(nodeName)
.set(queryExecution.getBlockTime() / 1000.);
wmsa_index_query_exec_stall_time
.labels(nodeName)
.set(queryExecution.getStallTime() / 1000.);
return ret;
}
/** This class is responsible for ranking the results and adding the best results to the
* resultHeap, which depending on the state of the indexLookup threads may or may not block
*/
private ResultRankingContext createRankingContext(RpcResultRankingParameters rankingParams,
CompiledQuery<String> compiledQuery,
CompiledQueryLong compiledQueryIds)
{
int[] full = new int[compiledQueryIds.size()];
int[] prio = new int[compiledQueryIds.size()];
BitSet ngramsMask = new BitSet(compiledQuery.size());
BitSet regularMask = new BitSet(compiledQuery.size());
var currentIndex = statefulIndex.get();
for (int idx = 0; idx < compiledQueryIds.size(); idx++) {
long id = compiledQueryIds.at(idx);
full[idx] = currentIndex.numHits(id);
prio[idx] = currentIndex.numHitsPrio(id);
if (compiledQuery.at(idx).contains("_")) {
ngramsMask.set(idx);
}
else {
regularMask.set(idx);
}
}
return new ResultRankingContext(currentIndex.totalDocCount(),
rankingParams,
ngramsMask,
regularMask,
new CqDataInt(full),
new CqDataInt(prio));
}
/** This class is responsible for executing a search query. It uses a thread pool to
* execute the subqueries and their valuation in parallel. The results are then combined
* into a bounded priority queue, and finally the best results are returned.
*/
private class QueryExecution {
private static final Executor workerPool = Executors.newCachedThreadPool();
/** The queue where the results from the index lookup threads are placed,
* pending ranking by the result ranker threads */
private final ArrayBlockingQueue<CombinedDocIdList> resultCandidateQueue
= new ArrayBlockingQueue<>(64);
private final ResultPriorityQueue resultHeap;
private final ResultRankingContext resultRankingContext;
private final AtomicInteger remainingIndexTasks = new AtomicInteger(0);
private final AtomicInteger remainingValuationTasks = new AtomicInteger(0);
private final AtomicLong blockTime = new AtomicLong(0);
private final AtomicLong stallTime = new AtomicLong(0);
public long getStallTime() {
return stallTime.get();
}
public long getBlockTime() {
return blockTime.get();
}
private QueryExecution(ResultRankingContext resultRankingContext, int maxResults) {
this.resultRankingContext = resultRankingContext;
this.resultHeap = new ResultPriorityQueue(maxResults);
}
/** Execute a search query */
public List<RpcDecoratedResultItem> run(SearchParameters parameters) throws Exception {
var terms = new SearchTerms(parameters.query, parameters.compiledQueryIds);
var currentIndex = statefulIndex.get();
for (var indexQuery : currentIndex.createQueries(terms, parameters.queryParams)) {
workerPool.execute(new IndexLookup(indexQuery, parameters.budget));
}
for (int i = 0; i < indexValuationThreads; i++) {
workerPool.execute(new ResultRanker(parameters, resultRankingContext));
}
// Wait for all tasks to complete
awaitCompletion();
// Return the best results
return resultValuator.selectBestResults(parameters, resultRankingContext, resultHeap);
}
/** Wait for all tasks to complete */
private void awaitCompletion() throws InterruptedException {
synchronized (remainingValuationTasks) {
while (remainingValuationTasks.get() > 0) {
remainingValuationTasks.wait(20);
}
}
}
/** This class is responsible for executing a subquery and adding the results to the
* resultCandidateQueue, which depending on the state of the valuator threads may
* or may not block */
class IndexLookup implements Runnable {
private final IndexQuery query;
private final IndexSearchBudget budget;
IndexLookup(IndexQuery query,
IndexSearchBudget budget) {
this.query = query;
this.budget = budget;
remainingIndexTasks.incrementAndGet();
}
public void run() {
try {
executeSearch();
}
catch (Exception ex) {
logger.error("Error in index lookup", ex);
}
finally {
synchronized (remainingIndexTasks) {
if (remainingIndexTasks.decrementAndGet() == 0) {
remainingIndexTasks.notifyAll();
}
}
}
}
private void executeSearch() {
final LongArrayList results = new LongArrayList(16);
// These queries are different indices for one subquery
final LongQueryBuffer buffer = new LongQueryBuffer(4096);
while (query.hasMore() && budget.hasTimeLeft())
{
buffer.reset();
query.getMoreResults(buffer);
for (int i = 0; i < buffer.end; i+=16) {
for (int j = 0; j < Math.min(buffer.end - i, 16); j++) {
results.add(buffer.data.get(i+j));
}
enqueueResults(new CombinedDocIdList(results));
results.clear();
}
}
buffer.dispose();
}
private void enqueueResults(CombinedDocIdList resultIds) {
long remainingTime = budget.timeLeft();
try {
if (!resultCandidateQueue.offer(resultIds)) {
long start = System.currentTimeMillis();
resultCandidateQueue.offer(resultIds, remainingTime, TimeUnit.MILLISECONDS);
blockTime.addAndGet(System.currentTimeMillis() - start);
}
}
catch (InterruptedException e) {
logger.warn("Interrupted while waiting to offer resultIds to queue", e);
}
}
}
class ResultRanker implements Runnable {
private final SearchParameters parameters;
private final ResultRankingContext rankingContext;
ResultRanker(SearchParameters parameters, ResultRankingContext rankingContext) {
this.parameters = parameters;
this.rankingContext = rankingContext;
remainingValuationTasks.incrementAndGet();
}
public void run() {
try {
while (parameters.budget.timeLeft() > 0 && execute());
}
catch (InterruptedException e) {
logger.warn("Interrupted while waiting to poll resultIds from queue", e);
}
catch (Exception e) {
logger.error("Exception while ranking results", e);
}
finally {
synchronized (remainingValuationTasks) {
if (remainingValuationTasks.decrementAndGet() == 0)
remainingValuationTasks.notifyAll();
}
}
}
private boolean execute() throws Exception {
long start = System.currentTimeMillis();
// Do a relatively short poll to ensure we terminate in a timely manner
// in the event all work is done
final long pollTime = Math.clamp(parameters.budget.timeLeft(), 1, 5);
CombinedDocIdList resultIds = resultCandidateQueue.poll(pollTime, TimeUnit.MILLISECONDS);
if (resultIds == null) {
// check if we are done and can terminate
if (remainingIndexTasks.get() == 0 && resultCandidateQueue.isEmpty()) {
return false;
}
}
else {
stallTime.addAndGet(System.currentTimeMillis() - start);
resultHeap.addAll(
resultValuator.rankResults(parameters, false, rankingContext, resultIds)
);
}
return true; // keep going
}
}
}
}

View File

@@ -0,0 +1,137 @@
package nu.marginalia.index;
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.model.ResultRankingContext;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.model.SearchTerms;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.results.IndexResultRankingService;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import java.sql.SQLException;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ForkJoinPool;
/** Performs an index query.
 * <p></p>
 * One lookup task is spawned per {@link IndexQuery} on {@code lookupPool}.  Each lookup
 * task reads batches of document ids and hands them off as evaluation tasks on
 * {@code evaluationPool}, or ranks them itself when too many evaluation tasks are
 * already outstanding.  Ranked results are collected in a {@link ResultPriorityQueue}.
 * <p></p>
 * {@code evaluationJobCounter} is read and modified while holding this instance's monitor
 * in {@code run()}, {@code lookup()} and {@code evaluate()}.
 */
public class IndexQueryExecution {
    /** Parallelism of each pool, from the system property {@code index.valuationThreads} (default 16) */
    private static final int indexValuationThreads = Integer.getInteger("index.valuationThreads", 16);

    private static final ForkJoinPool lookupPool = new ForkJoinPool(indexValuationThreads);
    private static final ForkJoinPool evaluationPool = new ForkJoinPool(indexValuationThreads);

    private final IndexResultRankingService rankingService;
    private final ResultRankingContext rankingContext;
    private final List<IndexQuery> queries;
    private final IndexSearchBudget budget;
    private final ResultPriorityQueue resultHeap;

    /** Counted down once by each lookup task when it terminates */
    private final CountDownLatch executionCountdown;

    private final int limitTotal;
    private final int limitByDomain;

    /** Number of evaluation tasks that have been submitted but have not yet finished */
    private int evaluationJobCounter;

    /** Prepares the execution of a query against the provided index.
     *
     * @param params         the search parameters; supplies the fetch size, budget, result limits,
     *                       and the query from which the index queries are created
     * @param rankingService service used for ranking documents and selecting the final results
     * @param currentIndex   the index the queries and the ranking context are created from
     */
    public IndexQueryExecution(SearchParameters params,
                               IndexResultRankingService rankingService,
                               CombinedIndexReader currentIndex) {
        this.rankingService = rankingService;

        resultHeap = new ResultPriorityQueue(params.fetchSize);

        budget = params.budget;
        limitByDomain = params.limitByDomain;
        limitTotal = params.limitTotal;

        rankingContext = ResultRankingContext.create(currentIndex, params);
        queries = currentIndex.createQueries(new SearchTerms(params.query, params.compiledQueryIds), params.queryParams);

        executionCountdown = new CountDownLatch(queries.size());
        evaluationJobCounter = 0;
    }

    /** Runs the query and returns the selected results.
     * <p></p>
     * Blocks until every lookup task has terminated, then until either all
     * evaluation tasks have finished or the budget has run out.
     *
     * @return the results chosen by {@code rankingService.selectBestResults(...)}
     * @throws InterruptedException if the calling thread is interrupted while waiting
     * @throws SQLException         propagated from the final result selection
     */
    public List<RpcDecoratedResultItem> run() throws InterruptedException, SQLException {
        // Spawn lookup tasks for each query
        for (IndexQuery query : queries) {
            lookupPool.execute(() -> lookup(query));
        }

        // Await lookup task termination (this guarantees we're no longer creating new evaluation tasks)
        executionCountdown.await();

        // Await evaluation task termination
        synchronized (IndexQueryExecution.this) {
            while (evaluationJobCounter > 0 && budget.hasTimeLeft()) {
                // The budget may expire between hasTimeLeft() and timeLeft().  Object.wait(0)
                // means "no timeout" and a negative argument throws IllegalArgumentException,
                // so always wait at least 1 ms and let the loop condition decide when to stop.
                final long waitMillis = Math.max(1L, budget.timeLeft());
                IndexQueryExecution.this.wait(waitMillis);
            }
        }

        // Final result selection
        return rankingService.selectBestResults(limitByDomain, limitTotal, rankingContext, resultHeap);
    }

    /** Lookup task body: drains the query in batches for as long as it has more
     * results and the budget has time left, passing each non-empty batch on for ranking.
     * The buffer is disposed and the latch counted down when the task ends,
     * whether normally or through an exception.
     */
    private void lookup(IndexQuery query) {
        final LongQueryBuffer buffer = new LongQueryBuffer(1024);
        try {
            while (query.hasMore() && budget.hasTimeLeft()) {

                buffer.reset();
                query.getMoreResults(buffer);

                if (buffer.isEmpty())
                    continue;

                CombinedDocIdList docIds = new CombinedDocIdList(buffer);

                boolean stealWork = false;
                synchronized (IndexQueryExecution.this) {
                    // Hold off on spawning new evaluation jobs if we have too many outstanding;
                    // in that scenario the work is instead stolen into the lookup thread
                    if (evaluationJobCounter > indexValuationThreads * 8) {
                        stealWork = true;
                    }
                    else {
                        evaluationJobCounter++;
                    }
                }

                if (stealWork) {
                    // Rank in this thread; the job counter was not incremented for this batch
                    resultHeap.addAll(rankingService.rankResults(rankingContext, budget, docIds, false));
                }
                else {
                    // Spawn an evaluation task
                    evaluationPool.execute(() -> evaluate(docIds));
                }
            }
        } finally {
            buffer.dispose();
            executionCountdown.countDown();
        }
    }

    /** Evaluation task body: ranks one batch of documents and adds the outcome to the
     * result heap, unless the budget is already spent.  The job counter is always
     * decremented, and waiters are notified when it reaches zero.
     */
    private void evaluate(CombinedDocIdList docIds) {
        try {
            if (!budget.hasTimeLeft())
                return;
            resultHeap.addAll(rankingService.rankResults(rankingContext, budget, docIds, false));
        } finally {
            synchronized (IndexQueryExecution.this) {
                if (--evaluationJobCounter == 0) {
                    IndexQueryExecution.this.notifyAll();
                }
            }
        }
    }

    /** @return the number of items processed, as reported by the result heap */
    public int itemsProcessed() {
        return resultHeap.getItemsProcessed();
    }
}

View File

@@ -1,116 +1,59 @@
package nu.marginalia.index;
import com.google.common.collect.MinMaxPriorityQueue;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import org.jetbrains.annotations.NotNull;
import java.util.*;
import java.util.Collection;
import java.util.Comparator;
import java.util.Iterator;
/** A priority queue for search results. This class is not thread-safe,
* in general, except for concurrent use of the addAll method.
* <p></p>
* The class implements a subset of the Collection interface, and
* is intended to be used as a priority queue for search results,
* with a maximum size.
* <p></p>
* Since the expected use case is to add a large number of items
* and then iterate over the items, the class is optimized for
* this scenario, and does not implement other mutating methods
* than addAll().
*/
public class ResultPriorityQueue implements Iterable<SearchResultItem>,
Collection<SearchResultItem> {
private final int limit;
private final ArrayList<SearchResultItem> backingList = new ArrayList<>();
public class ResultPriorityQueue implements Iterable<SearchResultItem> {
private final LongOpenHashSet idsInSet = new LongOpenHashSet();
private final MinMaxPriorityQueue<SearchResultItem> queue;
private int itemsProcessed = 0;
public ResultPriorityQueue(int limit) {
this.limit = limit;
this.queue = MinMaxPriorityQueue.<SearchResultItem>orderedBy(Comparator.naturalOrder()).maximumSize(limit).create();
}
public Iterator<SearchResultItem> iterator() {
return backingList.iterator();
}
@NotNull
@Override
public Object[] toArray() {
return backingList.toArray();
}
@NotNull
@Override
public <T> T[] toArray(@NotNull T[] a) {
return backingList.toArray(a);
}
@Override
public boolean add(SearchResultItem searchResultItem) {
throw new UnsupportedOperationException("Use addAll instead");
}
@Override
public boolean remove(Object o) {
throw new UnsupportedOperationException();
}
@Override
public boolean containsAll(@NotNull Collection<?> c) {
return idsInSet.containsAll(c);
return queue.iterator();
}
/** Adds all items to the queue, and returns true if any items were added.
* This is a thread-safe operation.
*/
@Override
public synchronized boolean addAll(@NotNull Collection<? extends SearchResultItem> items) {
boolean itemsAdded = false;
for (var item: items) {
if (idsInSet.add(item.getDocumentId())) {
backingList.add(item);
itemsAdded = true;
}
}
if (!itemsAdded) {
return false;
}
itemsProcessed+=items.size();
backingList.sort(Comparator.naturalOrder());
if (backingList.size() > limit) {
backingList.subList(limit, backingList.size()).clear();
for (var item : items) {
if (idsInSet.add(item.getDocumentId())) {
queue.add(item);
}
}
return true;
}
@Override
public boolean removeAll(@NotNull Collection<?> c) {
throw new UnsupportedOperationException();
}
@Override
public boolean retainAll(@NotNull Collection<?> c) {
throw new UnsupportedOperationException();
}
@Override
public void clear() {
backingList.clear();
idsInSet.clear();
}
public int size() {
return backingList.size();
return queue.size();
}
public int getItemsProcessed() {
return itemsProcessed;
}
@Override
public boolean isEmpty() {
return backingList.isEmpty();
}
@Override
public boolean contains(Object o) {
return backingList.contains(o);
return queue.isEmpty();
}
}

View File

@@ -205,14 +205,19 @@ public class CombinedIndexReader {
return forwardIndexReader.getDocumentSize(docId);
}
/** Retrieves the document spans for the specified document */
public DocumentSpans getDocumentSpans(Arena arena, long docId) {
return forwardIndexReader.getDocumentSpans(arena, docId);
/** Retrieves the document spans for the specified documents */
public DocumentSpans[] getDocumentSpans(Arena arena, CombinedDocIdList docIds) {
long[] decodedIDs = docIds.array();
for (int i = 0; i < decodedIDs.length; i++) {
decodedIDs[i] = UrlIdCodec.removeRank(decodedIDs[i]);
}
return forwardIndexReader.getDocumentSpans(arena, decodedIDs);
}
/** Close the indexes (this is not done immediately)
* */
public void close() throws InterruptedException {
public void close() {
/* Delay the invocation of close method to allow for a clean shutdown of the service.
*
* This is especially important when using Unsafe-based LongArrays, since we have
@@ -227,7 +232,7 @@ public class CombinedIndexReader {
}
private void delayedCall(Runnable call, Duration delay) throws InterruptedException {
private void delayedCall(Runnable call, Duration delay) {
Thread.ofPlatform().start(() -> {
try {
TimeUnit.SECONDS.sleep(delay.toSeconds());
@@ -248,12 +253,13 @@ public class CombinedIndexReader {
class ParamMatchingQueryFilter implements QueryFilterStepIf {
private final QueryParams params;
private final ForwardIndexReader forwardIndexReader;
private final boolean imposesMetaConstraint;
public ParamMatchingQueryFilter(QueryParams params,
ForwardIndexReader forwardIndexReader)
{
this.params = params;
this.forwardIndexReader = forwardIndexReader;
this.imposesMetaConstraint = params.imposesDomainMetadataConstraint();
}
@Override
@@ -261,12 +267,16 @@ class ParamMatchingQueryFilter implements QueryFilterStepIf {
long docId = UrlIdCodec.removeRank(combinedId);
int domainId = UrlIdCodec.getDomainId(docId);
long meta = forwardIndexReader.getDocMeta(docId);
if (!validateDomain(domainId, meta)) {
if (!validateDomain(domainId)) {
return false;
}
if (!imposesMetaConstraint) {
return true;
}
long meta = forwardIndexReader.getDocMeta(docId);
if (!validateQuality(meta)) {
return false;
}
@@ -286,8 +296,8 @@ class ParamMatchingQueryFilter implements QueryFilterStepIf {
return true;
}
private boolean validateDomain(int domainId, long meta) {
return params.searchSet().contains(domainId, meta);
private boolean validateDomain(int domainId) {
return params.searchSet().contains(domainId);
}
private boolean validateQuality(long meta) {

View File

@@ -35,6 +35,13 @@ public class StatefulIndex {
this.eventLog = eventLog;
}
/** For use in testing only */
public StatefulIndex(CombinedIndexReader combinedIndexReader) {
this.combinedIndexReader = combinedIndexReader;
this.servicesFactory = null;
this.eventLog = null;
}
public void init() {
Lock lock = indexReplacementLock.writeLock();

View File

@@ -1,8 +1,9 @@
package nu.marginalia.index.model;
import nu.marginalia.index.searchset.SearchSet;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.index.searchset.SearchSet;
import java.util.Objects;
@@ -41,6 +42,13 @@ public final class QueryParams {
this.queryStrategy = queryStrategy;
}
public boolean imposesDomainMetadataConstraint() {
return qualityLimit.type() != SpecificationLimitType.NONE
|| year.type() != SpecificationLimitType.NONE
|| size.type() != SpecificationLimitType.NONE
|| rank.type() != SpecificationLimitType.NONE;
}
public SpecificationLimit qualityLimit() {
return qualityLimit;
}

View File

@@ -0,0 +1,106 @@
package nu.marginalia.index.model;
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.index.index.CombinedIndexReader;
import java.util.BitSet;
/** Bundles the query-level information that is passed along to the result ranking code:
 * the ranking parameters, the query in its various forms, masks classifying the query
 * terms, and per-term hit counts gathered from the index.
 */
public class ResultRankingContext {
    private final int docCount;

    public final RpcResultRankingParameters params;
    public final SearchQuery searchQuery;
    public final QueryParams queryParams;

    public final CompiledQuery<String> compiledQuery;
    public final CompiledQueryLong compiledQueryIds;

    /** Bit {@code i} is set when term {@code i} of the compiled query does not contain an underscore */
    public final BitSet regularMask;
    /** Bit {@code i} is set when term {@code i} of the compiled query contains an underscore */
    public final BitSet ngramsMask;

    /** Per-term hit counts as reported by {@code CombinedIndexReader.numHits}.
     * The dataset is indexed by the compiled query. */
    public final CqDataInt fullCounts;

    /** Per-term hit counts as reported by {@code CombinedIndexReader.numHitsPrio}.
     * The dataset is indexed by the compiled query. */
    public final CqDataInt priorityCounts;

    /** Builds a context for the given search parameters, looking up the hit counts
     * of each query term in the provided index and classifying each term as either
     * an n-gram (its string form contains {@code "_"}) or a regular term.
     */
    public static ResultRankingContext create(CombinedIndexReader currentIndex, SearchParameters searchParameters) {
        final CompiledQueryLong termIds = searchParameters.compiledQueryIds;
        final CompiledQuery<String> terms = searchParameters.compiledQuery;

        final int termCount = termIds.size();
        final int[] fullHits = new int[termCount];
        final int[] prioHits = new int[termCount];

        final BitSet ngrams = new BitSet(terms.size());
        final BitSet regular = new BitSet(terms.size());

        for (int i = 0; i < termCount; i++) {
            final long termId = termIds.at(i);

            fullHits[i] = currentIndex.numHits(termId);
            prioHits[i] = currentIndex.numHitsPrio(termId);

            // Every term lands in exactly one of the two masks
            final BitSet mask = terms.at(i).contains("_") ? ngrams : regular;
            mask.set(i);
        }

        return new ResultRankingContext(
                currentIndex.totalDocCount(),
                searchParameters,
                terms,
                termIds,
                ngrams,
                regular,
                new CqDataInt(fullHits),
                new CqDataInt(prioHits));
    }

    /** Creates a context from already computed parts; the query, ranking parameters
     * and query parameters are taken from {@code searchParameters}.
     */
    public ResultRankingContext(int docCount,
                                SearchParameters searchParameters,
                                CompiledQuery<String> compiledQuery,
                                CompiledQueryLong compiledQueryIds,
                                BitSet ngramsMask,
                                BitSet regularMask,
                                CqDataInt fullCounts,
                                CqDataInt prioCounts)
    {
        this.docCount = docCount;

        this.params = searchParameters.rankingParams;
        this.searchQuery = searchParameters.query;
        this.queryParams = searchParameters.queryParams;

        this.compiledQuery = compiledQuery;
        this.compiledQueryIds = compiledQueryIds;

        this.regularMask = regularMask;
        this.ngramsMask = ngramsMask;

        this.fullCounts = fullCounts;
        this.priorityCounts = prioCounts;
    }

    /** @return the document count this context was constructed with */
    public int termFreqDocCount() {
        return docCount;
    }

    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder("ResultRankingContext{");
        sb.append("docCount=").append(docCount);
        sb.append(", params=").append(params);
        sb.append(", regularMask=").append(regularMask);
        sb.append(", ngramsMask=").append(ngramsMask);
        sb.append(", fullCounts=").append(fullCounts);
        sb.append(", priorityCounts=").append(priorityCounts);
        sb.append('}');
        return sb.toString();
    }
}

View File

@@ -2,7 +2,7 @@ package nu.marginalia.index.results;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.index.model.ResultRankingContext;
import java.util.BitSet;
import java.util.List;

View File

@@ -12,13 +12,15 @@ import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
import nu.marginalia.index.ResultPriorityQueue;
import nu.marginalia.index.forward.spans.DocumentSpans;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.model.ResultRankingContext;
import nu.marginalia.index.model.SearchTermsUtil;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.results.model.PhraseConstraintGroupList;
import nu.marginalia.index.results.model.QuerySearchTerms;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
@@ -32,7 +34,10 @@ import org.slf4j.LoggerFactory;
import java.lang.foreign.Arena;
import java.sql.SQLException;
import java.util.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@Singleton
public class IndexResultRankingService {
@@ -52,15 +57,16 @@ public class IndexResultRankingService {
this.domainRankingOverrides = domainRankingOverrides;
}
public List<SearchResultItem> rankResults(SearchParameters params,
boolean exportDebugData,
public List<SearchResultItem> rankResults(
ResultRankingContext rankingContext,
CombinedDocIdList resultIds)
IndexSearchBudget budget,
CombinedDocIdList resultIds,
boolean exportDebugData)
{
if (resultIds.isEmpty())
return List.of();
IndexResultScoreCalculator resultRanker = new IndexResultScoreCalculator(statefulIndex, domainRankingOverrides, rankingContext, params);
IndexResultScoreCalculator resultRanker = new IndexResultScoreCalculator(statefulIndex, domainRankingOverrides, rankingContext);
List<SearchResultItem> results = new ArrayList<>(resultIds.size());
@@ -68,13 +74,11 @@ public class IndexResultRankingService {
// this may change during the calculation, but we don't want to switch over mid-calculation
final CombinedIndexReader currentIndex = statefulIndex.get();
final QuerySearchTerms searchTerms = getSearchTerms(params.compiledQuery, params.query);
final QuerySearchTerms searchTerms = getSearchTerms(rankingContext.compiledQuery, rankingContext.searchQuery);
final int termCount = searchTerms.termIdsAll.size();
// We use an arena for the position data to avoid gc pressure
// from the gamma coded sequences, which can be large and have a lifetime
// that matches the try block here
try (var arena = Arena.ofConfined()) {
// We use an arena for the position and spans data to limit gc pressure
try (var arena = Arena.ofShared()) {
TermMetadataList[] termsForDocs = new TermMetadataList[termCount];
for (int ti = 0; ti < termCount; ti++) {
@@ -87,11 +91,12 @@ public class IndexResultRankingService {
long[] flags = new long[termCount];
CodedSequence[] positions = new CodedSequence[termCount];
DocumentSpans[] documentSpans = currentIndex.getDocumentSpans(arena, resultIds);
// Iterate over documents by their index in the combinedDocIds, as we need the index for the
// term data arrays as well
for (int i = 0; i < resultIds.size(); i++) {
for (int i = 0; i < resultIds.size() && budget.hasTimeLeft(); i++) {
// Prepare term-level data for the document
for (int ti = 0; ti < flags.length; ti++) {
@@ -109,14 +114,15 @@ public class IndexResultRankingService {
}
if (!exportDebugData) {
var score = resultRanker.calculateScore(arena, null, resultIds.at(i), searchTerms, flags, positions);
var score = resultRanker.calculateScore(null, resultIds.at(i), searchTerms, flags, positions, documentSpans[i]);
if (score != null) {
results.add(score);
}
}
else {
var rankingFactors = new DebugRankingFactors();
var score = resultRanker.calculateScore(arena, rankingFactors, resultIds.at(i), searchTerms, flags, positions);
var score = resultRanker.calculateScore( rankingFactors, resultIds.at(i), searchTerms, flags, positions, documentSpans[i]);
if (score != null) {
score.debugRankingFactors = rankingFactors;
results.add(score);
@@ -129,19 +135,20 @@ public class IndexResultRankingService {
}
public List<RpcDecoratedResultItem> selectBestResults(SearchParameters params,
public List<RpcDecoratedResultItem> selectBestResults(int limitByDomain,
int limitTotal,
ResultRankingContext resultRankingContext,
Collection<SearchResultItem> results) throws SQLException {
ResultPriorityQueue results) throws SQLException {
var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain);
var domainCountFilter = new IndexResultDomainDeduplicator(limitByDomain);
List<SearchResultItem> resultsList = new ArrayList<>(results.size());
TLongList idsList = new TLongArrayList(params.limitTotal);
TLongList idsList = new TLongArrayList(limitTotal);
for (var item : results) {
if (domainCountFilter.test(item)) {
if (resultsList.size() < params.limitTotal) {
if (resultsList.size() < limitTotal) {
resultsList.add(item);
idsList.add(item.getDocumentId());
}
@@ -159,18 +166,18 @@ public class IndexResultRankingService {
// for the selected results, as this would be comically expensive to do for all the results we
// discard along the way
if (params.rankingParams.getExportDebugData()) {
if (resultRankingContext.params.getExportDebugData()) {
var combinedIdsList = new LongArrayList(resultsList.size());
for (var item : resultsList) {
combinedIdsList.add(item.combinedId);
}
resultsList.clear();
IndexSearchBudget budget = new IndexSearchBudget(10000);
resultsList.addAll(this.rankResults(
params,
true,
resultRankingContext,
new CombinedDocIdList(combinedIdsList))
budget, new CombinedDocIdList(combinedIdsList),
true)
);
}
@@ -247,7 +254,7 @@ public class IndexResultRankingService {
var termOutputs = RpcResultTermRankingOutputs.newBuilder();
CqDataLong termIds = params.compiledQueryIds.data;;
CqDataLong termIds = resultRankingContext.compiledQueryIds.data;
for (var entry : debugFactors.getTermFactors()) {
String term = "[ERROR IN LOOKUP]";
@@ -255,7 +262,7 @@ public class IndexResultRankingService {
// CURSED: This is a linear search, but the number of terms is small, and it's in a debug path
for (int i = 0; i < termIds.size(); i++) {
if (termIds.get(i) == entry.termId()) {
term = params.compiledQuery.at(i);
term = resultRankingContext.compiledQuery.at(i);
break;
}
}

View File

@@ -6,14 +6,13 @@ import nu.marginalia.api.searchquery.RpcResultRankingParameters;
import nu.marginalia.api.searchquery.RpcTemporalBias;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
import nu.marginalia.index.forward.spans.DocumentSpans;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.QueryParams;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.model.ResultRankingContext;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.results.model.PhraseConstraintGroupList;
import nu.marginalia.index.results.model.QuerySearchTerms;
@@ -28,7 +27,6 @@ import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.SequenceOperations;
import javax.annotation.Nullable;
import java.lang.foreign.Arena;
import java.util.BitSet;
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.booleanAggregate;
@@ -47,24 +45,23 @@ public class IndexResultScoreCalculator {
public IndexResultScoreCalculator(StatefulIndex statefulIndex,
DomainRankingOverrides domainRankingOverrides,
ResultRankingContext rankingContext,
SearchParameters params)
ResultRankingContext rankingContext)
{
this.index = statefulIndex.get();
this.domainRankingOverrides = domainRankingOverrides;
this.rankingContext = rankingContext;
this.queryParams = params.queryParams;
this.compiledQuery = params.compiledQuery;
this.queryParams = rankingContext.queryParams;
this.compiledQuery = rankingContext.compiledQuery;
}
@Nullable
public SearchResultItem calculateScore(Arena arena,
@Nullable DebugRankingFactors debugRankingFactors,
public SearchResultItem calculateScore(@Nullable DebugRankingFactors debugRankingFactors,
long combinedId,
QuerySearchTerms searchTerms,
long[] wordFlags,
CodedSequence[] positions)
CodedSequence[] positions,
DocumentSpans spans)
{
CompiledQuery<CodedSequence> positionsQuery = compiledQuery.root.newQuery(positions);
@@ -92,8 +89,6 @@ public class IndexResultScoreCalculator {
int docSize = index.getDocumentSize(docId);
if (docSize <= 0) docSize = 5000;
DocumentSpans spans = index.getDocumentSpans(arena, docId);
if (debugRankingFactors != null) {
debugRankingFactors.addDocumentFactor("doc.docId", Long.toString(combinedId));
debugRankingFactors.addDocumentFactor("doc.combinedId", Long.toString(docId));
@@ -235,7 +230,7 @@ public class IndexResultScoreCalculator {
long result = 0;
int bit = 0;
IntIterator intersection = phraseConstraints.getFullGroup().findIntersections(positions).intIterator();
IntIterator intersection = phraseConstraints.getFullGroup().findIntersections(positions, 64).intIterator();
while (intersection.hasNext() && bit < 64) {
bit = (int) (Math.sqrt(intersection.nextInt()));
@@ -551,9 +546,18 @@ public class IndexResultScoreCalculator {
largeSiteFactor = 2;
}
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING_ADTECH.getFeatureBit()))
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.ADVERTISEMENT.getFeatureBit()))
penalty += 7.5 * largeSiteFactor;
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.CONSENT.getFeatureBit()))
penalty += 2.5 * largeSiteFactor;
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.POPOVER.getFeatureBit()))
penalty += 2.5 * largeSiteFactor;
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING_ADTECH.getFeatureBit()))
penalty += 5.0 * largeSiteFactor;
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.AFFILIATE_LINK.getFeatureBit()))
penalty += 5.0 * largeSiteFactor;
@@ -563,6 +567,9 @@ public class IndexResultScoreCalculator {
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING.getFeatureBit()))
penalty += 2.5 * largeSiteFactor;
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.SHORT_DOCUMENT.getFeatureBit()))
penalty += 2.5 * largeSiteFactor;
if (isForum || isWiki) {
penalty = Math.min(0, penalty - 2);
}

View File

@@ -3,7 +3,7 @@ package nu.marginalia.index.results;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.index.model.ResultRankingContext;
import nu.marginalia.model.idx.WordFlags;
import java.util.List;

View File

@@ -58,6 +58,7 @@ public class PhraseConstraintGroupList {
private final int[] offsets;
private final BitSet present;
private final BitSet termIdsMask;
private final int presentCardinality;
public final int size;
public PhraseConstraintGroup(List<String> terms, TermIdList termIdsAll) {
@@ -85,6 +86,8 @@ public class PhraseConstraintGroupList {
termIdsMask.set(idx);
}
}
presentCardinality = present.cardinality();
}
/** Returns true if the term with index termIdx in the query is in the group */
@@ -93,7 +96,7 @@ public class PhraseConstraintGroupList {
}
public boolean test(CodedSequence[] positions) {
IntIterator[] sequences = new IntIterator[present.cardinality()];
IntIterator[] sequences = new IntIterator[presentCardinality];
for (int oi = 0, si = 0; oi < offsets.length; oi++) {
if (!present.get(oi)) {
@@ -120,7 +123,7 @@ public class PhraseConstraintGroupList {
public IntList findIntersections(IntList[] positions) {
IntList[] sequences = new IntList[present.cardinality()];
IntList[] sequences = new IntList[presentCardinality];
int[] iterOffsets = new int[sequences.length];
for (int oi = 0, si = 0; oi < offsets.length; oi++) {
@@ -144,12 +147,41 @@ public class PhraseConstraintGroupList {
iterOffsets[si - 1] = -oi;
}
return SequenceOperations.findIntersections(sequences, iterOffsets);
return SequenceOperations.findIntersections(sequences, iterOffsets, Integer.MAX_VALUE);
}
/** Finds positions where the terms of this group line up in phrase order,
 * asking the intersection routine for at most roughly {@code n} results.
 *
 * @param positions position lists, indexed by the values stored in {@code offsets}
 * @param n         limit forwarded to {@code SequenceOperations.findIntersections}
 * @return the intersections, or an empty list as soon as a present term has a
 *         negative offset or no position list
 */
public IntList findIntersections(IntList[] positions, int n) {
    final IntList[] termPositions = new IntList[presentCardinality];
    final int[] shifts = new int[termPositions.length];

    int filled = 0;
    for (int groupIdx = 0; groupIdx < offsets.length; groupIdx++) {
        if (present.get(groupIdx)) {
            final int termIdx = offsets[groupIdx];
            if (termIdx < 0 || positions[termIdx] == null) {
                return IntList.of();
            }

            // Each list is shifted back by the term's index within the group
            // (note the negative sign), so that after shifting, equal values
            // across all lists mean the terms occur in the right order.
            termPositions[filled] = positions[termIdx];
            shifts[filled] = -groupIdx;
            filled++;
        }
    }

    return SequenceOperations.findIntersections(termPositions, shifts, n);
}
public int minDistance(IntList[] positions) {
List<IntList> sequences = new ArrayList<>(present.cardinality());
IntList iterOffsets = new IntArrayList(present.cardinality());
List<IntList> sequences = new ArrayList<>(presentCardinality);
IntList iterOffsets = new IntArrayList(presentCardinality);
for (int oi = 0; oi < offsets.length; oi++) {
if (!present.get(oi)) {

View File

@@ -1,6 +1,7 @@
package nu.marginalia.index.results.model.ids;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import nu.marginalia.array.page.LongQueryBuffer;
import org.roaringbitmap.longlong.Roaring64Bitmap;
import java.util.Arrays;
@@ -17,7 +18,9 @@ public final class CombinedDocIdList {
public CombinedDocIdList(long... data) {
this.data = Arrays.copyOf(data, data.length);
}
public CombinedDocIdList(LongQueryBuffer buffer) {
this.data = buffer.copyData();
}
public CombinedDocIdList(LongArrayList data) {
this.data = data.toLongArray();
}

View File

@@ -59,7 +59,7 @@ public class RankingSearchSet implements SearchSet {
}
@Override
public boolean contains(int domainId, long documentMetadata) {
public boolean contains(int domainId) {
// This is the main check
if (set.contains(domainId) || set.isEmpty()) {

View File

@@ -7,6 +7,6 @@ public interface SearchSet {
* or if the documentMetadata vibes with the set
*
*/
boolean contains(int domainId, long documentMetadata);
boolean contains(int domainId);
}

View File

@@ -2,7 +2,7 @@ package nu.marginalia.index.searchset;
public class SearchSetAny implements SearchSet {
@Override
public boolean contains(int domainId, long meta) {
public boolean contains(int domainId) {
return true;
}

View File

@@ -6,14 +6,14 @@ import gnu.trove.list.TIntList;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import nu.marginalia.db.DomainRankingSetsService;
import nu.marginalia.db.DomainTypes;
import nu.marginalia.domainranking.PageRankDomainRanker;
import nu.marginalia.domainranking.accumulator.RankingResultHashMapAccumulator;
import nu.marginalia.domainranking.accumulator.RankingResultHashSetAccumulator;
import nu.marginalia.domainranking.data.GraphSource;
import nu.marginalia.domainranking.data.LinkGraphSource;
import nu.marginalia.domainranking.data.SimilarityGraphSource;
import nu.marginalia.index.IndexFactory;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.ranking.domains.PageRankDomainRanker;
import nu.marginalia.ranking.domains.accumulator.RankingResultHashMapAccumulator;
import nu.marginalia.ranking.domains.accumulator.RankingResultHashSetAccumulator;
import nu.marginalia.ranking.domains.data.GraphSource;
import nu.marginalia.ranking.domains.data.LinkGraphSource;
import nu.marginalia.ranking.domains.data.SimilarityGraphSource;
import nu.marginalia.service.control.ServiceEventLog;
import nu.marginalia.service.module.ServiceConfiguration;
import org.slf4j.Logger;

View File

@@ -14,7 +14,7 @@ public class SmallSearchSet implements SearchSet {
}
@Override
public boolean contains(int domainId, long meta) {
public boolean contains(int domainId) {
return entries.contains(domainId);
}

View File

@@ -10,5 +10,5 @@ public class IndexSearchBudget {
}
public boolean hasTimeLeft() { return System.currentTimeMillis() < timeout; }
public long timeLeft() { return timeout - System.currentTimeMillis(); }
public long timeLeft() { return Math.max(0, timeout - System.currentTimeMillis()); }
}

View File

@@ -1,6 +1,6 @@
package nu.marginalia.ranking.domains;
package nu.marginalia.domainranking;
import nu.marginalia.ranking.domains.accumulator.RankingResultListAccumulator;
import nu.marginalia.domainranking.accumulator.RankingResultListAccumulator;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;

View File

@@ -1,12 +1,12 @@
package nu.marginalia.ranking.domains;
package nu.marginalia.domainranking;
import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
import nu.marginalia.ranking.domains.data.InvertedLinkGraphSource;
import nu.marginalia.ranking.domains.data.LinkGraphSource;
import nu.marginalia.ranking.domains.data.SimilarityGraphSource;
import nu.marginalia.domainranking.data.InvertedLinkGraphSource;
import nu.marginalia.domainranking.data.LinkGraphSource;
import nu.marginalia.domainranking.data.SimilarityGraphSource;
import nu.marginalia.test.TestMigrationLoader;
import org.jgrapht.Graph;
import org.jgrapht.graph.DefaultWeightedEdge;

View File

@@ -1,7 +1,7 @@
package nu.marginalia.ranking.domains;
package nu.marginalia.domainranking;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.ranking.domains.data.GraphSource;
import nu.marginalia.domainranking.data.GraphSource;
import org.apache.commons.lang3.StringUtils;
import org.jgrapht.Graph;
import org.jgrapht.graph.DefaultDirectedGraph;

View File

@@ -1,7 +1,7 @@
package nu.marginalia.ranking.domains;
package nu.marginalia.domainranking;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.ranking.domains.data.GraphSource;
import nu.marginalia.domainranking.data.GraphSource;
import org.apache.commons.lang3.StringUtils;
import org.jgrapht.Graph;
import org.jgrapht.graph.DefaultDirectedGraph;

View File

@@ -1,6 +1,6 @@
package nu.marginalia.ranking.domains;
package nu.marginalia.domainranking;
import nu.marginalia.ranking.domains.data.GraphSource;
import nu.marginalia.domainranking.data.GraphSource;
import org.apache.commons.lang3.StringUtils;
import org.jgrapht.Graph;
import org.jgrapht.graph.DefaultUndirectedWeightedGraph;

View File

@@ -25,10 +25,10 @@ class RankingSearchSetTest {
set.write();
RankingSearchSet set2 = new RankingSearchSet("ACADEMIA", p);
assertTrue(set2.contains(1, 0));
assertTrue(set2.contains(5, 0));
assertTrue(set2.contains(7, 0));
assertTrue(set2.contains(9, 0));
assertTrue(set2.contains(1));
assertTrue(set2.contains(5));
assertTrue(set2.contains(7));
assertTrue(set2.contains(9));
Files.delete(p);

View File

@@ -56,7 +56,7 @@ public class SequenceOperations {
* <p></p>
*/
public static IntList findIntersections(IntList... positions) {
return findIntersections(positions, new int[positions.length]);
return findIntersections(positions, new int[positions.length], Integer.MAX_VALUE);
}
/** Find any intersections between the given positions lists, and return the list of intersections.
@@ -67,53 +67,80 @@ public class SequenceOperations {
* @param positions the positions lists to compare - each list must be sorted in ascending order
* and contain unique values.
* @param offsets constant offsets to apply to each position
* @param n maximum number of intersections we're interested in. The algorithm does not guarantee
* the return value will have a smaller size than this if it is cheaper to return back e.g.
* an input list.
* */
public static IntList findIntersections(IntList[] positions, int[] offsets) {
public static IntList findIntersections(IntList[] positions, int[] offsets, int n) {
if (positions.length < 1)
// Trivial cases
if (positions.length < 1) { // n = 0
return IntList.of();
}
// else if (positions.length == 1) { // n = 1
// if (offsets[0] == 0) { // with zero offset, we'll just return the input back
// return positions[0];
// }
//
// // Calculate an offset input array
// IntList ret = new IntArrayList(positions[0].size());
// for (int i = 0; i < positions[0].size() && i < n; i++) {
// ret.add(positions[0].getInt(i) + offsets[0]);
// }
// return ret;
// }
int[] indexes = new int[positions.length];
// Initialize values and find the maximum value
int[] values = new int[positions.length];
int minLength = Integer.MAX_VALUE;
int largestValue = Integer.MAX_VALUE;
for (int i = 0; i < positions.length; i++) {
minLength = Math.min(minLength, positions[i].size());
if (indexes[i] < positions[i].size())
values[i] = positions[i].getInt(indexes[i]++) + offsets[i];
else
return IntList.of();
largestValue = Math.min(largestValue, positions[i].getInt(positions[i].size() - 1) + offsets[i]);
}
// Intersect the sequences by advancing all values smaller than the maximum seen so far
// until they are equal to the maximum value, or until the end of the sequence is reached
int max = Integer.MIN_VALUE;
int successes = 0;
int currentMax = Integer.MIN_VALUE;
IntList ret = new IntArrayList();
int listMatches = 0;
int foundIntersections = 0;
IntList ret = new IntArrayList(Math.min(n, Math.max(1, minLength)));
outer:
for (int i = 0;; i = (i + 1) % positions.length)
for (int i = 0; currentMax <= largestValue; i = (i + 1) % positions.length)
{
if (successes == positions.length) {
ret.add(max);
successes = 1;
if (listMatches == positions.length) {
ret.add(currentMax);
if (++foundIntersections > n) return ret;
listMatches = 1;
if (indexes[i] < positions[i].size()) {
values[i] = positions[i].getInt(indexes[i]++) + offsets[i];
// Update the maximum value, if necessary
max = Math.max(max, values[i]);
currentMax = Math.max(currentMax, values[i]);
} else {
break;
}
} else if (values[i] == max) {
successes++;
} else if (values[i] == currentMax) {
listMatches++;
} else {
successes = 1;
listMatches = 1;
// Discard values until we reach the maximum value seen so far,
// or until the end of the sequence is reached
while (values[i] < max) {
while (values[i] < currentMax) {
if (indexes[i] < positions[i].size()) {
values[i] = positions[i].getInt(indexes[i]++) + offsets[i];
} else {
@@ -122,14 +149,13 @@ public class SequenceOperations {
}
// Update the maximum value, if necessary
max = Math.max(max, values[i]);
currentMax = Math.max(currentMax, values[i]);
}
}
return ret;
}
/** Given each set of positions, one from each list, find the set with the smallest distance between them
* and return that distance. If any of the lists are empty, return 0.
* */
@@ -146,10 +172,14 @@ public class SequenceOperations {
public static int minDistance(IntList[] positions, int[] offsets) {
if (positions.length <= 1)
return 0;
if (positions.length == 1)
return 0;
int[] values = new int[positions.length];
int[] indexes = new int[positions.length];
int largestValue = 0;
for (int i = 0; i < positions.length; i++) {
// if any of the lists are empty, return MAX_VALUE
@@ -158,6 +188,7 @@ public class SequenceOperations {
}
values[i] = positions[i].getInt(indexes[i]++) + offsets[i];
largestValue = Math.min(largestValue, positions[i].getInt(positions[i].size() - 1) + offsets[i]);
}
int minDist = Integer.MAX_VALUE;
@@ -173,7 +204,7 @@ public class SequenceOperations {
}
}
for (;;) {
do {
// For all the other indexes except maxI, update values[] with the largest value smaller than maxVal
for (int idx = 0; idx < positions.length - 1; idx++) {
int i = (maxI + idx) % positions.length;
@@ -228,6 +259,8 @@ public class SequenceOperations {
else {
return minDist;
}
}
} while (maxVal <= largestValue);
return minDist;
}
}

View File

@@ -2,31 +2,38 @@ package nu.marginalia.bench;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.VarintCodedSequence;
import nu.marginalia.sequence.SequenceOperations;
import org.openjdk.jmh.annotations.*;
import java.nio.ByteBuffer;
import java.util.Random;
public class SequenceBenchmarks {
@State(Scope.Benchmark)
public static class SequenceState {
VarintCodedSequence vcs;
GammaCodedSequence gcs;
IntList list;
ByteBuffer workArea;
int[] arrayValues;
int[] valueBuffer;
public SequenceState()
{
valueBuffer = new int[128];
IntList a;
IntList b;
IntList c;
workArea = ByteBuffer.allocate(65536);
arrayValues = new int[] { 1, 3, 5, 16, 1024, 2048, 4096, 4098, 4100 };
list = new IntArrayList(arrayValues);
vcs = VarintCodedSequence.generate(16,21,24,28,66,71,76,83,87,98,101,106,113,115,119,122,143,148,159,164,167,177,182,211,223,242,245,250,273,275,280,289,292,300,307,322,330,338,345,371,397,402,411,420,427,430,433,437,440,448,451,481,490,513,522,555,571,573,585,597,606,613,634,638,640,644,656,660,666,683,689,692,696,709,712,718,727,731,735,738);
gcs = GammaCodedSequence.generate(workArea, 16,21,24,28,66,71,76,83,87,98,101,106,113,115,119,122,143,148,159,164,167,177,182,211,223,242,245,250,273,275,280,289,292,300,307,322,330,338,345,371,397,402,411,420,427,430,433,437,440,448,451,481,490,513,522,555,571,573,585,597,606,613,634,638,640,644,656,660,666,683,689,692,696,709,712,718,727,731,735,738);
public SequenceState() {
a = new IntArrayList();
b = new IntArrayList();
c = new IntArrayList();
var r = new Random(1000);
for (int i = 0; i < 10; i++) {
b.add(r.nextInt(0, 5000));
}
for (int i = 0; i < 100; i++) {
c.add(r.nextInt(0, 5000));
}
for (int i = 0; i < 1000; i++) {
a.add(r.nextInt(0, 5000));
}
}
}
@@ -34,57 +41,17 @@ public class SequenceBenchmarks {
@Warmup(iterations = 1)
@Benchmark
@BenchmarkMode(Mode.Throughput)
public int vcsDecode(SequenceState state) {
var iter = state.vcs.iterator();
int sum = 0;
while (iter.hasNext()) {
sum += iter.nextInt();
public IntList intersect(SequenceState state) {
return SequenceOperations.findIntersections(state.a, state.b, state.c);
}
return sum;
}
//
// @Fork(value = 5, warmups = 5)
// @Warmup(iterations = 5)
// @Benchmark
// @BenchmarkMode(Mode.Throughput)
// public int listDecode2(SequenceState state) {
// var list = state.arrayValues;
// int sum = 0;
// for (int i = 0; i < list.length; i++) {
// sum += list[i];
// }
// return sum;
// }
@Fork(value = 1, warmups = 1)
@Warmup(iterations = 1)
@Benchmark
@BenchmarkMode(Mode.Throughput)
public int gcsDecode(SequenceState state) {
var iter = state.gcs.iterator();
int sum = 0;
while (iter.hasNext()) {
sum += iter.nextInt();
public IntList intersect1(SequenceState state) {
return SequenceOperations.findIntersections(state.a);
}
return sum;
}
// @Fork(value = 1, warmups = 1)
// @Warmup(iterations = 1)
// @Benchmark
// @BenchmarkMode(Mode.Throughput)
// public VarintCodedSequence vcsEncode(SequenceState state) {
// return VarintCodedSequence.generate(1, 3, 5, 16, 1024, 2048, 4096, 4098, 4100);
// }
// @Fork(value = 1, warmups = 1)
// @Warmup(iterations = 1)
// @Benchmark
// @BenchmarkMode(Mode.Throughput)
// public GammaCodedSequence gcsEncode(SequenceState state) {
// return GammaCodedSequence.generate(state.workArea, 1, 3, 5, 16, 1024, 2048, 4096, 4098, 4100);
// }
}

View File

@@ -47,11 +47,14 @@ dependencies {
implementation project(':code:processes:converting-process:ft-anchor-keywords')
implementation project(':code:processes:converting-process:ft-keyword-extraction')
implementation project(':code:processes:converting-process:ft-dom-classifier')
implementation project(':code:processes:crawling-process:ft-crawl-blocklist')
implementation project(':code:processes:crawling-process:ft-link-parser')
implementation project(':code:processes:crawling-process:ft-content-type')
implementation project(':code:functions:live-capture:api')
testImplementation project(':code:libraries:term-frequency-dict')
testImplementation project(':code:processes:crawling-process:model')
@@ -87,6 +90,7 @@ dependencies {
implementation libs.commons.lang3
implementation libs.commons.compress
implementation libs.sqlite
implementation libs.bundles.grpc
implementation libs.bundles.httpcomponents

View File

@@ -0,0 +1,41 @@
// Gradle build script for a Java module.
plugins {
id 'java'
id "de.undercouch.download" version "5.1.0"
id 'jvm-test-suite'
}
// The Java toolchain version is not hard-coded here; it is read from the
// jvmVersion extra property defined on the root project.
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
// Applies the srcsets.gradle script located in the root project directory.
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
// Sibling projects from this repository
implementation project(':code:common:config')
implementation project(':code:common:service')
implementation project(':code:common:model')
implementation project(':code:common:db')
implementation project(':code:functions:live-capture:api')
// External libraries, resolved through the 'libs' catalog
implementation libs.bundles.slf4j
implementation libs.guava
implementation libs.zstd
// Guice is added with its own com.google.guava dependencies excluded;
// Guava is declared directly via libs.guava above.
implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava'
}
implementation libs.trove
implementation libs.gson
implementation libs.bundles.protobuf
implementation libs.bundles.mariadb
implementation libs.duckdb
implementation libs.notnull
implementation libs.jsoup
// Test-only dependencies
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}

View File

@@ -0,0 +1,99 @@
package nu.marginalia.ddtrackergradar;
import com.google.gson.Gson;
import nu.marginalia.WmsaHome;
import nu.marginalia.ddtrackergradar.model.DDGTDomain;
import nu.marginalia.model.gson.GsonFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
/** Holds tracker metadata from DuckDuckGo's Tracker Radar.
 * <p>
 * The data set itself is licensed CC-BY-NC-SA 4.0.
 * <p>
 * Instances are populated once, in the constructor, from JSON files on disk.
 * If the data directory does not exist the instance is simply empty.
 */
public class DDGTrackerData {
    /** Models keyed by the domain name declared in the model file. */
    private final Map<String, DDGTDomain> topDomains = new HashMap<>();

    /** Models keyed by the declared domain and by every "subdomain.domain" combination. */
    private final Map<String, DDGTDomain> domains = new HashMap<>();

    private final Gson gson = GsonFactory.get();

    private static final Logger logger = LoggerFactory.getLogger(DDGTrackerData.class);

    /** Loads every model found under {@code <data path>/tracker-radar/domains/<dir>/*.json}.
     * Failures while listing directories are logged, not thrown.
     */
    public DDGTrackerData() {
        // Data is assumed to be in ${WMSA_HOME}/data/tracker-radar
        // ... do a shallow clone of the repo
        // https://github.com/duckduckgo/tracker-radar/

        Path dataDir = WmsaHome.getDataPath().resolve("tracker-radar");
        if (!Files.exists(dataDir)) {
            logger.info("tracker-radar data absent from expected path {}, loading nothing", dataDir);
            return;
        }

        try (var sources = Files.list(dataDir.resolve("domains"))) {
            sources.filter(Files::isDirectory).forEach(this::loadDomainDir);
        }
        catch (IOException e) {
            logger.error("Failed to read tracker radar data dir", e);
        }
    }

    /** Tries to fetch available information about tracking coming from the specified domain.
     *
     * @param domain the domain name to look up
     * @return the matching model, preferring an entry keyed by its declared domain
     *         over one registered through a subdomain; empty if nothing is known
     */
    public Optional<DDGTDomain> getDomainInfo(String domain) {
        return Optional
                .ofNullable(topDomains.get(domain))
                .or(() -> Optional.ofNullable(domains.get(domain)));
    }

    /** Loads every regular {@code *.json} file directly inside {@code dir}.
     * Public for testing.
     *
     * @param dir directory containing domain model files
     */
    public void loadDomainDir(Path dir) {
        try (var dirContent = Files.list(dir)) {
            dirContent
                    .filter(Files::isRegularFile)
                    .filter(path -> path.toString().endsWith(".json"))
                    .forEach(this::loadDomainModel);
        }
        catch (IOException e) {
            logger.error("Error while loading DDGT tracker data", e);
        }
    }

    /** Parses a single model file and registers it. Models without a domain,
     * or with neither owner information nor categories, are ignored.
     * Any failure is logged, not thrown.
     */
    void loadDomainModel(Path jsonFile) {
        try {
            var model = gson.fromJson(Files.readString(jsonFile), DDGTDomain.class);

            if (model.domain() == null)
                return;

            if ((model.owner() == null || model.owner().isEmpty())
                    && (model.categories() == null || model.categories().isEmpty()))
                return;

            topDomains.put(model.domain(), model);
            domains.put(model.domain(), model);

            if (model.subdomains() != null) {
                for (String subdomain : model.subdomains()) {
                    domains.put(subdomain + "." + model.domain(), model);
                }
            }
        }
        catch (Exception e) {
            logger.error("Error while loading DDGT tracker data", e);
        }
    }

    /** Exports all classifications (categories) present in the data set.
     *
     * @return a new, mutable set with every category of every loaded model
     */
    public Set<String> getAllClassifications() {
        Set<String> ret = new HashSet<>();
        for (var domain : domains.values()) {
            // A model is accepted on owner information alone, in which case
            // its category list may be null; addAll(null) would throw.
            var categories = domain.categories();
            if (categories != null) {
                ret.addAll(categories);
            }
        }
        return ret;
    }
}

View File

@@ -0,0 +1,12 @@
package nu.marginalia.ddtrackergradar.model;
import java.util.List;
/** One domain entry of the tracker data set, deserialized from JSON.
 * <p>
 * NOTE(review): the loader null-checks every component before use, so all of
 * them should be treated as possibly null.
 *
 * @param domain     the domain name the entry describes
 * @param owner      information about who operates the domain
 * @param categories classification labels attached to the domain
 * @param subdomains subdomain labels, each meant to be prefixed to {@code domain}
 */
public record DDGTDomain(String domain, DDGTOwner owner, List<String> categories, List<String> subdomains) {}

View File

@@ -0,0 +1,10 @@
package nu.marginalia.ddtrackergradar.model;
/** Owner information attached to a tracker domain entry.
 *
 * @param name          owner name
 * @param displayName   owner name intended for display
 * @param privacyPolicy privacy policy reference
 * @param url           owner URL
 */
public record DDGTOwner(String name, String displayName, String privacyPolicy, String url) {

    /** @return true only when every component is null; empty strings count as present */
    public boolean isEmpty() {
        if (name != null || displayName != null) {
            return false;
        }
        return privacyPolicy == null && url == null;
    }
}

View File

@@ -0,0 +1,25 @@
package nu.marginalia.domclassifier;
import nu.marginalia.model.crawl.HtmlFeature;
import javax.annotation.Nullable;
/**
 * Classifications that can be assigned to a DOM sample, each optionally
 * paired with the {@link HtmlFeature} it corresponds to.
 */
public enum DomSampleClassification {
    ADS(HtmlFeature.ADVERTISEMENT),
    TRACKING(HtmlFeature.TRACKING_ADTECH),
    CONSENT(HtmlFeature.CONSENT),
    POPOVER(HtmlFeature.POPOVER),
    THIRD_PARTY_REQUESTS(HtmlFeature.THIRD_PARTY_REQUESTS),
    UNCLASSIFIED(HtmlFeature.MISSING_DOM_SAMPLE),
    /** The only classification without an associated feature. */
    IGNORE(null);

    /** The associated feature, or null when the classification has none. */
    @Nullable
    public final HtmlFeature htmlFeature;

    DomSampleClassification(@Nullable HtmlFeature htmlFeature) {
        this.htmlFeature = htmlFeature;
    }
}

View File

@@ -0,0 +1,177 @@
package nu.marginalia.domclassifier;
import com.github.luben.zstd.ZstdInputStream;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.api.domsample.RpcDomainSample;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.Jsoup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.function.Predicate;
import java.util.regex.Pattern;
@Singleton
public class DomSampleClassifier {
private static final Logger logger = LoggerFactory.getLogger(DomSampleClassifier.class);
private final List<Map.Entry<Predicate<String>, DomSampleClassification>> regexClassification = new ArrayList<>();
private final Map<String, DomSampleClassification> urlClassification = new HashMap<>();
private final Map<String, DomSampleClassification> topDomainClassification = new HashMap<>();
private final Map<String, DomSampleClassification> fullDomainClassification = new HashMap<>();
/** Creates a classifier from the {@code request-classifier.xml} resource, looked up
 * through {@link ClassLoader#getSystemResourceAsStream(String)}.
 * <p>
 * That lookup yields null when the resource is absent, which the delegated
 * constructor rejects with a {@link NullPointerException}.
 * <p>
 * NOTE(review): the resource stream opened here is not closed by this
 * constructor — confirm whether the parser closes it.
 */
@Inject
public DomSampleClassifier() throws ParserConfigurationException, IOException, SAXException {
this(ClassLoader.getSystemResourceAsStream("request-classifier.xml"));
}
/** Builds the classification tables from an XML specification.
 * <p>
 * Every {@code <classifier>} element contributes one rule. Its {@code rule}
 * attribute names a {@link DomSampleClassification} constant (case-insensitively),
 * its trimmed text content is the value to match, and its {@code target}
 * attribute selects what the value is matched against:
 * {@code url} (exact URL), {@code url-regex} (regular expression),
 * {@code top} (top domain) or {@code domain} (full domain).
 *
 * @param specificationXmlData the XML specification; must not be null
 * @throws NullPointerException     if {@code specificationXmlData} is null
 * @throws IllegalArgumentException if a {@code target} is unknown, or a {@code rule}
 *                                  does not name a classification constant
 * @throws ParserConfigurationException if no XML parser can be created
 * @throws IOException              if the specification cannot be read
 * @throws SAXException             if the specification is not well-formed XML
 */
public DomSampleClassifier(InputStream specificationXmlData) throws ParserConfigurationException, IOException, SAXException {
    Objects.requireNonNull(specificationXmlData, "specificationXmlData is null");

    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    DocumentBuilder builder = factory.newDocumentBuilder();
    Document doc = builder.parse(specificationXmlData);

    NodeList classifierNodes = doc.getElementsByTagName("classifier");
    for (int i = 0; i < classifierNodes.getLength(); i++) {
        Element classifier = (Element) classifierNodes.item(i);

        String target = classifier.getAttribute("target");
        String rule = classifier.getAttribute("rule");
        String content = classifier.getTextContent().trim();

        // Convert rule to Classification enum.  Locale.ROOT keeps the case
        // mapping independent of the JVM default locale (e.g. under a Turkish
        // locale "ignore" would otherwise become "İGNORE" and fail to resolve).
        DomSampleClassification classification =
                DomSampleClassification.valueOf(rule.toUpperCase(Locale.ROOT));

        // Add to appropriate map based on target
        switch (target) {
            case "url":
                urlClassification.put(content, classification);
                break;
            case "url-regex":
                regexClassification.add(Map.entry(Pattern.compile(content).asPredicate(), classification));
                break;
            case "top":
                topDomainClassification.put(content, classification);
                break;
            case "domain":
                fullDomainClassification.put(content, classification);
                break;
            default:
                throw new IllegalArgumentException("Unknown target type: " + target);
        }
    }
}
/** Classifies a DOM sample based on its HTML and on the requests it made.
 * <p>
 * {@code POPOVER} is added when the sample reports an accepted popover, or
 * when the text of elements marked {@code data-position=fixed} mentions one of
 * a few tell-tale words. {@code THIRD_PARTY_REQUESTS} is added when any
 * outgoing request goes to a different top domain than the sample's own.
 * Each outgoing request is also run through {@link #classifyRequest}; results
 * other than {@code IGNORE} and {@code UNCLASSIFIED} are added as well.
 *
 * @param sample the sample to classify
 * @return a new, mutable set of the classifications that apply; possibly empty
 */
public Set<DomSampleClassification> classifySample(RpcDomainSample sample) {
    Set<DomSampleClassification> classifications = new HashSet<>();

    EdgeDomain sampleDomain = new EdgeDomain(sample.getDomainName());

    // Look at DOM
    if (sample.getAcceptedPopover()) {
        // The flag alone decides the outcome, so there is no need to
        // decompress and parse the HTML, and a broken HTML sample can no
        // longer cause the classification to be lost.
        classifications.add(DomSampleClassification.POPOVER);
    }
    else {
        try (var compressedStream = new ZstdInputStream(sample.getHtmlSampleZstd().newInput())) {
            String html = new String(compressedStream.readAllBytes(), StandardCharsets.UTF_8);
            var parsedDoc = Jsoup.parse(html);
            var fixedElements = parsedDoc.select("*[data-position=fixed]");

            if (!fixedElements.isEmpty()) {
                // Locale.ROOT: with e.g. a Turkish default locale, "COOKIE"
                // would lower-case to "cookıe" and the checks below would miss it
                String fixedText = fixedElements.text().toLowerCase(Locale.ROOT);

                if (fixedText.contains("cookie") ||
                    fixedText.contains("subscribe") ||
                    fixedText.contains("consent") ||
                    fixedText.contains("newsletter") ||
                    fixedText.contains("gdpr"))
                {
                    classifications.add(DomSampleClassification.POPOVER);
                }
            }
        }
        catch (Exception ex) {
            // Best effort: a sample with unreadable HTML is still classified by its requests
            logger.warn("Error when parsing DOM HTML sample", ex);
        }
    }

    // Classify outgoing requests
    for (var req : sample.getOutgoingRequestsList()) {
        EdgeUrl url;

        try {
            url = new EdgeUrl(req.getUrl());
        }
        catch (URISyntaxException ex) {
            // Requests with unparseable URLs are skipped
            continue;
        }

        if (!url.domain.hasSameTopDomain(sampleDomain)) {
            classifications.add(DomSampleClassification.THIRD_PARTY_REQUESTS);
        }

        var clazz = classifyRequest(url);
        if (clazz != DomSampleClassification.IGNORE && clazz != DomSampleClassification.UNCLASSIFIED) {
            classifications.add(clazz);
        }
    }

    return classifications;
}
/**
 * Determines the classification of a single outgoing request URL.
 * <p>
 * Rules are consulted in a fixed order: regex rules (tested against both the path with its
 * query string and the display form of the URL), then exact URL rules, then full domain rules,
 * and finally top domain rules. The first rule that yields a classification other than IGNORE
 * decides the result; rules mapping to IGNORE are passed over.
 *
 * @param edgeUrl the request URL to classify
 * @return the first non-IGNORE classification found, or UNCLASSIFIED when no such rule applies
 */
public DomSampleClassification classifyRequest(EdgeUrl edgeUrl) {
    StringBuilder pathAndQuery = new StringBuilder(edgeUrl.path);
    if (null != edgeUrl.param) {
        pathAndQuery.append("?").append(edgeUrl.param);
    }
    final String pathWithQuery = pathAndQuery.toString();
    final String displayUrl = edgeUrl.toDisplayString();

    // Regex rules take precedence over all table lookups
    for (Map.Entry<Predicate<String>, DomSampleClassification> regexRule : regexClassification) {
        Predicate<String> predicate = regexRule.getKey();
        if (!predicate.test(pathWithQuery) && !predicate.test(displayUrl)) {
            continue;
        }

        DomSampleClassification regexVerdict = regexRule.getValue();
        if (regexVerdict == DomSampleClassification.IGNORE) {
            continue;
        }
        return regexVerdict;
    }

    // Table lookups, from most to least specific; a missing or IGNORE entry defers to the next table
    DomSampleClassification verdict = urlClassification.get(edgeUrl.toDisplayString());
    if (verdict == null || verdict == DomSampleClassification.IGNORE) {
        verdict = fullDomainClassification.get(edgeUrl.domain.toString());
    }
    if (verdict == null || verdict == DomSampleClassification.IGNORE) {
        verdict = topDomainClassification.get(edgeUrl.domain.topDomain);
    }
    if (verdict == null || verdict == DomSampleClassification.IGNORE) {
        return DomSampleClassification.UNCLASSIFIED;
    }
    return verdict;
}
}

View File

@@ -0,0 +1,8 @@
Holds a classification model for rendered DOM data and exported network traffic generated by
[functions/live-capture](../../../functions/live-capture).
The model is primarily used in the [converting-process](../../converting-process), but is also run in the search UI for inspection purposes.
The traffic classification model is found in [resources/request-classifier.xml](resources/request-classifier.xml).
The code evaluating the model is in [DomSampleClassifier.java](java/nu/marginalia/domclassifier/DomSampleClassifier.java).

View File

@@ -0,0 +1,112 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE rules [
<!ELEMENT rules (classifier*)>
<!ELEMENT classifier (#PCDATA)>
<!ATTLIST classifier
target (url-regex|url|domain|top) #REQUIRED
rule (ads|tracking|consent|ignore) #REQUIRED>
]>
<!-- Contains rules for mapping outgoing requests during DOM Sampling to website classification -->
<rules>
<!-- Regex rules -->
<classifier target="url-regex" rule="tracking">/ads/ga-audiences</classifier>
<classifier target="url-regex" rule="tracking">/google_top_exp\.js$</classifier>
<classifier target="url-regex" rule="tracking">/ccm/collect$</classifier>
<classifier target="url-regex" rule="tracking">^/[0-9]+\.js$</classifier>
<classifier target="url-regex" rule="tracking">^/[a-z0-9]\.gif$</classifier>
<classifier target="url-regex" rule="tracking">^/pixel\.gif$</classifier>
<classifier target="url-regex" rule="ads">/pagead/</classifier>
<classifier target="url-regex" rule="ads">/google-ads/</classifier>
<!-- URL classifications TRACKING -->
<classifier target="url" rule="tracking">https://googleads.g.doubleclick.net/pagead/id</classifier>
<classifier target="url" rule="tracking">https://securepubads.g.doubleclick.net/tag/js/gpt.js</classifier>
<classifier target="url" rule="tracking">https://pagead2.googlesyndication.com/ccm/collect</classifier>
<classifier target="url" rule="tracking">https://z-na.amazon-adsystem.com/widgets/onejs</classifier>
<!-- Full domain classifications ADS -->
<classifier target="domain" rule="ads">securepubads.g.doubleclick.net</classifier>
<classifier target="domain" rule="ads">googleads.g.doubleclick.net</classifier>
<!-- Full domain classifications TRACKING -->
<classifier target="domain" rule="tracking">stats.g.doubleclick.net</classifier>
<classifier target="domain" rule="tracking">insight.adsrvr.org</classifier>
<classifier target="domain" rule="tracking">pixel.wp.com</classifier>
<classifier target="domain" rule="tracking">connect.facebook.net</classifier>
<classifier target="domain" rule="tracking">stats.wp.com</classifier>
<classifier target="domain" rule="tracking">track.hubspot.com</classifier>
<classifier target="domain" rule="tracking">analytics.tiktok.com</classifier>
<classifier target="domain" rule="tracking">analytics-ipv6.tiktokw.us</classifier>
<classifier target="domain" rule="tracking">tr6.snapchat.com</classifier>
<classifier target="domain" rule="tracking">tr.snapchat.com</classifier>
<classifier target="domain" rule="tracking">geo-location.prebid.cloud</classifier>
<classifier target="domain" rule="tracking">px.ads.linkedin.com</classifier>
<classifier target="domain" rule="tracking">region1.analytics.google.com</classifier>
<classifier target="domain" rule="tracking">api.hubapi.com</classifier>
<classifier target="domain" rule="tracking">bat.bing.com</classifier>
<classifier target="domain" rule="tracking">bat.bing.net</classifier>
<classifier target="domain" rule="tracking">c.bing.com</classifier>
<classifier target="domain" rule="tracking">c.bing.net</classifier>
<classifier target="domain" rule="tracking">analytics.twitter.com</classifier>
<classifier target="domain" rule="tracking">play.google.com</classifier>
<classifier target="domain" rule="tracking">www.youtube.com</classifier>
<!-- Full domain classifications CONSENT -->
<classifier target="domain" rule="consent">cdnconsents.websitepolicies.com</classifier>
<!-- Top-level domain classifications - ADS -->
<classifier target="top" rule="ads">googlesyndication.com</classifier>
<classifier target="top" rule="ads">amazon-adsystem.com</classifier>
<classifier target="top" rule="ads">smartadserver.com</classifier>
<classifier target="top" rule="ads">googleadservices.com</classifier>
<classifier target="top" rule="ads">prebid.cloud</classifier>
<classifier target="top" rule="ads">pubmine.com</classifier>
<classifier target="top" rule="ads">adtrafficquality.google</classifier>
<classifier target="top" rule="ads">syndicatedsearch.goog</classifier>
<classifier target="top" rule="ads">adsrvr.org</classifier>
<classifier target="top" rule="ads">adnxs.net</classifier>
<classifier target="top" rule="ads">aditude.io</classifier>
<classifier target="top" rule="ads">buysellads.net</classifier>
<!-- Top-level domain classifications - TRACKING -->
<classifier target="top" rule="tracking">plausible.io</classifier>
<classifier target="top" rule="tracking">amplitude.com</classifier>
<classifier target="top" rule="tracking">hsadspixel.net</classifier>
<classifier target="top" rule="tracking">demdex.net</classifier>
<classifier target="top" rule="tracking">omtrdc.net</classifier>
<classifier target="top" rule="tracking">ggpht.com</classifier>
<classifier target="top" rule="tracking">doubleclick.net</classifier>
<classifier target="top" rule="tracking">google.com</classifier>
<classifier target="top" rule="tracking">google.se</classifier>
<classifier target="top" rule="tracking">google-analytics.com</classifier>
<classifier target="top" rule="tracking">googletagmanager.com</classifier>
<classifier target="top" rule="tracking">cloudflareinsights.com</classifier>
<classifier target="top" rule="tracking">branch.io</classifier>
<classifier target="top" rule="tracking">clarity.ms</classifier>
<classifier target="top" rule="tracking">hotjar.com</classifier>
<classifier target="top" rule="tracking">hotjar.io</classifier>
<classifier target="top" rule="tracking">nr-data.net</classifier>
<classifier target="top" rule="tracking">newrelic.com</classifier>
<classifier target="top" rule="tracking">siteimproveanalytics.com</classifier>
<classifier target="top" rule="tracking">siteimproveanalytics.io</classifier>
<classifier target="top" rule="tracking">hs-analytics.net</classifier>
<classifier target="top" rule="tracking">sentry.io</classifier>
<classifier target="top" rule="tracking">hs-scripts.com</classifier>
<classifier target="top" rule="tracking">addtoany.com</classifier>
<classifier target="top" rule="tracking">facebook.com</classifier>
<classifier target="top" rule="tracking">scorecardresearch.com</classifier>
<!-- Top-level domain classifications - CONSENT -->
<classifier target="top" rule="consent">trustarc.com</classifier>
<classifier target="top" rule="consent">truste.com</classifier>
<classifier target="top" rule="consent">onetrust.com</classifier>
<classifier target="top" rule="consent">cookielaw.org</classifier>
<classifier target="top" rule="consent">hs-banner.com</classifier>
<!-- This is a full host name rather than a top domain, so it must be a "domain" rule:
     a "top" lookup for this host is keyed on google.com and would never reach this entry -->
<classifier target="domain" rule="consent">fundingchoicesmessages.google.com</classifier>
</rules>

View File

@@ -0,0 +1,16 @@
package nu.marginalia.ddtrackergradar;
import org.junit.jupiter.api.Test;
import java.nio.file.Path;
class DDGTrackerDataTest {
@Test
public void testLoad() {
DDGTrackerData data = new DDGTrackerData();
data.loadDomainDir(Path.of("/home/vlofgren/Work/tracker-radar/domains/US/"));
data.getDomainInfo("hotjar.com").ifPresent(System.out::println);
data.getAllClassifications().forEach(System.out::println);
}
}

View File

@@ -11,7 +11,6 @@ import nu.marginalia.sequence.VarintCodedSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.ByteBuffer;
import java.util.*;
public class DocumentKeywordsBuilder {
@@ -36,7 +35,7 @@ public class DocumentKeywordsBuilder {
this(1600);
}
public DocumentKeywords build(ByteBuffer workArea) {
public DocumentKeywords build() {
final List<String> wordArray = new ArrayList<>(wordToMeta.size());
final TByteArrayList meta = new TByteArrayList(wordToMeta.size());
final List<VarintCodedSequence> positions = new ArrayList<>(wordToMeta.size());
@@ -113,6 +112,13 @@ public class DocumentKeywordsBuilder {
newWords.forEach(word -> wordToMeta.putIfAbsent(word, meta));
}
/**
 * Records a word with the Synthetic word flag as its metadata, unless the word
 * already has metadata recorded.
 *
 * @param newWord the term to add
 */
public void addSyntheticTerm(String newWord) {
    final byte syntheticFlag = WordFlags.Synthetic.asBit();

    wordToMeta.putIfAbsent(newWord, syntheticFlag);
}
public List<String> getWordsWithAnyFlag(long flags) {
List<String> ret = new ArrayList<>();

View File

@@ -13,7 +13,6 @@ import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
@@ -56,7 +55,7 @@ class DocumentKeywordExtractorTest {
new LinkTexts(), new EdgeUrl("https://encyclopedia.marginalia.nu/article/Don't_Tell_Me_(Madonna_song)")
);
var keywordsBuilt = keywords.build(ByteBuffer.allocate(1024));
var keywordsBuilt = keywords.build();
Map<String, Byte> flags = new HashMap<>();
Map<String, CodedSequence> positions = new HashMap<>();

Some files were not shown because too many files have changed in this diff Show More