mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

Compare commits


151 Commits

Author SHA1 Message Date
Viktor Lofgren
bc49406881 (build) Compatibility hack for debian server 2025-08-11 23:26:53 +02:00
Viktor Lofgren
90325be447 (minor) Fix comments 2025-08-11 23:19:53 +02:00
Viktor Lofgren
dc89587af3 (index) Improve disk locality of the positions data 2025-08-11 21:17:12 +02:00
Viktor Lofgren
7b552afd6b (index) Improve disk locality of the positions data 2025-08-11 20:59:11 +02:00
Viktor Lofgren
73557edc67 (index) Improve disk locality of the positions data 2025-08-11 20:57:32 +02:00
Viktor Lofgren
83919e448a (index) Use O_DIRECT buffered reads for spans 2025-08-11 18:04:25 +02:00
Viktor Lofgren
6f5b75b84d (cleanup) Remove accidentally committed print stmt 2025-08-11 18:04:25 +02:00
Viktor Lofgren
db315e2813 (index) Use O_DIRECT position reads 2025-08-11 18:04:25 +02:00
Viktor Lofgren
e9977e08b7 (index) Block-align positions data
This will make reads more efficient, and possibly pave the way for O_DIRECT reads of this data
2025-08-11 14:36:45 +02:00
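A minimal sketch of what block alignment buys, using a hypothetical helper rather than the repository's code: O_DIRECT requires block-aligned offsets and lengths, so an unaligned record read is widened to block boundaries and the wanted range sliced out afterwards.

// Hypothetical helper, not the repository's code: O_DIRECT requires offsets
// and lengths to be multiples of the block size, so an unaligned read of
// [offset, offset + size) is widened to block boundaries and sliced later.
static final int BLOCK_SIZE = 4096;

static long alignDown(long v) { return v & -BLOCK_SIZE; }
static long alignUp(long v)   { return (v + BLOCK_SIZE - 1) & -BLOCK_SIZE; }

/** Returns {alignedOffset, alignedLength} covering [offset, offset + size). */
static long[] directReadWindow(long offset, int size) {
    long start = alignDown(offset);
    return new long[] { start, alignUp(offset + size) - start };
}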
Viktor Lofgren
1df3757e5f (native) Clean up io_uring code and check in execution queue, currently unused but nifty 2025-08-11 13:54:05 +02:00
Viktor Lofgren
ca283f9684 (native) Clean up native helpers and break them into their own library 2025-08-10 20:55:34 +02:00
Viktor Lofgren
85360e61b2 (index) Grow span writer buffer size
Apparently outlier spans can grow quite large.
2025-08-10 17:20:38 +02:00
Viktor Lofgren
e2ccff21bc (index) Wait until ranking is finished in query execution 2025-08-09 23:40:30 +02:00
Viktor Lofgren
c5b5b0c699 (index) Permit fast termination of rejection filter execution 2025-08-09 23:36:59 +02:00
Viktor Lofgren
9a65946e22 (uring) Reduce queue size to 2048 to avoid ENOMEM on systems with default ulimits 2025-08-09 20:41:24 +02:00
Viktor Lofgren
1d2ab21e27 (index) Aggregate termdata reads into a single io_uring operation instead of one for each term 2025-08-09 17:43:18 +02:00
Viktor Lofgren
0610cc19ad (index) Fix double close errors 2025-08-09 17:05:38 +02:00
Viktor Lofgren
a676306a7f (skiplist) Fix bugs in seek operations 2025-08-09 17:00:27 +02:00
Viktor Lofgren
8d68cd14fb (skiplist) Even more aggressive forward pointers 2025-08-09 16:11:41 +02:00
Viktor Lofgren
4773c5a52b (index) Backport some changes made during performance evaluations 2025-08-09 15:19:41 +02:00
Viktor Lofgren
74bd562ae4 (index) Move I/O to separate threads to hopefully reduce contention a bit 2025-08-09 15:19:41 +02:00
Viktor Lofgren
c9751287b0 (index) Boost the buffer size used in PrioIndexEntrySource 2025-08-09 01:46:12 +02:00
Viktor Lofgren
5da24e3fc4 (index) Segregate full and priority query ranking 2025-08-09 00:39:31 +02:00
Viktor Lofgren
20a4e86eec (index) Use a confined arena in IndexResultRankingService 2025-08-08 22:08:35 +02:00
Viktor Lofgren
477a184948 (experiment) Allow early termination of include conditions in lookups 2025-08-08 19:12:54 +02:00
Viktor Lofgren
8940ce99db (perf) More statistics in perf test 2025-08-08 18:57:25 +02:00
Viktor Lofgren
0ac0fa4dca (perf) More statistics in perf test 2025-08-08 18:56:17 +02:00
Viktor Lofgren
942f15ef14 (skiplist) Use a linear-quadratic forward pointer scheme instead of an exponential 2025-08-08 16:57:15 +02:00
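The message doesn't spell the scheme out, but plausibly the gaps between forward pointers grow linearly (1, 2, 3, ... blocks), so cumulative reach grows quadratically, instead of doubling at each level. A hypothetical illustration:

// Hypothetical illustration of the two schemes, not the repository's layout:
// exponential:      pointer k skips 2^k blocks -> 1, 2, 4, 8, ...
// linear-quadratic: pointer k skips k+1 blocks -> 1, 2, 3, 4, ...
//                   (reach after n pointers is n(n+1)/2, quadratic in n)
static long exponentialSkip(int k)     { return 1L << k; }
static long linearQuadraticSkip(int k) { return k + 1L; }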
Viktor Lofgren
f668f33d5b (index) Tweaks and optimizations 2025-08-08 15:32:23 +02:00
Viktor Lofgren
6789975cd2 (index) Tweaks and optimizations 2025-08-08 15:30:48 +02:00
Viktor Lofgren
c3ba608776 (index) Split up evaluation tasks 2025-08-08 15:20:33 +02:00
Viktor Lofgren
733d2687fe (skiplist) Roll back the design change that segregated the values associated with documents into a separate file 2025-08-08 14:45:11 +02:00
Viktor Lofgren
f6daac8ed0 (index) MADVISE_RANDOM the index btrees 2025-08-07 21:14:28 +02:00
Viktor Lofgren
c2eeee4a06 (uring) Disable result set combination 2025-08-07 21:13:30 +02:00
Viktor Lofgren
3b0c701df4 (uring) Update uring timeout threshold 2025-08-07 20:13:25 +02:00
Viktor Lofgren
c6fb2db43b (index) Use a more SLA-aware execution scheduler 2025-08-07 20:13:15 +02:00
Viktor Lofgren
9bc8fe05ae (skiplist) Clean up search logic 2025-08-07 19:35:25 +02:00
Viktor Lofgren
440ffcf6f8 (skiplist) Fix bug in intersection-like algorithms 2025-08-07 02:18:14 +02:00
Viktor Lofgren
b07709cc72 (native) Disable expensive debug checks from uring code 2025-08-06 21:05:28 +02:00
Viktor Lofgren
9a6acdcbe0 (skiplist) Tag slow fuzz test as "slow" 2025-08-06 20:59:52 +02:00
Viktor Lofgren
23b9b0bf1b (index) Parametrize skip list block size and buffer pool sizes 2025-08-06 20:59:33 +02:00
Viktor Lofgren
749c8ed954 (pool) Correct buffer pool alignment 2025-08-06 20:56:34 +02:00
Viktor Lofgren
9f4b6939ca (skiplist) Fix condition for truncated block writing 2025-08-06 16:25:53 +02:00
Viktor Lofgren
1d08e44e8d (uring) Fadvise random access for uring buffered reads 2025-08-06 15:54:24 +02:00
Viktor Lofgren
fc2e156e78 (skiplist) Ensure docs file is a multiple of BLOCK_SIZE bytes 2025-08-06 15:13:32 +02:00
Viktor Lofgren
5e68a89e9f (index) Improve error handling 2025-08-06 15:05:16 +02:00
Viktor Lofgren
d380661307 (index) Improve error handling 2025-08-06 14:31:06 +02:00
Viktor Lofgren
cccdf5c329 (pool) Check interrupt status in PoolLru's reclamation thread 2025-08-06 13:26:00 +02:00
Viktor Lofgren
f085b4ea12 (skiplist) Fix tests 2025-08-06 13:24:14 +02:00
Viktor Lofgren
e208f7d3ba (skiplist) Code clean up and added validation 2025-08-06 12:55:04 +02:00
Viktor Lofgren
b577085cb2 (pool) Use one contiguous memory allocation to encourage a HugePage allocation and reduce TLB thrashing 2025-08-06 12:49:46 +02:00
Viktor Lofgren
b9240476f6 (pool) Use one contiguous memory allocation to encourage a HugePage allocation and reduce TLB thrashing 2025-08-06 12:48:14 +02:00
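A sketch of the contiguous-allocation idea with the FFM API; the pool class and sizes here are illustrative, not the repository's PoolLru:

import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;

// Sketch: one contiguous, 2 MB-aligned allocation that the pool slices up,
// instead of many small allocations. A single large region gives the kernel
// the chance to back it with huge pages, cutting TLB pressure.
class ContiguousPool {
    private static final long HUGE_PAGE = 2 * 1024 * 1024;
    private final MemorySegment backing;
    private final int bufferSize;

    ContiguousPool(int bufferCount, int bufferSize) {
        this.bufferSize = bufferSize;
        this.backing = Arena.ofShared().allocate((long) bufferCount * bufferSize, HUGE_PAGE);
    }

    MemorySegment buffer(int i) {
        return backing.asSlice((long) i * bufferSize, bufferSize);
    }
}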
Viktor Lofgren
8f50f86d0b (index) Fix error handling 2025-08-05 22:19:23 +02:00
Viktor Lofgren
e3b7ead7a9 (skiplist) Fix aggressive forward pointering 2025-08-05 20:47:38 +02:00
Viktor Lofgren
9a845ba604 (skiplist) EXPERIMENTAL - Store data in a separate file from document ids 2025-08-05 19:10:58 +02:00
Viktor Lofgren
b9381f1603 (skiplist) EXPERIMENTAL - Store data in a separate file from document ids 2025-08-05 17:35:13 +02:00
Viktor Lofgren
6a60127267 (skiplist) EXPERIMENTAL - Store data in a separate file from document ids 2025-08-05 16:54:39 +02:00
Viktor Lofgren
e8ffcfbb19 (skiplist) Correct binary search implementation, fix intersection logic 2025-08-04 14:49:09 +02:00
Viktor Lofgren
caf0850f81 (index) Clean up code 2025-08-04 00:12:35 +02:00
Viktor Lofgren
62e3bb675e (btree) Remove O_DIRECT btree implementation 2025-08-03 23:43:31 +02:00
Viktor Lofgren
4dc3e7da7a (perf) Remove warmup from perf test, it's not doing much 2025-08-03 21:19:54 +02:00
Viktor Lofgren
92b09883ec (index) Switch from AIO to io_uring
Turns out AIO is just bad, especially with buffered I/O; io_uring performs strictly better in this scenario.
2025-08-03 21:19:54 +02:00
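For context, a usage sketch of the UringFileReader that replaces AIO here, based on the calls visible later in this changeset (the path and offsets are placeholders):

import nu.marginalia.uring.UringFileReader;

import java.io.IOException;
import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.nio.file.Path;
import java.util.List;

class UringReadSketch {
    // Batch many scattered reads into a single io_uring submission instead of
    // issuing one syscall per read (placeholder path, sizes and offsets).
    static void readBatch(Path file) throws IOException {
        try (Arena arena = Arena.ofConfined();
             UringFileReader reader = new UringFileReader(file, false)) {
            List<MemorySegment> buffers = List.of(arena.allocate(256), arena.allocate(256));
            List<Long> offsets = List.of(0L, 1L << 20);
            reader.read(buffers, offsets); // one submission, many reads
        }
    }
}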
Viktor Lofgren
87082b4ef8 (index) Use AIO for reading spans and positions
This performs slightly worse in benchmarks, but that's likely caused by hitting the page cache.

AIO will tend to perform better when we see cache misses, which is the expected case in production on real-world data.
2025-08-03 21:19:54 +02:00
Viktor Lofgren
84d3f6087f (skiplist) Parametrize skip list block size, increase to 4K pages 2025-08-03 21:19:54 +02:00
Viktor Lofgren
f93ba371a5 (pool) Fix the LRU to not deadlock and be shit 2025-08-03 21:19:54 +02:00
Viktor Lofgren
5eec27c68d (pool) Fix for 32 bit rollover in clockHand for LRU 2025-08-03 21:19:54 +02:00
Viktor Lofgren
ab01576f91 (pool) Use one global buffer pool instead of many small ones, improved LRU with gclock reclamation, skip list optimization 2025-08-03 21:19:54 +02:00
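A minimal sketch of gclock (generalized clock) reclamation in its usual formulation, not the repository's exact PoolLru internals: each buffer carries a small reference counter that accesses bump and the sweeping hand decays; the first buffer found at zero is reclaimed.

// Minimal gclock sketch (illustrative, not the repository's PoolLru).
class GClock {
    private final int[] refCount;
    private int hand; // clock hand position

    GClock(int buffers) { this.refCount = new int[buffers]; }

    /** Called on buffer access: give the buffer another "life", capped at 3. */
    void touch(int buf) { refCount[buf] = Math.min(refCount[buf] + 1, 3); }

    /** Sweep, decaying counters, until a buffer at zero is found; reclaim it. */
    int reclaim() {
        for (;;) {
            if (refCount[hand] == 0) {
                int victim = hand;
                hand = (hand + 1) % refCount.length;
                return victim;
            }
            refCount[hand]--;
            hand = (hand + 1) % refCount.length;
        }
    }
}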
Viktor Lofgren
054e5ccf44 (pool) Testing synchronized to see if I can find the deadlock 2025-08-03 21:19:54 +02:00
Viktor Lofgren
4351ea5128 (pool) Fix buffer leak 2025-08-03 21:19:54 +02:00
Viktor Lofgren
49cfa3a5e9 (pool) Decrease LQB size 2025-08-03 21:19:54 +02:00
Viktor Lofgren
683854b23f (pool) Fix logging 2025-08-03 21:19:54 +02:00
Viktor Lofgren
e880fa8945 (pool) Simplify locking in PoolLru 2025-08-03 21:19:54 +02:00
Viktor Lofgren
2482dc572e (pool) Grow free queue size 2025-08-03 21:19:54 +02:00
Viktor Lofgren
4589f11898 (pool) More stats 2025-08-03 21:19:54 +02:00
Viktor Lofgren
e43b6e610b (pool) Adjust pool reclamation strategy 2025-08-03 21:19:53 +02:00
Viktor Lofgren
4772117a1f (skiplist) First stab at a skiplist replacement for btrees in the documents lists 2025-08-03 21:19:53 +02:00
Viktor Lofgren
3fc7ea521c (pool) Remove readahead and simplify the code 2025-08-03 21:19:53 +02:00
Viktor Lofgren
4372f5af03 (pool) More performant LRU pool + better instructions queue 2025-08-03 21:19:53 +02:00
Viktor Lofgren
4ad89b6c75 (pool) More performant LRU pool 2025-08-03 21:19:53 +02:00
Viktor Lofgren
ad0519e031 (index) Optimizations 2025-08-03 21:19:53 +02:00
Viktor Lofgren
596ece1230 (pool) Fix deadlock during pool starvation 2025-08-03 21:19:53 +02:00
Viktor Lofgren
07b6e1585b (pool) Bump pool sizes 2025-08-03 21:19:53 +02:00
Viktor Lofgren
cb5e2778eb (pool) Align the buffers with 512b 2025-08-03 21:19:53 +02:00
Viktor Lofgren
8f5ea7896c (btree) More debug information on numEntries = 0 scenario 2025-08-03 21:19:53 +02:00
Viktor Lofgren
76c398e0b1 (index) Fix lingering issues with previous optimizations 2025-08-03 21:19:53 +02:00
Viktor Lofgren
4a94f04a8d (btree) Debug logging 2025-08-03 21:19:53 +02:00
Viktor Lofgren
df72f670d4 (btree) Fix queryData 2025-08-03 21:19:53 +02:00
Viktor Lofgren
eaa22c2f5a (*) Logging 2025-08-03 21:19:53 +02:00
Viktor Lofgren
7be173aeca (pool) Only dump statistics if they say anything 2025-08-03 21:19:53 +02:00
Viktor Lofgren
36685bdca7 (btree) Fix retain implementation 2025-08-03 21:19:53 +02:00
Viktor Lofgren
ad04057609 (btree) Add short circuits when retain/rejecting on an empty tree 2025-08-03 21:19:53 +02:00
Viktor Lofgren
eb76ae22e2 (perf) Use lqb size 512 in perf test 2025-08-03 21:19:53 +02:00
Viktor Lofgren
4b858ab341 (btree) Cache retain/reject reads 2025-08-03 21:19:53 +02:00
Viktor Lofgren
c6e3c8aa3b (index) Focus pools to try to increase reuse 2025-08-03 21:19:53 +02:00
Viktor Lofgren
9128d3907c (index) Periodically dump buffer metrics 2025-08-03 21:19:53 +02:00
Viktor Lofgren
4ef16d13d4 (index) O_DIRECT based buffer pool for index reads 2025-07-30 15:04:23 +02:00
Viktor Lofgren
838a5626ec (index) Reduce query buffer size 2025-07-27 21:42:04 +02:00
Viktor Lofgren
6b426209c7 (index) Restore threshold for work stealing in query execution 2025-07-27 21:41:46 +02:00
Viktor Lofgren
452b5731d9 (index) Lower threshold for work stealing in query execution 2025-07-27 21:35:11 +02:00
Viktor Lofgren
c91cf49630 (search) Disable scribe.rip substitution
It does not appear to work well
2025-07-27 19:40:58 +02:00
Viktor Lofgren
8503030f18 (search) Fix rare exception in scribe.rip substitution 2025-07-27 19:38:52 +02:00
Viktor Lofgren
744f7d3ef7 (search) Fix rare exception in scribe.rip substitution 2025-07-27 19:34:03 +02:00
Viktor Lofgren
215e12afe9 (index) Shrink query buffer size 2025-07-27 17:33:46 +02:00
Viktor Lofgren
2716bce918 (index) Adjust timeout logic for evaluation 2025-07-27 17:28:34 +02:00
Viktor Lofgren
caf2e6fbb7 (index) Adjust timeout logic for evaluation 2025-07-27 17:27:07 +02:00
Viktor Lofgren
233f0acfb1 (index) Further reduce query buffer size 2025-07-27 17:13:08 +02:00
Viktor Lofgren
e3a4ff02e9 (index) Abandon ongoing evaluation tasks if time is up 2025-07-27 17:04:01 +02:00
Viktor Lofgren
c786283ae1 (index) Reduce query buffer size 2025-07-27 16:57:55 +02:00
Viktor Lofgren
a3f65ac0e0 (deploy) Trigger index deployment 2025-07-27 16:50:23 +02:00
Viktor
aba1a32af0 Merge pull request #217 from MarginaliaSearch/uncompressed-spans-file
Index optimizations
2025-07-27 16:49:27 +02:00
Viktor Lofgren
c9c442345b (perf) Change execution test to use processing rate instead of count 2025-07-27 16:39:51 +02:00
Viktor Lofgren
2e126ba30e (perf) Change execution test to use processing rate instead of count 2025-07-27 16:37:20 +02:00
Viktor Lofgren
2087985f49 (index) Implement work stealing in IndexQueryExecution as a better approach to backpressure 2025-07-27 16:29:57 +02:00
Viktor Lofgren
2b13ebd18b (index) Tweak evaluation backlog handling 2025-07-27 16:08:16 +02:00
Viktor Lofgren
6d92c125fe (perf) Fix perf test 2025-07-27 15:50:28 +02:00
Viktor Lofgren
f638cfa39a (index) Avoid possibility of negative timeout 2025-07-27 15:39:12 +02:00
Viktor Lofgren
89447c12af (index) Avoid possibility of negative timeout 2025-07-27 15:24:47 +02:00
Viktor Lofgren
c71fc46f04 (perf) Update perf test with execution scenario 2025-07-27 15:22:07 +02:00
Viktor Lofgren
f96874d828 (sequence) Implement a largestValue abort condition for minDistance()
This is something like 3500% faster in certain common scenarios
2025-07-27 15:05:50 +02:00
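The commit doesn't include the code, but a largestValue abort plausibly means: if one sequence lies entirely beyond the other's largest value, the minimum distance is just the gap between the closest ends, and the full two-pointer scan can be skipped. A hypothetical sketch over sorted, non-empty position arrays:

// Hypothetical sketch of a largestValue abort in a minDistance scan.
static int minDistance(int[] a, int[] b) {
    // Abort early when the sequences don't overlap at all:
    if (a[0] > b[b.length - 1]) return a[0] - b[b.length - 1]; // a entirely after b
    if (b[0] > a[a.length - 1]) return b[0] - a[a.length - 1]; // b entirely after a
    // Otherwise the usual two-pointer scan over the sorted positions:
    int best = Integer.MAX_VALUE, i = 0, j = 0;
    while (i < a.length && j < b.length) {
        best = Math.min(best, Math.abs(a[i] - b[j]));
        if (a[i] < b[j]) i++; else j++;
    }
    return best;
}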
Viktor Lofgren
583a84d5a0 (index) Clean up of the index query execution logic 2025-07-27 15:05:50 +02:00
Viktor Lofgren
f65b946448 (index) Clean up code 2025-07-27 15:05:50 +02:00
Viktor Lofgren
3682815855 (index) Optimize sequence intersection for the n=1 case 2025-07-26 19:14:32 +02:00
Viktor Lofgren
3a94357660 (index) Perf test tool (WIP!) 2025-07-26 11:49:33 +02:00
Viktor Lofgren
673b0d3de1 (index) Perf test tool (WIP!) 2025-07-26 11:49:31 +02:00
Viktor Lofgren
ea942bc664 (spans) Add signature to the footer of the spans file, including a version byte so we can detect whether to use the old or new decoding logic 2025-07-25 12:07:18 +02:00
Viktor Lofgren
7ed5083c54 (index) Don't split results into chunks 2025-07-25 11:45:07 +02:00
Viktor Lofgren
08bb2c097b (refac) Clean up the data model used in the index service 2025-07-25 10:54:07 +02:00
Viktor Lofgren
495fb325be (sequence) Correct sequence intersection bug introduced in optimizations 2025-07-25 10:48:33 +02:00
Viktor Lofgren
05c25bbaec (chore) Clean up 2025-07-24 23:43:27 +02:00
Viktor Lofgren
2a028b84f3 (chore) Clean up 2025-07-24 20:12:56 +02:00
Viktor Lofgren
a091a23623 (ranking) Remove unnecessary metadata retrievals 2025-07-24 20:08:09 +02:00
Viktor Lofgren
e8897acb45 (ranking) Remove unnecessary metadata retrievals 2025-07-24 20:05:39 +02:00
Viktor Lofgren
b89ffcf2be (index) Evaluate hash based idx mapping in ForwardIndexReader 2025-07-24 19:47:27 +02:00
Viktor Lofgren
dbcc9055b0 (index) Evaluate using MinMaxPriorityQueue as guts of ResultPriorityQueue 2025-07-24 19:31:51 +02:00
Viktor Lofgren
d9740557f4 (sequence) Optimize intersection logic with a fast abort condition 2025-07-24 19:04:10 +02:00
Viktor Lofgren
0d6cd015fd (index) Evaluate reading all spans at once 2025-07-24 18:34:11 +02:00
Viktor Lofgren
c6034efcc8 (index) Cache value of bitset cardinality for speed 2025-07-24 17:24:55 +02:00
Viktor Lofgren
76068014ad (index) More spans optimizations 2025-07-24 15:03:43 +02:00
Viktor Lofgren
1c3ed67127 (index) Byte align document spans 2025-07-24 14:06:14 +02:00
Viktor Lofgren
fc0cb6bd9a (index) Reserve a larger size for IntArrayList in SeqenceOperations.findIntersections 2025-07-24 14:03:44 +02:00
Viktor Lofgren
c2601bac78 (converter) Remove unnecessary allocation of a 16 KB byte buffer 2025-07-24 13:25:37 +02:00
Viktor Lofgren
f5641b72e9 (index) Fix broken test 2025-07-24 13:21:05 +02:00
Viktor Lofgren
36efe2e219 (index) Optimize PositionsFileReader for concurrent reads
In benchmarks this is roughly twice as fast as the previous approach.  The main caveat is that we need multiple file descriptors to avoid read instruction serialization by the kernel, which is undesirable since the reads are completely scattershot and can't be reordered by the kernel in a way that optimizes anything.
2025-07-24 13:20:54 +02:00
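A sketch of the descriptor-per-worker pattern the caveat describes, under the stated assumption that positional reads through a single descriptor get serialized by the kernel (illustrative, not the actual PositionsFileReader):

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

// Sketch: one FileChannel (file descriptor) per worker thread, so scattered
// positional reads are not serialized behind a single descriptor.
class PositionsReaderSketch {
    private final FileChannel[] channels;

    PositionsReaderSketch(Path file, int workers) throws IOException {
        channels = new FileChannel[workers];
        for (int i = 0; i < workers; i++)
            channels[i] = FileChannel.open(file, StandardOpenOption.READ);
    }

    void readAt(int worker, ByteBuffer dst, long offset) throws IOException {
        channels[worker].read(dst, offset); // pread on this worker's own fd
    }

    void close() throws IOException {
        for (FileChannel ch : channels) ch.close();
    }
}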
Viktor Lofgren
983fe3829e (spans) Evaluate uncompressed spans files
Span decompression appears to be somewhat of a performance bottleneck.  This change removes compression of the spans file.  The spans are still compressed in transit between the converter and index constructor at this stage.  The change is intentionally kept small to just evaluate the performance implications, change in file sizes, etc.
2025-07-23 18:10:41 +02:00
Viktor Lofgren
668c87aa86 (ssr) Drop Executor from SSR as it no longer exists 2025-07-23 13:55:41 +02:00
Viktor Lofgren
9d3f9adb05 Force redeploy of everything 2025-07-23 13:36:02 +02:00
Viktor
a43a1773f1 Merge pull request #216 from MarginaliaSearch/deprecate-executor
Architecture: Remove the separate executor service and roll it into the index service.
2025-07-23 13:32:42 +02:00
Viktor Lofgren
1e7a3a3c4f (docs) Update docs to reflect the change 2025-07-23 13:18:23 +02:00
Viktor Lofgren
62b696b1c3 (architecture) Remove the separate executor service and merge it into the index service
The primary motivation for this is that in production, the large number of partitioned services has led to intermittent exhaustion of available database connections, as each service has a connection pool.

The decision to have a separate executor service dates back to when the index service was very slow to start, and the executor didn't always spin off its memory-hungry tasks into separate processes, which meant the executor would sometimes OOM and crash, and it was undesirable to bring the index down with it.
2025-07-23 12:57:13 +02:00
Viktor Lofgren
f1a900f383 (search) Clean up front page mobile design a bit 2025-07-23 12:20:40 +02:00
Viktor Lofgren
700364b86d (sample) Remove debug logging
The problem sat in the desk chair all along
2025-07-21 15:08:20 +02:00
164 changed files with 6377 additions and 2030 deletions

View File

@@ -105,8 +105,6 @@ public enum HtmlFeature {
}
public int getFeatureBit() {
if (getClass().desiredAssertionStatus() && ordinal() >= 32)
throw new IllegalStateException("Attempting to extract feature bit of " + name() + ", with ordinal " + ordinal());
return (1<< ordinal());
}
}
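For context, a hypothetical caller that folds features into a 32-bit mask, which is why the assertion guards against ordinals of 32 or more:

// Hypothetical usage: pack a set of features into a 32-bit mask.
static int featureMask(Iterable<HtmlFeature> features) {
    int mask = 0;
    for (HtmlFeature feature : features) {
        mask |= feature.getFeatureBit();
    }
    return mask;
}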

View File

@@ -7,7 +7,6 @@ public enum ServiceId {
Search("search-service"),
Index("index-service"),
Query("query-service"),
Executor("executor-service"),
Control("control-service"),

View File

@@ -189,7 +189,7 @@ public class ExecutorClient {
String uriPath = "/transfer/file/" + fileStorage.id();
String uriQuery = "path=" + URLEncoder.encode(path, StandardCharsets.UTF_8);
var endpoints = registry.getEndpoints(ServiceKey.forRest(ServiceId.Executor, fileStorage.node()));
var endpoints = registry.getEndpoints(ServiceKey.forRest(ServiceId.Index, fileStorage.node()));
if (endpoints.isEmpty()) {
throw new RuntimeException("No endpoints for node " + fileStorage.node());
}

View File

@@ -1,4 +1,4 @@
package nu.marginalia.executor;
package nu.marginalia.svc;
import com.google.inject.Inject;
import nu.marginalia.storage.FileStorageService;

View File

@@ -1,5 +1,5 @@
The execution subsystem is responsible for the execution of long running tasks on each
index node. It lives in the [executor-service](../services-core/executor-service) module.
index node. It lives in the [index-service](../services-core/index-service) module.
It accomplishes this using the [message queue and actor library](../libraries/message-queue/),
which permits program state to survive crashes and reboots.

View File

@@ -1,4 +1,4 @@
package nu.marginalia.executor;
package nu.marginalia.svc;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;

View File

@@ -87,7 +87,7 @@ class FeedFetcherServiceTest extends AbstractModule {
bind(DomainCoordinator.class).to(LocalDomainCoordinator.class);
bind(HikariDataSource.class).toInstance(dataSource);
bind(ServiceRegistryIf.class).toInstance(Mockito.mock(ServiceRegistryIf.class));
bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration(ServiceId.Executor, 1, "", "", 0, UUID.randomUUID()));
bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration(ServiceId.Index, 1, "", "", 0, UUID.randomUUID()));
bind(Integer.class).annotatedWith(Names.named("wmsa-system-node")).toInstance(1);
}

View File

@@ -304,7 +304,6 @@ public class QueryProtobufCodec {
IndexProtobufCodec.convertRpcQuery(specs.getQuery()),
specs.getDomainsList(),
specs.getSearchSetIdentifier(),
specs.getHumanQuery(),
IndexProtobufCodec.convertSpecLimit(specs.getQuality()),
IndexProtobufCodec.convertSpecLimit(specs.getYear()),
IndexProtobufCodec.convertSpecLimit(specs.getSize()),

View File

@@ -18,8 +18,6 @@ public class SearchSpecification {
public String searchSetIdentifier;
public final String humanQuery;
public SpecificationLimit quality;
public SpecificationLimit year;
public SpecificationLimit size;
@@ -35,7 +33,6 @@ public class SearchSpecification {
public SearchSpecification(SearchQuery query,
List<Integer> domains,
String searchSetIdentifier,
String humanQuery,
SpecificationLimit quality,
SpecificationLimit year,
SpecificationLimit size,
@@ -47,7 +44,6 @@ public class SearchSpecification {
this.query = query;
this.domains = domains;
this.searchSetIdentifier = searchSetIdentifier;
this.humanQuery = humanQuery;
this.quality = quality;
this.year = year;
this.size = size;
@@ -73,10 +69,6 @@ public class SearchSpecification {
return this.searchSetIdentifier;
}
public String getHumanQuery() {
return this.humanQuery;
}
public SpecificationLimit getQuality() {
return this.quality;
}
@@ -106,14 +98,13 @@ public class SearchSpecification {
}
public String toString() {
return "SearchSpecification(query=" + this.getQuery() + ", domains=" + this.getDomains() + ", searchSetIdentifier=" + this.getSearchSetIdentifier() + ", humanQuery=" + this.getHumanQuery() + ", quality=" + this.getQuality() + ", year=" + this.getYear() + ", size=" + this.getSize() + ", rank=" + this.getRank() + ", queryLimits=" + this.getQueryLimits() + ", queryStrategy=" + this.getQueryStrategy() + ", rankingParams=" + this.getRankingParams() + ")";
return "SearchSpecification(query=" + this.getQuery() + ", domains=" + this.getDomains() + ", searchSetIdentifier=" + this.getSearchSetIdentifier() + ", quality=" + this.getQuality() + ", year=" + this.getYear() + ", size=" + this.getSize() + ", rank=" + this.getRank() + ", queryLimits=" + this.getQueryLimits() + ", queryStrategy=" + this.getQueryStrategy() + ", rankingParams=" + this.getRankingParams() + ")";
}
public static class SearchSpecificationBuilder {
private SearchQuery query;
private List<Integer> domains;
private String searchSetIdentifier;
private String humanQuery;
private SpecificationLimit quality$value;
private boolean quality$set;
private SpecificationLimit year$value;
@@ -144,11 +135,6 @@ public class SearchSpecification {
return this;
}
public SearchSpecificationBuilder humanQuery(String humanQuery) {
this.humanQuery = humanQuery;
return this;
}
public SearchSpecificationBuilder quality(SpecificationLimit quality) {
this.quality$value = quality;
this.quality$set = true;
@@ -205,11 +191,7 @@ public class SearchSpecification {
if (!this.rank$set) {
rank$value = SpecificationLimit.none();
}
return new SearchSpecification(this.query, this.domains, this.searchSetIdentifier, this.humanQuery, quality$value, year$value, size$value, rank$value, this.queryLimits, this.queryStrategy, this.rankingParams);
}
public String toString() {
return "SearchSpecification.SearchSpecificationBuilder(query=" + this.query + ", domains=" + this.domains + ", searchSetIdentifier=" + this.searchSetIdentifier + ", humanQuery=" + this.humanQuery + ", quality$value=" + this.quality$value + ", year$value=" + this.year$value + ", size$value=" + this.size$value + ", rank$value=" + this.rank$value + ", queryLimits=" + this.queryLimits + ", queryStrategy=" + this.queryStrategy + ", rankingParams=" + this.rankingParams + ")";
return new SearchSpecification(this.query, this.domains, this.searchSetIdentifier, quality$value, year$value, size$value, rank$value, this.queryLimits, this.queryStrategy, this.rankingParams);
}
}
}

View File

@@ -1,56 +0,0 @@
package nu.marginalia.api.searchquery.model.results;
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import java.util.BitSet;
public class ResultRankingContext {
private final int docCount;
public final RpcResultRankingParameters params;
public final BitSet regularMask;
public final BitSet ngramsMask;
/** CqDataInt associated with frequency information of the terms in the query
* in the full index. The dataset is indexed by the compiled query. */
public final CqDataInt fullCounts;
/** CqDataInt associated with frequency information of the terms in the query
* in the full index. The dataset is indexed by the compiled query. */
public final CqDataInt priorityCounts;
public ResultRankingContext(int docCount,
RpcResultRankingParameters params,
BitSet ngramsMask,
BitSet regularMask,
CqDataInt fullCounts,
CqDataInt prioCounts)
{
this.docCount = docCount;
this.params = params;
this.ngramsMask = ngramsMask;
this.regularMask = regularMask;
this.fullCounts = fullCounts;
this.priorityCounts = prioCounts;
}
public int termFreqDocCount() {
return docCount;
}
@Override
public String toString() {
return "ResultRankingContext{" +
"docCount=" + docCount +
", params=" + params +
", regularMask=" + regularMask +
", ngramsMask=" + ngramsMask +
", fullCounts=" + fullCounts +
", priorityCounts=" + priorityCounts +
'}';
}
}

View File

@@ -34,8 +34,6 @@ public class QueryFactory {
this.queryExpansion = queryExpansion;
}
public ProcessedQuery createQuery(QueryParams params,
@Nullable RpcResultRankingParameters rankingParams) {
final var query = params.humanQuery();
@@ -153,7 +151,6 @@ public class QueryFactory {
var specsBuilder = SearchSpecification.builder()
.query(queryBuilder.build())
.humanQuery(query)
.quality(qualityLimit)
.year(year)
.size(size)

View File

@@ -241,7 +241,6 @@ public class QueryFactoryTest {
Assertions.assertTrue(subquery.query.compiledQuery.contains(" bob "));
Assertions.assertFalse(subquery.query.compiledQuery.contains(" bob's "));
Assertions.assertEquals("\"bob's cars\"", subquery.humanQuery);
}
@Test

View File

@@ -14,6 +14,7 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:libraries:array')
implementation project(':code:libraries:native')
implementation project(':code:libraries:btree')
implementation project(':code:libraries:coded-sequence')
implementation project(':code:libraries:language-processing')

View File

@@ -1,9 +1,11 @@
package nu.marginalia.index.forward;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.ffi.LinuxSystemCalls;
import nu.marginalia.index.forward.spans.DocumentSpans;
import nu.marginalia.index.forward.spans.ForwardIndexSpansReader;
import nu.marginalia.index.forward.spans.IndexSpansReader;
import nu.marginalia.model.id.UrlIdCodec;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -22,16 +24,15 @@ import static nu.marginalia.index.forward.ForwardIndexParameters.*;
* and a mapping between document identifiers to the index into the
* data array.
* <p/>
* Since the total data is relatively small, this is kept in memory to
* reduce the amount of disk thrashing.
* <p/>
* The metadata is a binary encoding of {@see nu.marginalia.idx.DocumentMetadata}
*/
public class ForwardIndexReader {
private final LongArray ids;
private final LongArray data;
private final ForwardIndexSpansReader spansReader;
private volatile Long2IntOpenHashMap idsMap;
private final IndexSpansReader spansReader;
private final Logger logger = LoggerFactory.getLogger(getClass());
@@ -64,7 +65,22 @@ public class ForwardIndexReader {
ids = loadIds(idsFile);
data = loadData(dataFile);
spansReader = new ForwardIndexSpansReader(spansFile);
LinuxSystemCalls.madviseRandom(data.getMemorySegment());
LinuxSystemCalls.madviseRandom(ids.getMemorySegment());
spansReader = IndexSpansReader.open(spansFile);
Thread.ofPlatform().start(this::createIdsMap);
}
private void createIdsMap() {
Long2IntOpenHashMap idsMap = new Long2IntOpenHashMap((int) ids.size());
for (int i = 0; i < ids.size(); i++) {
idsMap.put(ids.get(i), i);
}
this.idsMap = idsMap;
logger.info("Forward index loaded into RAM");
}
private static LongArray loadIds(Path idsFile) throws IOException {
@@ -106,7 +122,11 @@ public class ForwardIndexReader {
private int idxForDoc(long docId) {
assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
long offset = ids.binarySearch(docId, 0, ids.size());
if (idsMap != null) {
return idsMap.getOrDefault(docId, -1);
}
long offset = ids.binarySearch2(docId, 0, ids.size());
if (offset >= ids.size() || offset < 0 || ids.get(offset) != docId) {
if (getClass().desiredAssertionStatus()) {
@@ -118,22 +138,27 @@ public class ForwardIndexReader {
return (int) offset;
}
public DocumentSpans getDocumentSpans(Arena arena, long docId) {
long offset = idxForDoc(docId);
if (offset < 0) return new DocumentSpans();
long encodedOffset = data.get(ENTRY_SIZE * offset + SPANS_OFFSET);
public DocumentSpans[] getDocumentSpans(Arena arena, long[] docIds) {
long[] offsets = new long[docIds.length];
for (int i = 0; i < docIds.length; i++) {
long offset = idxForDoc(docIds[i]);
if (offset >= 0) {
offsets[i] = data.get(ENTRY_SIZE * offset + SPANS_OFFSET);
}
else {
offsets[i] = -1;
}
}
try {
return spansReader.readSpans(arena, encodedOffset);
return spansReader.readSpans(arena, offsets);
}
catch (IOException ex) {
logger.error("Failed to read spans for doc " + docId, ex);
return new DocumentSpans();
logger.error("Failed to read spans for docIds", ex);
return new DocumentSpans[docIds.length];
}
}
public int totalDocCount() {
return (int) ids.size();
}
@@ -141,6 +166,8 @@ public class ForwardIndexReader {
public void close() {
if (data != null)
data.close();
if (ids != null)
ids.close();
}
public boolean isLoaded() {

View File

@@ -5,7 +5,7 @@ import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexParameters;
import nu.marginalia.index.forward.spans.ForwardIndexSpansWriter;
import nu.marginalia.index.forward.spans.IndexSpansWriter;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata;
@@ -65,7 +65,7 @@ public class ForwardIndexConverter {
logger.info("Domain Rankings size = {}", domainRankings.size());
try (var progress = heartbeat.createProcessTaskHeartbeat(TaskSteps.class, "forwardIndexConverter");
var spansWriter = new ForwardIndexSpansWriter(outputFileSpansData)
var spansWriter = new IndexSpansWriter(outputFileSpansData)
) {
progress.progress(TaskSteps.GET_DOC_IDS);

View File

@@ -11,6 +11,9 @@ public class DocumentSpan {
/** A list of the interlaced start and end positions of each span in the document of this type */
private final IntList startsEnds;
public DocumentSpan(IntList startsEnds) {
this.startsEnds = startsEnds;
}
public DocumentSpan(CodedSequence startsEnds) {
this.startsEnds = startsEnds.values();
}

View File

@@ -1,5 +1,6 @@
package nu.marginalia.index.forward.spans;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.sequence.CodedSequence;
@@ -39,6 +40,23 @@ public class DocumentSpans {
return EMPTY_SPAN;
}
void accept(byte code, IntList positions) {
if (code == HtmlTag.HEADING.code)
this.heading = new DocumentSpan(positions);
else if (code == HtmlTag.TITLE.code)
this.title = new DocumentSpan(positions);
else if (code == HtmlTag.NAV.code)
this.nav = new DocumentSpan(positions);
else if (code == HtmlTag.CODE.code)
this.code = new DocumentSpan(positions);
else if (code == HtmlTag.ANCHOR.code)
this.anchor = new DocumentSpan(positions);
else if (code == HtmlTag.EXTERNAL_LINKTEXT.code)
this.externalLinkText = new DocumentSpan(positions);
else if (code == HtmlTag.BODY.code)
this.body = new DocumentSpan(positions);
}
void accept(byte code, CodedSequence positions) {
if (code == HtmlTag.HEADING.code)
this.heading = new DocumentSpan(positions);

View File

@@ -0,0 +1,25 @@
package nu.marginalia.index.forward.spans;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.file.Path;
public interface IndexSpansReader extends AutoCloseable {
DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException;
DocumentSpans[] readSpans(Arena arena, long[] encodedOffsets) throws IOException;
static IndexSpansReader open(Path fileName) throws IOException {
int version = SpansCodec.parseSpanFilesFooter(fileName);
if (version == SpansCodec.SpansCodecVersion.COMPRESSED.ordinal()) {
return new IndexSpansReaderCompressed(fileName);
}
else if (version == SpansCodec.SpansCodecVersion.PLAIN.ordinal()) {
return new IndexSpansReaderPlain(fileName);
}
else {
throw new IllegalArgumentException("Unsupported spans file version: " + version);
}
}
void close() throws IOException;
}

View File

@@ -10,11 +10,11 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
@SuppressWarnings("preview")
public class ForwardIndexSpansReader implements AutoCloseable {
@Deprecated
public class IndexSpansReaderCompressed implements AutoCloseable, IndexSpansReader {
private final FileChannel spansFileChannel;
public ForwardIndexSpansReader(Path spansFile) throws IOException {
public IndexSpansReaderCompressed(Path spansFile) throws IOException {
this.spansFileChannel = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ);
}
@@ -51,6 +51,17 @@ public class ForwardIndexSpansReader implements AutoCloseable {
return ret;
}
@Override
public DocumentSpans[] readSpans(Arena arena, long[] encodedOffsets) throws IOException {
DocumentSpans[] ret = new DocumentSpans[encodedOffsets.length];
for (int i = 0; i < encodedOffsets.length; i++) {
if (encodedOffsets[i] >= 0) {
ret[i] = readSpans(arena, encodedOffsets[i]);
}
}
return ret;
}
@Override
public void close() throws IOException {
spansFileChannel.close();

View File

@@ -0,0 +1,95 @@
package nu.marginalia.index.forward.spans;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import nu.marginalia.uring.UringFileReader;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.nio.file.Path;
import java.util.List;
public class IndexSpansReaderPlain implements IndexSpansReader {
private final UringFileReader uringReader;
public IndexSpansReaderPlain(Path spansFile) throws IOException {
uringReader = new UringFileReader(spansFile, true);
uringReader.fadviseWillneed();
}
@Override
public DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException {
// for testing, slow
return readSpans(arena, new long[] { encodedOffset})[0];
}
public DocumentSpans decode(MemorySegment ms) {
int count = ms.get(ValueLayout.JAVA_INT, 0);
int pos = 4;
DocumentSpans ret = new DocumentSpans();
// Decode each span
for (int spanIdx = 0; spanIdx < count; spanIdx++) {
byte code = ms.get(ValueLayout.JAVA_BYTE, pos);
short len = ms.get(ValueLayout.JAVA_SHORT, pos+2);
IntArrayList values = new IntArrayList(len);
pos += 4;
for (int i = 0; i < len; i++) {
values.add(ms.get(ValueLayout.JAVA_INT, pos + 4*i));
}
ret.accept(code, values);
pos += 4*len;
}
return ret;
}
@Override
public DocumentSpans[] readSpans(Arena arena, long[] encodedOffsets) {
int readCnt = 0;
for (long offset : encodedOffsets) {
if (offset < 0)
continue;
readCnt ++;
}
if (readCnt == 0) {
return new DocumentSpans[encodedOffsets.length];
}
long[] offsets = new long[readCnt];
int[] sizes = new int[readCnt];
for (int idx = 0, j = 0; idx < encodedOffsets.length; idx++) {
if (encodedOffsets[idx] < 0)
continue;
long offset = encodedOffsets[idx];
offsets[j] = SpansCodec.decodeStartOffset(offset);
sizes[j] = SpansCodec.decodeSize(offset);
j++;
}
List<MemorySegment> buffers = uringReader.readUnalignedInDirectMode(arena, offsets, sizes, 4096);
DocumentSpans[] ret = new DocumentSpans[encodedOffsets.length];
for (int idx = 0, j = 0; idx < encodedOffsets.length; idx++) {
if (encodedOffsets[idx] < 0)
continue;
ret[idx] = decode(buffers.get(j++));
}
return ret;
}
@Override
public void close() throws IOException {
uringReader.close();
}
}
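For reference, the record layout that decode() walks, as implied by this reader together with the IndexSpansWriter below:

// Record layout that decode() above walks, matching IndexSpansWriter below:
//
//   int32  count                       number of spans in the record
//   then, for each span:
//     int8   code                      HtmlTag code of the span type
//     int8   padding                   keeps each entry 4-byte aligned
//     int16  len                       number of positions in the span
//     int32 x len                      interlaced start/end positions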

View File

@@ -1,20 +1,23 @@
package nu.marginalia.index.forward.spans;
import nu.marginalia.sequence.VarintCodedSequence;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
public class ForwardIndexSpansWriter implements AutoCloseable {
public class IndexSpansWriter implements AutoCloseable {
private final FileChannel outputChannel;
private final ByteBuffer work = ByteBuffer.allocate(32);
private final ByteBuffer work = ByteBuffer.allocate(4*1024*1024).order(ByteOrder.nativeOrder());
private long stateStartOffset = -1;
private int stateLength = -1;
public ForwardIndexSpansWriter(Path outputFileSpansData) throws IOException {
public IndexSpansWriter(Path outputFileSpansData) throws IOException {
this.outputChannel = (FileChannel) Files.newByteChannel(outputFileSpansData, StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE);
}
@@ -23,7 +26,7 @@ public class ForwardIndexSpansWriter implements AutoCloseable {
stateLength = 0;
work.clear();
work.put((byte) count);
work.putInt(count);
work.flip();
while (work.hasRemaining())
@@ -33,12 +36,17 @@ public class ForwardIndexSpansWriter implements AutoCloseable {
public void writeSpan(byte spanCode, ByteBuffer sequenceData) throws IOException {
work.clear();
work.put(spanCode);
work.putShort((short) sequenceData.remaining());
work.put((byte) 0); // Ensure we're byte aligned
var sequence = new VarintCodedSequence(sequenceData);
work.putShort((short) sequence.valueCount());
var iter = sequence.iterator();
while (iter.hasNext()) {
work.putInt(iter.nextInt());
}
work.flip();
while (work.hasRemaining() || sequenceData.hasRemaining()) {
stateLength += (int) outputChannel.write(new ByteBuffer[]{work, sequenceData});
}
stateLength += outputChannel.write(work);
}
public long endRecord() {
@@ -47,6 +55,11 @@ public class ForwardIndexSpansWriter implements AutoCloseable {
@Override
public void close() throws IOException {
ByteBuffer footer = SpansCodec.createSpanFilesFooter(SpansCodec.SpansCodecVersion.PLAIN, (int) (4096 - (outputChannel.position() & 4095)));
outputChannel.position(outputChannel.size());
while (footer.hasRemaining()) {
outputChannel.write(footer, outputChannel.size());
}
outputChannel.close();
}
}

View File

@@ -1,6 +1,21 @@
package nu.marginalia.index.forward.spans;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
public class SpansCodec {
public static int MAGIC_INT = 0xF000F000;
public static int FOOTER_SIZE = 8;
public enum SpansCodecVersion {
@Deprecated
COMPRESSED,
PLAIN
}
public static long encode(long startOffset, long size) {
assert size < 0x1000_0000L : "Size must be less than 2^28";
@@ -11,7 +26,39 @@ public class SpansCodec {
return encoded >>> 28;
}
public static long decodeSize(long encoded) {
return encoded & 0x0FFF_FFFFL;
public static int decodeSize(long encoded) {
return (int) (encoded & 0x0FFF_FFFFL);
}
public static ByteBuffer createSpanFilesFooter(SpansCodecVersion version, int padSize) {
if (padSize < FOOTER_SIZE) {
padSize += 4096;
}
ByteBuffer footer = ByteBuffer.allocate(padSize);
footer.position(padSize - FOOTER_SIZE);
footer.putInt(SpansCodec.MAGIC_INT);
footer.put((byte) version.ordinal());
footer.put((byte) 0);
footer.put((byte) 0);
footer.put((byte) 0);
footer.flip();
return footer;
}
public static int parseSpanFilesFooter(Path spansFile) throws IOException {
ByteBuffer buffer = ByteBuffer.allocate(FOOTER_SIZE);
try (var fc = FileChannel.open(spansFile, StandardOpenOption.READ)) {
if (fc.size() < FOOTER_SIZE) return 0;
fc.read(buffer, fc.size() - buffer.capacity());
buffer.flip();
int magic = buffer.getInt();
if (magic != MAGIC_INT) {
return 0;
}
return buffer.get();
}
}
}
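Given the two decode methods, encode() presumably packs the start offset into the upper bits and the size into the low 28; a round-trip sketch (the encode body is not shown in this diff):

// Presumed inverse of the decode methods above (encode's body is not shown
// in this diff): start offset in the upper bits, size in the low 28.
long startOffset = 81920;
int size = 1234;                               // must be < 2^28
long encoded = (startOffset << 28) | size;     // what encode() presumably does
assert SpansCodec.decodeStartOffset(encoded) == startOffset; // encoded >>> 28
assert SpansCodec.decodeSize(encoded) == size;               // encoded & 0x0FFF_FFFF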

View File

@@ -1,8 +1,9 @@
package nu.marginalia.index.forward;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.index.forward.spans.ForwardIndexSpansReader;
import nu.marginalia.index.forward.spans.ForwardIndexSpansWriter;
import nu.marginalia.index.forward.spans.IndexSpansReader;
import nu.marginalia.index.forward.spans.IndexSpansReaderPlain;
import nu.marginalia.index.forward.spans.IndexSpansWriter;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.sequence.VarintCodedSequence;
import org.junit.jupiter.api.AfterEach;
@@ -17,10 +18,10 @@ import java.nio.file.Path;
import static org.junit.jupiter.api.Assertions.*;
class ForwardIndexSpansReaderTest {
class IndexSpansReaderTest {
Path testFile = Files.createTempFile("test", ".idx");
ForwardIndexSpansReaderTest() throws IOException {
IndexSpansReaderTest() throws IOException {
}
@AfterEach
@@ -34,7 +35,7 @@ class ForwardIndexSpansReaderTest {
long offset1;
long offset2;
try (var writer = new ForwardIndexSpansWriter(testFile)) {
try (var writer = new IndexSpansWriter(testFile)) {
writer.beginRecord(1);
writer.writeSpan(HtmlTag.HEADING.code, VarintCodedSequence.generate(1, 3, 5, 8).buffer());
offset1 = writer.endRecord();
@@ -46,7 +47,7 @@ class ForwardIndexSpansReaderTest {
offset2 = writer.endRecord();
}
try (var reader = new ForwardIndexSpansReader(testFile);
try (var reader = IndexSpansReader.open(testFile);
var arena = Arena.ofConfined()
) {
var spans1 = reader.readSpans(arena, offset1);
@@ -77,13 +78,13 @@ class ForwardIndexSpansReaderTest {
@Test
void testContainsRange() throws IOException {
long offset1;
try (var writer = new ForwardIndexSpansWriter(testFile)) {
try (var writer = new IndexSpansWriter(testFile)) {
writer.beginRecord(1);
writer.writeSpan(HtmlTag.HEADING.code, VarintCodedSequence.generate( 1, 2, 10, 15, 20, 25).buffer());
offset1 = writer.endRecord();
}
try (var reader = new ForwardIndexSpansReader(testFile);
try (var reader = new IndexSpansReaderPlain(testFile);
var arena = Arena.ofConfined()
) {
var spans1 = reader.readSpans(arena, offset1);
@@ -104,13 +105,13 @@ class ForwardIndexSpansReaderTest {
@Test
void testContainsRangeExact() throws IOException {
long offset1;
try (var writer = new ForwardIndexSpansWriter(testFile)) {
try (var writer = new IndexSpansWriter(testFile)) {
writer.beginRecord(1);
writer.writeSpan(HtmlTag.HEADING.code, VarintCodedSequence.generate( 1, 2, 10, 15, 20, 25).buffer());
offset1 = writer.endRecord();
}
try (var reader = new ForwardIndexSpansReader(testFile);
try (var reader = new IndexSpansReaderPlain(testFile);
var arena = Arena.ofConfined()
) {
var spans1 = reader.readSpans(arena, offset1);
@@ -131,13 +132,13 @@ class ForwardIndexSpansReaderTest {
@Test
void testCountRangeMatches() throws IOException {
long offset1;
try (var writer = new ForwardIndexSpansWriter(testFile)) {
try (var writer = new IndexSpansWriter(testFile)) {
writer.beginRecord(1);
writer.writeSpan(HtmlTag.HEADING.code, VarintCodedSequence.generate( 1, 2, 10, 15, 20, 25).buffer());
offset1 = writer.endRecord();
}
try (var reader = new ForwardIndexSpansReader(testFile);
try (var reader = new IndexSpansReaderPlain(testFile);
var arena = Arena.ofConfined()
) {
var spans1 = reader.readSpans(arena, offset1);

View File

@@ -0,0 +1,54 @@
plugins {
id 'java'
id 'application'
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
application {
mainClass = 'nu.marginalia.index.perftest.PerfTestMain'
}
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:common:config')
implementation project(':code:common:db')
implementation project(':code:libraries:array')
implementation project(':code:libraries:native')
implementation project(':code:libraries:btree')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:common:linkdb')
implementation project(':code:index')
implementation project(':code:index:query')
implementation project(':code:index:index-forward')
implementation project(':code:index:index-reverse')
implementation project(':third-party:commons-codec')
implementation project(':code:functions:search-query')
implementation project(':code:functions:search-query:api')
implementation libs.slop
implementation libs.roaringbitmap
implementation libs.bundles.slf4j
implementation libs.guava
libs.bundles.grpc.get().each {
implementation dependencies.create(it) {
exclude group: 'com.google.guava'
}
}
implementation libs.notnull
implementation libs.trove
implementation libs.fastutil
implementation libs.bundles.gson
implementation libs.bundles.mariadb
}

View File

@@ -0,0 +1,262 @@
package nu.marginalia.index.perftest;
import nu.marginalia.ffi.LinuxSystemCalls;
import nu.marginalia.uring.UringFileReader;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
import java.util.stream.LongStream;
public class IoPatternsMain {
static void testBuffered(int sz, int small, int large, int iters) {
try {
Path largeFile = Path.of("/home/vlofgren/largefile.dat");
long fileSize = Files.size(largeFile);
Random r = new Random();
List<MemorySegment> segments = new ArrayList<>();
for (int i = 0; i < sz; i++) {
if (small == large) {
segments.add(Arena.ofAuto().allocate(small));
}
else {
segments.add(Arena.ofAuto().allocate(r.nextInt(small, large)));
}
}
List<Long> offsets = new ArrayList<>();
long[] samples = new long[1000];
int si = 0;
try (UringFileReader reader = new UringFileReader(largeFile, false)) {
for (int iter = 0; iter < iters; ) {
if (si == samples.length) {
Arrays.sort(samples);
double p1 = samples[10] / 1_000.;
double p10 = samples[100] / 1_000.;
double p90 = samples[900] / 1_000.;
double p99 = samples[990] / 1_000.;
double avg = LongStream.of(samples).average().getAsDouble() / 1000.;
System.out.println("B"+"\t"+avg+"\t"+p1 + " " + p10 + " " + p90 + " " + p99);
si = 0;
iter++;
}
offsets.clear();
for (int i = 0; i < sz; i++) {
offsets.add(r.nextLong(0, fileSize - 256));
}
long st = System.nanoTime();
reader.read(segments, offsets);
long et = System.nanoTime();
samples[si++] = et - st;
}
}
}
catch (IOException e) {
e.printStackTrace();
}
}
static void testBufferedPread(int sz, int iters) {
try {
Path largeFile = Path.of("/home/vlofgren/largefile.dat");
long fileSize = Files.size(largeFile);
Random r = new Random();
List<MemorySegment> segments = new ArrayList<>();
for (int i = 0; i < sz; i++) {
segments.add(Arena.ofAuto().allocate(r.nextInt(24, 256)));
}
List<Long> offsets = new ArrayList<>();
long[] samples = new long[1000];
int si = 0;
int fd = -1;
try {
fd = LinuxSystemCalls.openBuffered(largeFile);
LinuxSystemCalls.fadviseRandom(fd);
for (int iter = 0; iter < iters; ) {
if (si == samples.length) {
Arrays.sort(samples);
double p1 = samples[10] / 1_000.;
double p10 = samples[100] / 1_000.;
double p90 = samples[900] / 1_000.;
double p99 = samples[990] / 1_000.;
double avg = LongStream.of(samples).average().getAsDouble() / 1000.;
System.out.println("BP"+"\t"+avg+"\t"+p1 + " " + p10 + " " + p90 + " " + p99);
si = 0;
iter++;
}
offsets.clear();
for (int i = 0; i < sz; i++) {
offsets.add(r.nextLong(0, fileSize - 256));
}
long st = System.nanoTime();
for (int i = 0; i < sz; i++) {
LinuxSystemCalls.readAt(fd, segments.get(i), offsets.get(i));
}
long et = System.nanoTime();
samples[si++] = et - st;
}
}
finally {
LinuxSystemCalls.closeFd(fd);
}
}
catch (IOException e) {
e.printStackTrace();
}
}
static void testDirect(int blockSize, int sz, int iters) {
try {
Path largeFile = Path.of("/home/vlofgren/largefile.dat");
int fileSizeBlocks = (int) ((Files.size(largeFile) & -blockSize) / blockSize);
Random r = new Random();
List<MemorySegment> segments = new ArrayList<>();
for (int i = 0; i < sz; i++) {
segments.add(Arena.ofAuto().allocate(blockSize, blockSize));
}
List<Long> offsets = new ArrayList<>();
long[] samples = new long[1000];
int si = 0;
try (UringFileReader reader = new UringFileReader(largeFile, true)) {
for (int iter = 0; iter < iters; ) {
if (si == samples.length) {
Arrays.sort(samples);
double p1 = samples[10] / 1_000.;
double p10 = samples[100] / 1_000.;
double p90 = samples[900] / 1_000.;
double p99 = samples[990] / 1_000.;
double avg = LongStream.of(samples).average().getAsDouble() / 1000.;
System.out.println("DN"+blockSize+"\t"+avg+"\t"+p1 + " " + p10 + " " + p90 + " " + p99);
si = 0;
iter++; // count a completed batch of samples
}
offsets.clear();
for (int i = 0; i < sz; i++) {
offsets.add(blockSize * r.nextLong(0, fileSizeBlocks));
}
long st = System.nanoTime();
reader.read(segments, offsets);
long et = System.nanoTime();
samples[si++] = et - st;
}
}
}
catch (IOException e) {
e.printStackTrace();
}
}
static void testDirect1(int blockSize, int iters) {
try {
Path largeFile = Path.of("/home/vlofgren/largefile.dat");
int fileSizeBlocks = (int) ((Files.size(largeFile) & -blockSize) / blockSize);
Random r = new Random();
MemorySegment segment = Arena.global().allocate(blockSize, blockSize);
long[] samples = new long[1000];
int si = 0;
int fd = LinuxSystemCalls.openDirect(largeFile);
if (fd < 0) {
throw new IOException("open failed");
}
try {
for (int iter = 0; iter < iters; ) {
if (si == samples.length) {
Arrays.sort(samples);
double p1 = samples[10] / 1_000.;
double p10 = samples[100] / 1_000.;
double p90 = samples[900] / 1_000.;
double p99 = samples[990] / 1_000.;
double avg = LongStream.of(samples).average().getAsDouble() / 1000.;
System.out.println("D1"+blockSize+"\t"+avg+"\t"+p1 + " " + p10 + " " + p90 + " " + p99);
si = 0;
iter++; // count a completed batch of samples
}
long st = System.nanoTime();
int ret;
long readOffset = blockSize * r.nextLong(0, fileSizeBlocks);
if (blockSize != (ret = LinuxSystemCalls.readAt(fd, segment, readOffset))) {
throw new IOException("pread failed: " + ret);
}
long et = System.nanoTime();
samples[si++] = et - st;
}
}
finally {
LinuxSystemCalls.closeFd(fd);
}
}
catch (IOException e) {
e.printStackTrace();
}
}
public static void main(String[] args) throws Exception {
// Thread.ofPlatform().start(() -> testBuffered(128, 32, 65536,1000));
Thread.ofPlatform().start(() -> testDirect(8192*4, 128,1000));
// Thread.ofPlatform().start(() -> testBuffered(128, 1000));
// Thread.ofPlatform().start(() -> testBuffered(128, 1000));
// Thread.ofPlatform().start(() -> testBuffered(128, 1000));
// Thread.ofPlatform().start(() -> testBufferedPread(128, 1000));
// Thread.ofPlatform().start(() -> testDirect1(1024, 1000));
// Thread.ofPlatform().start(() -> testDirect1(1024, 1000));
// Thread.ofPlatform().start(() -> testDirect1(1024, 1000));
// Thread.ofPlatform().start(() -> testDirect1(1024*1024, 1000));
// Thread.ofPlatform().start(() -> testDirect1(1024*1024, 1000));
// Thread.ofPlatform().start(() -> testDirect(512, 512,1000));
// Thread.ofPlatform().start(() -> testDirect(512, 512,1000));
// Thread.ofPlatform().start(() -> testDirect(512, 512,1000));
// Thread.ofPlatform().start(() -> testDirect(512, 100));
// Thread.ofPlatform().start(() -> testDirect(512, 100));
// Thread.ofPlatform().start(() -> testDirect(512, 100));
// Thread.ofPlatform().start(() -> testDirect(512, 100));
// Thread.ofPlatform().start(() -> testBuffered(512, 1000));
// Thread.ofPlatform().start(() -> testBuffered(512, 1000));
// Thread.ofPlatform().start(() -> testBuffered(512, 1000));
// Thread.ofPlatform().start(() -> testBuffered(512, 1000));
// Thread.ofPlatform().start(() -> testBuffered(100));
// Thread.ofPlatform().start(() -> testBuffered(100));
for (;;);
// testBuffered(100);
}
}

View File

@@ -0,0 +1,313 @@
package nu.marginalia.index.perftest;
import gnu.trove.list.array.TLongArrayList;
import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.functions.searchquery.QueryFactory;
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
import nu.marginalia.index.FullReverseIndexReader;
import nu.marginalia.index.IndexQueryExecution;
import nu.marginalia.index.PrioReverseIndexReader;
import nu.marginalia.index.forward.ForwardIndexReader;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.ResultRankingContext;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.model.SearchTerms;
import nu.marginalia.index.positions.PositionsFileReader;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.results.DomainRankingOverrides;
import nu.marginalia.index.results.IndexResultRankingService;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.searchset.SearchSetAny;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.SQLException;
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.TimeoutException;
public class PerfTestMain {
static Duration warmupTime = Duration.ofMinutes(1);
static Duration runTime = Duration.ofMinutes(10);
public static void main(String[] args) {
if (args.length != 4) {
System.err.println("Arguments: home-dir index-dir query");
System.exit(255);
}
try {
Path indexDir = Paths.get(args[0]);
if (!Files.isDirectory(indexDir)) {
System.err.println("Index directory is not a directory");
System.exit(255);
}
Path homeDir = Paths.get(args[1]);
String scenario = args[2];
String query = args[3];
switch (scenario) {
case "valuation" -> runValuation(indexDir, homeDir, query);
case "lookup" -> runLookup(indexDir, homeDir, query);
case "execution" -> runExecution(indexDir, homeDir, query);
}
System.exit(0);
}
catch (NumberFormatException e) {
System.err.println("Arguments: data-dir index-dir query");
System.exit(255);
}
catch (Exception ex) {
System.err.println("Error during testing");
ex.printStackTrace();
System.exit(255);
}
System.out.println(Arrays.toString(args));
}
private static CombinedIndexReader createCombinedIndexReader(Path indexDir) throws IOException {
return new CombinedIndexReader(
new ForwardIndexReader(
indexDir.resolve("ir/fwd-doc-id.dat"),
indexDir.resolve("ir/fwd-doc-data.dat"),
indexDir.resolve("ir/fwd-spans.dat")
),
new FullReverseIndexReader(
"full",
indexDir.resolve("ir/rev-words.dat"),
indexDir.resolve("ir/rev-docs.dat"),
new PositionsFileReader(indexDir.resolve("ir/rev-positions.dat"))
),
new PrioReverseIndexReader(
"prio",
indexDir.resolve("ir/rev-prio-words.dat"),
indexDir.resolve("ir/rev-prio-docs.dat")
)
);
}
private static IndexResultRankingService createIndexResultRankingService(Path indexDir, CombinedIndexReader combinedIndexReader) throws IOException, SQLException {
return new IndexResultRankingService(
new DocumentDbReader(indexDir.resolve("ldbr/documents.db")),
new StatefulIndex(combinedIndexReader),
new DomainRankingOverrides(null, Path.of("xxxx"))
);
}
static QueryFactory createQueryFactory(Path homeDir) throws IOException {
return new QueryFactory(
new QueryExpansion(
new TermFrequencyDict(homeDir.resolve("model/tfreq-new-algo3.bin")),
new NgramLexicon()
)
);
}
public static void runValuation(Path homeDir,
Path indexDir,
String rawQuery) throws IOException, SQLException, TimeoutException {
CombinedIndexReader indexReader = createCombinedIndexReader(indexDir);
QueryFactory queryFactory = createQueryFactory(homeDir);
IndexResultRankingService rankingService = createIndexResultRankingService(indexDir, indexReader);
var queryLimits = RpcQueryLimits.newBuilder()
.setTimeoutMs(10_000)
.setResultsTotal(1000)
.setResultsByDomain(10)
.setFetchSize(4096)
.build();
SearchSpecification parsedQuery = queryFactory.createQuery(new QueryParams(rawQuery, queryLimits, "NONE", NsfwFilterTier.OFF), PrototypeRankingParameters.sensibleDefaults()).specs;
System.out.println("Query compiled to: " + parsedQuery.query.compiledQuery);
SearchParameters searchParameters = new SearchParameters(parsedQuery, new SearchSetAny());
List<IndexQuery> queries = indexReader.createQueries(new SearchTerms(searchParameters.query, searchParameters.compiledQueryIds), searchParameters.queryParams, new IndexSearchBudget(10_000));
TLongArrayList allResults = new TLongArrayList();
LongQueryBuffer buffer = new LongQueryBuffer(512);
for (var query : queries) {
while (query.hasMore() && allResults.size() < 512) {
query.getMoreResults(buffer);
allResults.addAll(buffer.copyData());
buffer.reset();
}
if (allResults.size() >= 512)
break;
}
allResults.sort();
if (allResults.size() > 512) {
allResults.subList(512, allResults.size()).clear();
}
var rankingContext = ResultRankingContext.create(indexReader, searchParameters);
var rankingData = rankingService.prepareRankingData(rankingContext, new CombinedDocIdList(allResults.toArray()), null);
int sum = 0;
Instant runEndTime = Instant.now().plus(runTime);
Instant runStartTime = Instant.now();
int sum2 = 0;
List<Double> times = new ArrayList<>();
int iter;
for (iter = 0;; iter++) {
IndexSearchBudget budget = new IndexSearchBudget(10000);
long start = System.nanoTime();
sum2 += rankingService.rankResults(budget, rankingContext, rankingData, false).size();
long end = System.nanoTime();
times.add((end - start)/1_000_000.);
if ((iter % 100) == 0) {
if (Instant.now().isAfter(runEndTime)) {
break;
}
if (times.size() > 100) {
double[] timesSample = times.stream().mapToDouble(Double::doubleValue).skip(times.size() - 100).sorted().toArray();
System.out.format("P1: %f P10: %f, P90: %f, P99: %f\n", timesSample[1], timesSample[10], timesSample[90], timesSample[99]);
}
System.out.println(Duration.between(runStartTime, Instant.now()).toMillis() / 1000. + " best times: " + (allResults.size() / 512.) * times.stream().mapToDouble(Double::doubleValue).sorted().limit(3).average().orElse(-1));
}
}
System.out.println("Benchmark complete after " + iter + " iters!");
System.out.println("Best times: " + (allResults.size() / 512.) * times.stream().mapToDouble(Double::doubleValue).sorted().limit(3).average().orElse(-1));
System.out.println("Warmup sum: " + sum);
System.out.println("Main sum: " + sum2);
System.out.println(rankingData.size());
}
public static void runExecution(Path homeDir,
Path indexDir,
String rawQuery) throws IOException, SQLException, InterruptedException {
CombinedIndexReader indexReader = createCombinedIndexReader(indexDir);
QueryFactory queryFactory = createQueryFactory(homeDir);
IndexResultRankingService rankingService = createIndexResultRankingService(indexDir, indexReader);
var queryLimits = RpcQueryLimits.newBuilder()
.setTimeoutMs(50)
.setResultsTotal(1000)
.setResultsByDomain(10)
.setFetchSize(4096)
.build();
SearchSpecification parsedQuery = queryFactory.createQuery(new QueryParams(rawQuery, queryLimits, "NONE", NsfwFilterTier.OFF), PrototypeRankingParameters.sensibleDefaults()).specs;
System.out.println("Query compiled to: " + parsedQuery.query.compiledQuery);
System.out.println("Running warmup loop!");
int sum = 0;
Instant runEndTime = Instant.now().plus(runTime);
Instant runStartTime = Instant.now();
int sum2 = 0;
List<Double> rates = new ArrayList<>();
List<Double> times = new ArrayList<>();
int iter;
for (iter = 0;; iter++) {
SearchParameters searchParameters = new SearchParameters(parsedQuery, new SearchSetAny());
var execution = new IndexQueryExecution(searchParameters, rankingService, indexReader);
long start = System.nanoTime();
execution.run();
long end = System.nanoTime();
sum2 += execution.itemsProcessed();
rates.add(execution.itemsProcessed() / ((end - start)/1_000_000_000.));
times.add((end - start)/1_000_000.);
indexReader.reset();
if ((iter % 100) == 0) {
if (Instant.now().isAfter(runEndTime)) {
break;
}
if (times.size() > 100) {
double[] timesSample = times.stream().mapToDouble(Double::doubleValue).skip(times.size() - 100).sorted().toArray();
System.out.format("P1: %f P10: %f, P90: %f, P99: %f\n", timesSample[1], timesSample[10], timesSample[90], timesSample[99]);
}
System.out.println(Duration.between(runStartTime, Instant.now()).toMillis() / 1000. + " best rates: " + rates.stream().mapToDouble(Double::doubleValue).map(i -> -i).sorted().map(i -> -i).limit(3).average().orElse(-1));
}
}
System.out.println("Benchmark complete after " + iter + " iters!");
System.out.println("Best counts: " + rates.stream().mapToDouble(Double::doubleValue).map(i -> -i).sorted().map(i -> -i).limit(3).average().orElse(-1));
System.out.println("Warmup sum: " + sum);
System.out.println("Main sum: " + sum2);
}
public static void runLookup(Path homeDir,
Path indexDir,
String rawQuery) throws IOException, SQLException
{
CombinedIndexReader indexReader = createCombinedIndexReader(indexDir);
QueryFactory queryFactory = createQueryFactory(homeDir);
var queryLimits = RpcQueryLimits.newBuilder()
.setTimeoutMs(10_000)
.setResultsTotal(1000)
.setResultsByDomain(10)
.setFetchSize(4096)
.build();
SearchSpecification parsedQuery = queryFactory.createQuery(new QueryParams(rawQuery, queryLimits, "NONE", NsfwFilterTier.OFF), PrototypeRankingParameters.sensibleDefaults()).specs;
System.out.println("Query compiled to: " + parsedQuery.query.compiledQuery);
SearchParameters searchParameters = new SearchParameters(parsedQuery, new SearchSetAny());
Instant runEndTime = Instant.now().plus(runTime);
LongQueryBuffer buffer = new LongQueryBuffer(512);
int sum1 = 0;
int iter;
Instant runStartTime = Instant.now();
int sum2 = 0;
List<Double> times = new ArrayList<>();
for (iter = 0;; iter++) {
indexReader.reset();
List<IndexQuery> queries = indexReader.createQueries(new SearchTerms(searchParameters.query, searchParameters.compiledQueryIds), searchParameters.queryParams, new IndexSearchBudget(150));
long start = System.nanoTime();
for (var query : queries) {
while (query.hasMore()) {
query.getMoreResults(buffer);
sum1 += buffer.end;
buffer.reset();
}
}
long end = System.nanoTime();
times.add((end - start)/1_000_000_000.);
if ((iter % 10) == 0) {
if (Instant.now().isAfter(runEndTime)) {
break;
}
if (times.size() > 100) {
double[] timesSample = times.stream().mapToDouble(Double::doubleValue).skip(times.size() - 100).sorted().toArray();
System.out.format("P1: %f P10: %f, P90: %f, P99: %f\n", timesSample[1], timesSample[10], timesSample[90], timesSample[99]);
}
System.out.println(Duration.between(runStartTime, Instant.now()).toMillis() / 1000. + " best times: " + times.stream().mapToDouble(Double::doubleValue).sorted().limit(3).average().orElse(-1));
}
}
System.out.println("Benchmark complete after " + iter + " iters!");
System.out.println("Best times: " + times.stream().mapToDouble(Double::doubleValue).sorted().limit(3).average().orElse(-1));
System.out.println("Warmup sum: " + sum1);
System.out.println("Main sum: " + sum2);
}
}
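For reference, the harness takes four positional arguments, home-dir index-dir scenario query, where scenario is one of valuation, lookup or execution. A hypothetical invocation (paths invented for illustration) would be: PerfTestMain /data/home /data/index lookup "linux kernel".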

View File

@@ -15,6 +15,7 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:libraries:array')
implementation project(':code:libraries:native')
implementation project(':code:libraries:btree')
implementation project(':code:libraries:coded-sequence')
implementation project(':code:libraries:random-write-funnel')

View File

@@ -1,32 +1,26 @@
package nu.marginalia.index;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.query.EntrySource;
import static java.lang.Math.min;
import nu.marginalia.skiplist.SkipListReader;
public class FullIndexEntrySource implements EntrySource {
private final String name;
private final BTreeReader reader;
int pos;
int endOffset;
final int entrySize;
private final SkipListReader reader;
private final long wordId;
public FullIndexEntrySource(String name,
BTreeReader reader,
int entrySize,
SkipListReader reader,
long wordId) {
this.name = name;
this.reader = reader;
this.entrySize = entrySize;
this.wordId = wordId;
pos = 0;
endOffset = pos + entrySize * reader.numEntries();
}
@Override
@@ -36,32 +30,14 @@ public class FullIndexEntrySource implements EntrySource {
@Override
public void read(LongQueryBuffer buffer) {
buffer.reset();
buffer.end = min(buffer.end, endOffset - pos);
reader.readData(buffer.data, buffer.end, pos);
pos += buffer.end;
destagger(buffer);
reader.getData(buffer);
buffer.uniq();
}
private void destagger(LongQueryBuffer buffer) {
if (entrySize == 1)
return;
for (int ri = entrySize, wi=1; ri < buffer.end ; ri+=entrySize, wi++) {
buffer.data.set(wi, buffer.data.get(ri));
}
buffer.end /= entrySize;
}
@Override
public boolean hasMore() {
return pos < endOffset;
return !reader.atEnd();
}
@Override
public String indexName() {
return name + ":" + Long.toHexString(wordId);

View File

@@ -2,16 +2,17 @@ package nu.marginalia.index;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.array.pool.BufferPool;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.positions.TermData;
import nu.marginalia.ffi.LinuxSystemCalls;
import nu.marginalia.index.positions.PositionsFileReader;
import nu.marginalia.index.query.EmptyEntrySource;
import nu.marginalia.index.query.EntrySource;
import nu.marginalia.index.query.ReverseIndexRejectFilter;
import nu.marginalia.index.query.ReverseIndexRetainFilter;
import nu.marginalia.index.positions.TermData;
import nu.marginalia.index.query.*;
import nu.marginalia.index.query.filter.QueryFilterLetThrough;
import nu.marginalia.index.query.filter.QueryFilterNoPass;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import nu.marginalia.skiplist.SkipListConstants;
import nu.marginalia.skiplist.SkipListReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -20,10 +21,12 @@ import java.lang.foreign.Arena;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.Executors;
import java.util.function.Consumer;
public class FullReverseIndexReader {
private final LongArray words;
private final LongArray documents;
private final long wordsDataOffset;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final BTreeReader wordsBTreeReader;
@@ -31,6 +34,8 @@ public class FullReverseIndexReader {
private final PositionsFileReader positionsFileReader;
private final BufferPool dataPool;
public FullReverseIndexReader(String name,
Path words,
Path documents,
@@ -44,6 +49,7 @@ public class FullReverseIndexReader {
this.documents = null;
this.wordsBTreeReader = null;
this.wordsDataOffset = -1;
this.dataPool = null;
return;
}
@@ -52,6 +58,11 @@ public class FullReverseIndexReader {
this.words = LongArrayFactory.mmapForReadingShared(words);
this.documents = LongArrayFactory.mmapForReadingShared(documents);
LinuxSystemCalls.madviseRandom(this.words.getMemorySegment());
LinuxSystemCalls.madviseRandom(this.documents.getMemorySegment());
dataPool = new BufferPool(documents, SkipListConstants.BLOCK_SIZE, (int) (Long.getLong("index.bufferPoolSize", 512*1024*1024L) / SkipListConstants.BLOCK_SIZE));
wordsBTreeReader = new BTreeReader(this.words, ReverseIndexParameters.wordsBTreeContext, 0);
wordsDataOffset = wordsBTreeReader.getHeader().dataOffsetLongs();
@@ -62,6 +73,11 @@ public class FullReverseIndexReader {
}
}
public void reset() {
dataPool.reset();
}
private void selfTest() {
logger.info("Running self test program");
@@ -76,6 +92,15 @@ public class FullReverseIndexReader {
ReverseIndexSelfTest.runSelfTest6(wordsDataRange, documents);
}
public void eachDocRange(Consumer<LongArray> eachDocRange) {
long wordsDataSize = wordsBTreeReader.getHeader().numEntries() * 2L;
var wordsDataRange = words.range(wordsDataOffset, wordsDataOffset + wordsDataSize);
for (long i = 1; i < wordsDataRange.size(); i+=2) {
var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i));
eachDocRange.accept(docsBTreeReader.data());
}
}
/** Calculate the offset of the word in the documents.
* If the return-value is negative, the term does not exist
@@ -101,27 +126,27 @@ public class FullReverseIndexReader {
if (offset < 0) // No documents
return new EmptyEntrySource();
return new FullIndexEntrySource(name, createReaderNew(offset), 2, termId);
return new FullIndexEntrySource(name, getReader(offset), termId);
}
/** Create a filter step requiring the specified termId to exist in the documents */
public QueryFilterStepIf also(long termId) {
public QueryFilterStepIf also(long termId, IndexSearchBudget budget) {
long offset = wordOffset(termId);
if (offset < 0) // No documents
return new QueryFilterNoPass();
return new ReverseIndexRetainFilter(createReaderNew(offset), name, termId);
return new ReverseIndexRetainFilter(getReader(offset), name, termId, budget);
}
/** Create a filter step requiring the specified termId to be absent from the documents */
public QueryFilterStepIf not(long termId) {
public QueryFilterStepIf not(long termId, IndexSearchBudget budget) {
long offset = wordOffset(termId);
if (offset < 0) // No documents
return new QueryFilterLetThrough();
return new ReverseIndexRejectFilter(createReaderNew(offset));
return new ReverseIndexRejectFilter(getReader(offset), budget);
}
/** Return the number of documents with the termId in the index */
@@ -131,15 +156,39 @@ public class FullReverseIndexReader {
if (offset < 0)
return 0;
return createReaderNew(offset).numEntries();
return getReader(offset).estimateSize();
}
/** Create a BTreeReader for the document offset associated with a termId */
private BTreeReader createReaderNew(long offset) {
return new BTreeReader(
documents,
ReverseIndexParameters.fullDocsBTreeContext,
offset);
private SkipListReader getReader(long offset) {
return new SkipListReader(dataPool, offset);
}
public TermData[] getTermData(Arena arena,
long[] termIds,
long[] docIds)
{
long[] offsetsAll = new long[termIds.length * docIds.length];
for (int i = 0; i < termIds.length; i++) {
long termId = termIds[i];
long offset = wordOffset(termId);
if (offset < 0) {
// This is likely a bug in the code, but we can't throw an exception here
logger.debug("Missing offset for word {}", termId);
continue;
}
var reader = getReader(offset);
// Read the size and offset of the position data
var offsetsForTerm = reader.getValueOffsets(docIds);
System.arraycopy(offsetsForTerm, 0, offsetsAll, i * docIds.length, docIds.length);
}
return positionsFileReader.getTermData(arena, offsetsAll);
}
public TermData[] getTermData(Arena arena,
@@ -156,20 +205,22 @@ public class FullReverseIndexReader {
return ret;
}
var reader = createReaderNew(offset);
var reader = getReader(offset);
// Read the size and offset of the position data
var offsets = reader.queryData(docIds, 1);
var offsets = reader.getValueOffsets(docIds);
for (int i = 0; i < docIds.length; i++) {
if (offsets[i] == 0)
continue;
ret[i] = positionsFileReader.getTermData(arena, offsets[i]);
}
return ret;
return positionsFileReader.getTermData(arena, offsets);
}
public void close() {
try {
dataPool.close();
}
catch (Exception e) {
logger.warn("Error while closing bufferPool", e);
}
if (documents != null)
documents.close();
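Note the flattened, row-major layout of offsetsAll in the aggregated getTermData above: the value offset for term i and document j lands at index i * docIds.length + j, which is what lets a single positions-file read, and hence one io_uring submission batch, cover every (term, document) pair. Schematically (a hypothetical accessor, not part of the class):

    // offsetsAll[i * docCount + j] == encoded offset for (termIds[i], docIds[j])
    static long offsetFor(long[] offsetsAll, int i, int j, int docCount) {
        return offsetsAll[i * docCount + j];
    }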

View File

@@ -13,7 +13,7 @@ import java.nio.channels.FileChannel;
public class PrioIndexEntrySource implements EntrySource {
private final String name;
private final ByteBuffer readData = ByteBuffer.allocate(1024);
private final ByteBuffer readData = ByteBuffer.allocate(8*1024);
private final BitReader bitReader = new BitReader(readData, this::fillReadBuffer);
private final FileChannel docsFileChannel;

View File

@@ -3,6 +3,7 @@ package nu.marginalia.index;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.ffi.LinuxSystemCalls;
import nu.marginalia.index.query.EmptyEntrySource;
import nu.marginalia.index.query.EntrySource;
import org.slf4j.Logger;
@@ -40,6 +41,8 @@ public class PrioReverseIndexReader {
this.words = LongArrayFactory.mmapForReadingShared(words);
LinuxSystemCalls.madviseRandom(this.words.getMemorySegment());
wordsBTreeReader = new BTreeReader(this.words, ReverseIndexParameters.wordsBTreeContext, 0);
wordsDataOffset = wordsBTreeReader.getHeader().dataOffsetLongs();

View File

@@ -5,7 +5,7 @@ import nu.marginalia.btree.model.BTreeContext;
public class ReverseIndexParameters
{
public static final BTreeContext prioDocsBTreeContext = new BTreeContext(5, 1, BTreeBlockSize.BS_2048);
public static final BTreeContext fullDocsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048);
public static final BTreeContext wordsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048);
public static final BTreeContext prioDocsBTreeContext = new BTreeContext(5, 1, BTreeBlockSize.BS_512);
public static final BTreeContext fullDocsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_512);
public static final BTreeContext wordsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_512);
}

View File

@@ -14,62 +14,103 @@ import java.nio.file.StandardOpenOption;
*
* The positions data is concatenated in the file, with each term's metadata
* followed by its positions. The metadata is a single byte, and the positions
* are encoded using the Elias Gamma code, with zero padded bits at the end to
* get octet alignment.
*
* are encoded as varints.
* <p></p>
*
* It is the responsibility of the caller to keep track of the byte offset of
* each posting in the file.
*/
public class PositionsFileConstructor implements AutoCloseable {
private final ByteBuffer workBuffer = ByteBuffer.allocate(65536);
private final Path file;
private final FileChannel channel;
private long offset;
public PositionsFileConstructor(Path file) throws IOException {
this.file = file;
channel = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE);
}
/** Represents a block of positions lists. Each writer thread should hold on to
* a block object to ensure the locality of its positions lists.
* When finished, commit() must be run.
* */
public class PositionsFileBlock {
private final ByteBuffer workBuffer = ByteBuffer.allocate(1024*1024*16);
private long position;
public PositionsFileBlock(long position) {
this.position = position;
}
public boolean fitsData(int size) {
return workBuffer.remaining() >= size;
}
public void commit() throws IOException {
workBuffer.position(0);
workBuffer.limit(workBuffer.capacity());
while (workBuffer.hasRemaining()) {
channel.write(workBuffer, this.position + workBuffer.position());
}
}
private void relocate() throws IOException {
workBuffer.clear();
position = channel.position();
while (workBuffer.hasRemaining()) {
channel.write(workBuffer);
}
workBuffer.clear();
}
public long position() {
return this.position + workBuffer.position();
}
public void put(byte b) {
workBuffer.put(b);
}
public void put(ByteBuffer buffer) {
workBuffer.put(buffer);
}
}
public PositionsFileBlock getBlock() throws IOException {
synchronized (this) {
var block = new PositionsFileBlock(channel.position());
block.relocate();
return block;
}
}
/** Add a term to the positions file
*
* @param block a block token to ensure data locality
* @param termMeta the term metadata
* @param positionsBuffer the positions of the term
*
* @return the offset of the term in the file, with the size of the data in the highest byte
*/
public long add(byte termMeta, ByteBuffer positionsBuffer) throws IOException {
synchronized (file) {
int size = 1 + positionsBuffer.remaining();
public long add(PositionsFileBlock block, byte termMeta, ByteBuffer positionsBuffer) throws IOException {
int size = 1 + positionsBuffer.remaining();
if (workBuffer.remaining() < size) {
workBuffer.flip();
channel.write(workBuffer);
workBuffer.clear();
if (!block.fitsData(size)) {
synchronized (this) {
block.commit();
block.relocate();
}
}
synchronized (file) {
long offset = block.position();
workBuffer.put(termMeta);
workBuffer.put(positionsBuffer);
block.put(termMeta);
block.put(positionsBuffer);
long ret = PositionCodec.encode(size, offset);
offset += size;
return ret;
return PositionCodec.encode(size, offset);
}
}
public void close() throws IOException {
if (workBuffer.hasRemaining()) {
workBuffer.flip();
while (workBuffer.hasRemaining())
channel.write(workBuffer);
}
channel.force(false);
channel.close();
}
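The sizeEncodedOffset returned by add() packs the record's byte length into the highest byte of its file offset. A minimal sketch of what PositionCodec is assumed to do (the real class may differ in detail):

    // Hypothetical equivalent of PositionCodec; assumes size fits in one byte
    static long encode(int size, long offset) {
        return ((long) size << 56) | offset;
    }
    static int decodeSize(long sizeEncodedOffset) {
        return (int) (sizeEncodedOffset >>> 56);
    }
    static long decodeOffset(long sizeEncodedOffset) {
        return sizeEncodedOffset & 0x00FF_FFFF_FFFF_FFFFL;
    }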

View File

@@ -1,46 +0,0 @@
package nu.marginalia.index.construction.full;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.algo.LongArrayTransformations;
import nu.marginalia.btree.BTreeWriter;
import nu.marginalia.btree.model.BTreeContext;
import java.io.IOException;
/** Constructs the BTrees in a reverse index */
public class FullIndexBTreeTransformer implements LongArrayTransformations.LongIOTransformer {
private final BTreeWriter writer;
private final int entrySize;
private final LongArray documentsArray;
long start = 0;
long writeOffset = 0;
public FullIndexBTreeTransformer(LongArray urlsFileMap,
int entrySize,
BTreeContext bTreeContext,
LongArray documentsArray) {
this.documentsArray = documentsArray;
this.writer = new BTreeWriter(urlsFileMap, bTreeContext);
this.entrySize = entrySize;
}
@Override
public long transform(long pos, long end) throws IOException {
final int size = (int) ((end - start) / entrySize);
if (size == 0) {
return -1;
}
final long offsetForBlock = writeOffset;
writeOffset += writer.write(writeOffset, size,
mapRegion -> mapRegion.transferFrom(documentsArray, start, 0, end - start)
);
start = end;
return offsetForBlock;
}
}

View File

@@ -0,0 +1,40 @@
package nu.marginalia.index.construction.full;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.algo.LongArrayTransformations;
import nu.marginalia.skiplist.SkipListWriter;
import java.io.IOException;
import java.nio.file.Path;
/** Constructs the skip lists in a reverse index */
public class FullIndexSkipListTransformer implements LongArrayTransformations.LongIOTransformer, AutoCloseable {
private final SkipListWriter writer;
private final LongArray documentsArray;
long start = 0;
public FullIndexSkipListTransformer(Path docsOutputFile,
LongArray documentsArray) throws IOException {
this.documentsArray = documentsArray;
this.writer = new SkipListWriter(docsOutputFile);
}
@Override
public long transform(long pos, long end) throws IOException {
final int size = (int) ((end - start) / 2);
if (size == 0) {
return -1;
}
long offset = writer.writeList(documentsArray, start, size);
start = end;
return offset;
}
public void close() throws IOException {
writer.close();
}
}

View File

@@ -6,7 +6,6 @@ import nu.marginalia.btree.BTreeWriter;
import nu.marginalia.index.ReverseIndexParameters;
import nu.marginalia.index.construction.CountToOffsetTransformer;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.IndexSizeEstimator;
import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.journal.IndexJournalPage;
import org.slf4j.Logger;
@@ -81,15 +80,11 @@ public class FullPreindex {
// Estimate the size of the docs index data
offsets.transformEach(0, offsets.size(), new CountToOffsetTransformer(2));
IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(ReverseIndexParameters.fullDocsBTreeContext, 2);
offsets.fold(0, 0, offsets.size(), sizeEstimator);
// Write the docs file
LongArray finalDocs = LongArrayFactory.mmapForWritingConfined(outputFileDocs, sizeEstimator.size);
offsets.transformEachIO(0, offsets.size(),
new FullIndexBTreeTransformer(finalDocs, 2,
ReverseIndexParameters.fullDocsBTreeContext,
documents.documents));
try (var transformer = new FullIndexSkipListTransformer(outputFileDocs, documents.documents)) {
offsets.transformEachIO(0, offsets.size(), transformer);
}
LongArray wordIds = segments.wordIds;
@@ -102,7 +97,7 @@ public class FullPreindex {
// Estimate the size of the words index data
long wordsSize = ReverseIndexParameters.wordsBTreeContext.calculateSize((int) offsets.size());
// Construct the tree
// Construct the keywords tree
LongArray wordsArray = LongArrayFactory.mmapForWritingConfined(outputFileWords, wordsSize);
new BTreeWriter(wordsArray, ReverseIndexParameters.wordsBTreeContext)
@@ -113,8 +108,6 @@ public class FullPreindex {
}
});
finalDocs.force();
finalDocs.close();
wordsArray.force();
wordsArray.close();

View File

@@ -12,10 +12,8 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.List;
/** A LongArray with document data, segmented according to
@@ -52,11 +50,6 @@ public class FullPreindexDocuments {
return new FullPreindexDocuments(docsFileMap, docsFile);
}
public FileChannel createDocumentsFileChannel() throws IOException {
return (FileChannel) Files.newByteChannel(file, StandardOpenOption.READ);
}
public LongArray slice(long start, long end) {
return documents.range(start, end);
}
@@ -86,6 +79,8 @@ public class FullPreindexDocuments {
var offsetMap = segments.asMap(RECORD_SIZE_LONGS);
offsetMap.defaultReturnValue(0);
var positionsBlock = positionsFileConstructor.getBlock();
while (docIds.hasRemaining()) {
long docId = docIds.get();
long rankEncodedId = docIdRewriter.rewriteDocId(docId);
@@ -101,12 +96,13 @@ public class FullPreindexDocuments {
ByteBuffer pos = tPos.get(i);
long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS);
long encodedPosOffset = positionsFileConstructor.add(meta, pos);
long encodedPosOffset = positionsFileConstructor.add(positionsBlock, meta, pos);
assembly.put(offset + 0, rankEncodedId);
assembly.put(offset + 1, encodedPosOffset);
}
}
positionsBlock.commit();
assembly.write(docsFile);
}

View File

@@ -1,43 +1,72 @@
package nu.marginalia.index.positions;
import nu.marginalia.uring.UringFileReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.channels.FileChannel;
import java.lang.foreign.MemorySegment;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.List;
/** Reads positions data from the positions file */
public class PositionsFileReader implements AutoCloseable {
private final FileChannel positions;
private final UringFileReader uringFileReader;
private static final Logger logger = LoggerFactory.getLogger(PositionsFileReader.class);
public PositionsFileReader(Path positionsFile) throws IOException {
this.positions = FileChannel.open(positionsFile, StandardOpenOption.READ);
if ((Files.size(positionsFile) & 4095) != 0) {
throw new IllegalArgumentException("Positions file is not block aligned in size: " + Files.size(positionsFile));
}
uringFileReader = new UringFileReader(positionsFile, true);
}
/** Get the positions for a term in the index, as pointed out by the encoded offset;
* intermediate buffers are allocated from the provided arena allocator. */
public TermData getTermData(Arena arena, long sizeEncodedOffset) {
int length = PositionCodec.decodeSize(sizeEncodedOffset);
long offset = PositionCodec.decodeOffset(sizeEncodedOffset);
var segment = arena.allocate(length);
var buffer = segment.asByteBuffer();
try {
positions.read(buffer, offset);
} catch (IOException e) {
throw new RuntimeException(e);
}
return new TermData(buffer);
}
@Override
public void close() throws IOException {
positions.close();
uringFileReader.close();
}
/** Get the positions for a keywords in the index, as pointed out by the encoded offsets;
* intermediate buffers are allocated from the provided arena allocator. */
public TermData[] getTermData(Arena arena, long[] offsets) {
int cnt = 0;
for (int i = 0; i < offsets.length; i++) {
long encodedOffset = offsets[i];
if (encodedOffset == 0) continue;
cnt++;
}
if (cnt == 0) {
return new TermData[offsets.length];
}
long[] readOffsets = new long[cnt];
int[] readSizes = new int[cnt];
for (int i = 0, j = 0; i < offsets.length; i++) {
long encodedOffset = offsets[i];
if (encodedOffset == 0) continue;
readSizes[j] = PositionCodec.decodeSize(encodedOffset);
readOffsets[j] = PositionCodec.decodeOffset(encodedOffset);
j++;
}
List<MemorySegment> buffers = uringFileReader.readUnalignedInDirectMode(arena, readOffsets, readSizes, 4096);
TermData[] ret = new TermData[offsets.length];
for (int i = 0, j=0; i < offsets.length; i++) {
long encodedOffset = offsets[i];
if (encodedOffset == 0) continue;
ret[i] = new TermData(buffers.get(j++).asByteBuffer());
}
return ret;
}
}
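readUnalignedInDirectMode has to translate each unaligned (offset, size) pair into a block-aligned O_DIRECT read and then slice out the requested bytes. A sketch of the alignment arithmetic for one read, assuming a 4096-byte block size (not the actual implementation):

    long alignedStart = offset & -4096L;               // round down to block boundary
    int lead = (int) (offset - alignedStart);          // bytes to skip at the front
    int alignedLength = (lead + size + 4095) & -4096;  // round up to whole blocks
    MemorySegment block = arena.allocate(alignedLength, 4096);
    // ... submit an io_uring read of alignedLength bytes at alignedStart ...
    MemorySegment data = block.asSlice(lead, size);    // the requested range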

View File

@@ -1,24 +1,22 @@
package nu.marginalia.index.query;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import nu.marginalia.skiplist.SkipListReader;
public record ReverseIndexRejectFilter(BTreeReader range) implements QueryFilterStepIf {
public record ReverseIndexRejectFilter(SkipListReader range, IndexSearchBudget budget) implements QueryFilterStepIf {
@Override
public void apply(LongQueryBuffer buffer) {
range.rejectEntries(buffer);
while (budget.hasTimeLeft() && range.tryRejectData(buffer));
buffer.finalizeFiltering();
}
public boolean test(long id) {
return range.findEntry(id) < 0;
}
@Override
public double cost() {
return range.numEntries();
return 1;
}
@Override

View File

@@ -1,24 +1,21 @@
package nu.marginalia.index.query;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import nu.marginalia.skiplist.SkipListReader;
public record ReverseIndexRetainFilter(BTreeReader range, String name, long wordId) implements QueryFilterStepIf {
public record ReverseIndexRetainFilter(SkipListReader range, String name, long wordId, IndexSearchBudget budget) implements QueryFilterStepIf {
@Override
public void apply(LongQueryBuffer buffer) {
range.retainEntries(buffer);
buffer.finalizeFiltering();
while (budget.hasTimeLeft() && range.tryRetainData(buffer));
buffer.finalizeFiltering();
}
public boolean test(long id) {
return range.findEntry(id) >= 0;
}
@Override
public double cost() {
return range.numEntries();
return 1;
}
@Override
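
Both filters now bound their work by the search budget: they keep pulling skip-list blocks only while time remains, so a large posting list can no longer blow through the query deadline. A minimal sketch of a deadline-style budget, assuming IndexSearchBudget wraps a wall-clock deadline (the real class may differ):

    public class IndexSearchBudget {
        private final long deadline;
        public IndexSearchBudget(long budgetMs) {
            this.deadline = System.currentTimeMillis() + budgetMs;
        }
        public boolean hasTimeLeft() { return System.currentTimeMillis() < deadline; }
        public long timeLeft()      { return Math.max(0, deadline - System.currentTimeMillis()); }
    }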

View File

@@ -11,7 +11,6 @@ import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -32,32 +31,32 @@ class PositionsFileReaderTest {
@Test
void getTermData() throws IOException {
ByteBuffer workArea = ByteBuffer.allocate(8192);
long key1, key2, key3;
try (PositionsFileConstructor constructor = new PositionsFileConstructor(file)) {
key1 = constructor.add((byte) 43, VarintCodedSequence.generate(1, 2, 3).buffer());
key2 = constructor.add((byte) 51, VarintCodedSequence.generate(2, 3, 5, 1000, 5000, 20241).buffer());
key3 = constructor.add((byte) 61, VarintCodedSequence.generate(3, 5, 7).buffer());
var block = constructor.getBlock();
key1 = constructor.add(block, (byte) 43, VarintCodedSequence.generate(1, 2, 3).buffer());
key2 = constructor.add(block, (byte) 51, VarintCodedSequence.generate(2, 3, 5, 1000, 5000, 20241).buffer());
key3 = constructor.add(block, (byte) 61, VarintCodedSequence.generate(3, 5, 7).buffer());
block.commit();
}
System.out.println("key1: " + Long.toHexString(key1));
System.out.println("key2: " + Long.toHexString(key2));
System.out.println("key3: " + Long.toHexString(key3));
try (Arena arena = Arena.ofConfined();
try (Arena arena = Arena.ofShared();
PositionsFileReader reader = new PositionsFileReader(file))
{
TermData data1 = reader.getTermData(arena, key1);
assertEquals(43, data1.flags());
assertEquals(IntList.of( 1, 2, 3), data1.positions().values());
TermData[] data = reader.getTermData(arena, new long[] { key1, key2, key3 });
TermData data2 = reader.getTermData(arena, key2);
assertEquals(51, data2.flags());
assertEquals(IntList.of(2, 3, 5, 1000, 5000, 20241), data2.positions().values());
assertEquals(43, data[0].flags());
assertEquals(IntList.of( 1, 2, 3), data[0].positions().values());
TermData data3 = reader.getTermData(arena, key3);
assertEquals(61, data3.flags());
assertEquals(IntList.of(3, 5, 7), data3.positions().values());
assertEquals(51, data[1].flags());
assertEquals(IntList.of(2, 3, 5, 1000, 5000, 20241), data[1].positions().values());
assertEquals(61, data[2].flags());
assertEquals(IntList.of(3, 5, 7), data[2].positions().values());
}
}
}

View File

@@ -1,149 +0,0 @@
package nu.marginalia.index.construction.full;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.btree.model.BTreeHeader;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import static nu.marginalia.index.construction.full.TestJournalFactory.EntryDataWithWordMeta;
import static nu.marginalia.index.construction.full.TestJournalFactory.wm;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
class FullPreindexFinalizeTest {
TestJournalFactory journalFactory;
Path positionsFile;
Path countsFile;
Path wordsIdFile;
Path docsFile;
Path tempDir;
@BeforeEach
public void setUp() throws IOException {
journalFactory = new TestJournalFactory();
positionsFile = Files.createTempFile("positions", ".dat");
countsFile = Files.createTempFile("counts", ".dat");
wordsIdFile = Files.createTempFile("words", ".dat");
docsFile = Files.createTempFile("docs", ".dat");
tempDir = Files.createTempDirectory("sort");
}
@AfterEach
public void tearDown() throws IOException {
journalFactory.clear();
Files.deleteIfExists(countsFile);
Files.deleteIfExists(wordsIdFile);
List<Path> contents = new ArrayList<>();
Files.list(tempDir).forEach(contents::add);
for (var tempFile : contents) {
Files.delete(tempFile);
}
Files.delete(tempDir);
}
MurmurHash3_128 hash = new MurmurHash3_128();
long termId(String keyword) {
return hash.hashKeyword(keyword);
}
@Test
public void testFinalizeSimple() throws IOException {
var reader = journalFactory.createReader(new EntryDataWithWordMeta(100, 101, wm(50, 51)));
var preindex = FullPreindex.constructPreindex(reader,
new PositionsFileConstructor(positionsFile),
DocIdRewriter.identity(), tempDir);
preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat"));
preindex.delete();
Path wordsFile = tempDir.resolve("words.dat");
Path docsFile = tempDir.resolve("docs.dat");
assertTrue(Files.exists(wordsFile));
assertTrue(Files.exists(docsFile));
System.out.println(Files.size(wordsFile));
System.out.println(Files.size(docsFile));
var docsArray = LongArrayFactory.mmapForReadingConfined(docsFile);
var wordsArray = LongArrayFactory.mmapForReadingConfined(wordsFile);
var docsHeader = new BTreeHeader(docsArray, 0);
var wordsHeader = new BTreeHeader(wordsArray, 0);
assertEquals(1, docsHeader.numEntries());
assertEquals(1, wordsHeader.numEntries());
assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
assertEquals(termId("50"), wordsArray.get(wordsHeader.dataOffsetLongs()));
}
@Test
public void testFinalizeSimple2x2() throws IOException {
var reader = journalFactory.createReader(
new EntryDataWithWordMeta(100, 101, wm(50, 51)),
new EntryDataWithWordMeta(101, 101, wm(51, 52))
);
var preindex = FullPreindex.constructPreindex(reader,
new PositionsFileConstructor(positionsFile),
DocIdRewriter.identity(), tempDir);
preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat"));
preindex.delete();
Path wordsFile = tempDir.resolve("words.dat");
Path docsFile = tempDir.resolve("docs.dat");
assertTrue(Files.exists(wordsFile));
assertTrue(Files.exists(docsFile));
System.out.println(Files.size(wordsFile));
System.out.println(Files.size(docsFile));
var docsArray = LongArrayFactory.mmapForReadingConfined(docsFile);
var wordsArray = LongArrayFactory.mmapForReadingConfined(wordsFile);
var wordsHeader = new BTreeHeader(wordsArray, 0);
System.out.println(wordsHeader);
assertEquals(2, wordsHeader.numEntries());
long offset1 = wordsArray.get(wordsHeader.dataOffsetLongs() + 1);
long offset2 = wordsArray.get(wordsHeader.dataOffsetLongs() + 3);
assertEquals(termId("50"), wordsArray.get(wordsHeader.dataOffsetLongs()));
assertEquals(termId("50"), wordsArray.get(wordsHeader.dataOffsetLongs()));
BTreeHeader docsHeader;
docsHeader = new BTreeHeader(docsArray, offset1);
System.out.println(docsHeader);
assertEquals(1, docsHeader.numEntries());
assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
docsHeader = new BTreeHeader(docsArray, offset2);
System.out.println(docsHeader);
assertEquals(1, docsHeader.numEntries());
assertEquals(101, docsArray.get(docsHeader.dataOffsetLongs() + 0));
}
}

View File

@@ -7,24 +7,13 @@ import io.grpc.stub.StreamObserver;
import io.prometheus.client.Counter;
import io.prometheus.client.Gauge;
import io.prometheus.client.Histogram;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import nu.marginalia.api.searchquery.IndexApiGrpc;
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
import nu.marginalia.api.searchquery.RpcIndexQuery;
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.model.SearchTerms;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.results.IndexResultRankingService;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.searchset.SearchSet;
import nu.marginalia.index.searchset.SearchSetsService;
import nu.marginalia.index.searchset.SmallSearchSet;
@@ -35,14 +24,7 @@ import org.slf4j.LoggerFactory;
import org.slf4j.Marker;
import org.slf4j.MarkerFactory;
import java.util.BitSet;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.Executor;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
@Singleton
public class IndexGrpcService
@@ -88,23 +70,22 @@ public class IndexGrpcService
private final StatefulIndex statefulIndex;
private final SearchSetsService searchSetsService;
private final IndexResultRankingService resultValuator;
private final IndexResultRankingService rankingService;
private final String nodeName;
private static final int indexValuationThreads = Integer.getInteger("index.valuationThreads", 16);
@Inject
public IndexGrpcService(ServiceConfiguration serviceConfiguration,
StatefulIndex statefulIndex,
SearchSetsService searchSetsService,
IndexResultRankingService resultValuator)
IndexResultRankingService rankingService)
{
var nodeId = serviceConfiguration.node();
this.nodeName = Integer.toString(nodeId);
this.statefulIndex = statefulIndex;
this.searchSetsService = searchSetsService;
this.resultValuator = resultValuator;
this.rankingService = rankingService;
}
// GRPC endpoint
@@ -121,7 +102,13 @@ public class IndexGrpcService
.time(() -> {
// Perform the search
try {
return executeSearch(params);
if (!statefulIndex.isLoaded()) {
// Short-circuit if the index is not loaded, as we trivially know that there can be no results
return List.of();
}
return new IndexQueryExecution(params, rankingService, statefulIndex.get()).run();
}
catch (Exception ex) {
logger.error("Error in handling request", ex);
@@ -157,7 +144,12 @@ public class IndexGrpcService
// exists for test access
public List<RpcDecoratedResultItem> justQuery(SearchSpecification specsSet) {
try {
return executeSearch(new SearchParameters(specsSet, getSearchSet(specsSet)));
if (!statefulIndex.isLoaded()) {
// Short-circuit if the index is not loaded, as we trivially know that there can be no results
return List.of();
}
return new IndexQueryExecution(new SearchParameters(specsSet, getSearchSet(specsSet)), rankingService, statefulIndex.get()).run();
}
catch (Exception ex) {
logger.error("Error in handling request", ex);
@@ -183,262 +175,6 @@ public class IndexGrpcService
return searchSetsService.getSearchSetByName(request.getSearchSetIdentifier());
}
// accessible for tests
public List<RpcDecoratedResultItem> executeSearch(SearchParameters params) throws Exception {
if (!statefulIndex.isLoaded()) {
// Short-circuit if the index is not loaded, as we trivially know that there can be no results
return List.of();
}
ResultRankingContext rankingContext = createRankingContext(params.rankingParams,
params.compiledQuery,
params.compiledQueryIds);
var queryExecution = new QueryExecution(rankingContext, params.fetchSize);
List<RpcDecoratedResultItem> ret = queryExecution.run(params);
wmsa_index_query_exec_block_time
.labels(nodeName)
.set(queryExecution.getBlockTime() / 1000.);
wmsa_index_query_exec_stall_time
.labels(nodeName)
.set(queryExecution.getStallTime() / 1000.);
return ret;
}
/** This class is responsible for ranking the results and adding the best results to the
* resultHeap, which depending on the state of the indexLookup threads may or may not block
*/
private ResultRankingContext createRankingContext(RpcResultRankingParameters rankingParams,
CompiledQuery<String> compiledQuery,
CompiledQueryLong compiledQueryIds)
{
int[] full = new int[compiledQueryIds.size()];
int[] prio = new int[compiledQueryIds.size()];
BitSet ngramsMask = new BitSet(compiledQuery.size());
BitSet regularMask = new BitSet(compiledQuery.size());
var currentIndex = statefulIndex.get();
for (int idx = 0; idx < compiledQueryIds.size(); idx++) {
long id = compiledQueryIds.at(idx);
full[idx] = currentIndex.numHits(id);
prio[idx] = currentIndex.numHitsPrio(id);
if (compiledQuery.at(idx).contains("_")) {
ngramsMask.set(idx);
}
else {
regularMask.set(idx);
}
}
return new ResultRankingContext(currentIndex.totalDocCount(),
rankingParams,
ngramsMask,
regularMask,
new CqDataInt(full),
new CqDataInt(prio));
}
/** This class is responsible for executing a search query. It uses a thread pool to
* execute the subqueries and their valuation in parallel. The results are then combined
* into a bounded priority queue, and finally the best results are returned.
*/
private class QueryExecution {
private static final Executor workerPool = Executors.newCachedThreadPool();
/** The queue where the results from the index lookup threads are placed,
* pending ranking by the result ranker threads */
private final ArrayBlockingQueue<CombinedDocIdList> resultCandidateQueue
= new ArrayBlockingQueue<>(64);
private final ResultPriorityQueue resultHeap;
private final ResultRankingContext resultRankingContext;
private final AtomicInteger remainingIndexTasks = new AtomicInteger(0);
private final AtomicInteger remainingValuationTasks = new AtomicInteger(0);
private final AtomicLong blockTime = new AtomicLong(0);
private final AtomicLong stallTime = new AtomicLong(0);
public long getStallTime() {
return stallTime.get();
}
public long getBlockTime() {
return blockTime.get();
}
private QueryExecution(ResultRankingContext resultRankingContext, int maxResults) {
this.resultRankingContext = resultRankingContext;
this.resultHeap = new ResultPriorityQueue(maxResults);
}
/** Execute a search query */
public List<RpcDecoratedResultItem> run(SearchParameters parameters) throws Exception {
var terms = new SearchTerms(parameters.query, parameters.compiledQueryIds);
var currentIndex = statefulIndex.get();
for (var indexQuery : currentIndex.createQueries(terms, parameters.queryParams)) {
workerPool.execute(new IndexLookup(indexQuery, parameters.budget));
}
for (int i = 0; i < indexValuationThreads; i++) {
workerPool.execute(new ResultRanker(parameters, resultRankingContext));
}
// Wait for all tasks to complete
awaitCompletion();
// Return the best results
return resultValuator.selectBestResults(parameters, resultRankingContext, resultHeap);
}
/** Wait for all tasks to complete */
private void awaitCompletion() throws InterruptedException {
synchronized (remainingValuationTasks) {
while (remainingValuationTasks.get() > 0) {
remainingValuationTasks.wait(20);
}
}
}
/** This class is responsible for executing a subquery and adding the results to the
* resultCandidateQueue, which depending on the state of the valuator threads may
* or may not block */
class IndexLookup implements Runnable {
private final IndexQuery query;
private final IndexSearchBudget budget;
IndexLookup(IndexQuery query,
IndexSearchBudget budget) {
this.query = query;
this.budget = budget;
remainingIndexTasks.incrementAndGet();
}
public void run() {
try {
executeSearch();
}
catch (Exception ex) {
logger.error("Error in index lookup", ex);
}
finally {
synchronized (remainingIndexTasks) {
if (remainingIndexTasks.decrementAndGet() == 0) {
remainingIndexTasks.notifyAll();
}
}
}
}
private void executeSearch() {
final LongArrayList results = new LongArrayList(16);
// These queries are different indices for one subquery
final LongQueryBuffer buffer = new LongQueryBuffer(4096);
while (query.hasMore() && budget.hasTimeLeft())
{
buffer.reset();
query.getMoreResults(buffer);
for (int i = 0; i < buffer.end; i+=16) {
for (int j = 0; j < Math.min(buffer.end - i, 16); j++) {
results.add(buffer.data.get(i+j));
}
enqueueResults(new CombinedDocIdList(results));
results.clear();
}
}
buffer.dispose();
}
private void enqueueResults(CombinedDocIdList resultIds) {
long remainingTime = budget.timeLeft();
try {
if (!resultCandidateQueue.offer(resultIds)) {
long start = System.currentTimeMillis();
resultCandidateQueue.offer(resultIds, remainingTime, TimeUnit.MILLISECONDS);
blockTime.addAndGet(System.currentTimeMillis() - start);
}
}
catch (InterruptedException e) {
logger.warn("Interrupted while waiting to offer resultIds to queue", e);
}
}
}
class ResultRanker implements Runnable {
private final SearchParameters parameters;
private final ResultRankingContext rankingContext;
ResultRanker(SearchParameters parameters, ResultRankingContext rankingContext) {
this.parameters = parameters;
this.rankingContext = rankingContext;
remainingValuationTasks.incrementAndGet();
}
public void run() {
try {
while (parameters.budget.timeLeft() > 0 && execute());
}
catch (InterruptedException e) {
logger.warn("Interrupted while waiting to poll resultIds from queue", e);
}
catch (Exception e) {
logger.error("Exception while ranking results", e);
}
finally {
synchronized (remainingValuationTasks) {
if (remainingValuationTasks.decrementAndGet() == 0)
remainingValuationTasks.notifyAll();
}
}
}
private boolean execute() throws Exception {
long start = System.currentTimeMillis();
// Do a relatively short poll to ensure we terminate in a timely manner
// in the event all work is done
final long pollTime = Math.clamp(parameters.budget.timeLeft(), 1, 5);
CombinedDocIdList resultIds = resultCandidateQueue.poll(pollTime, TimeUnit.MILLISECONDS);
if (resultIds == null) {
// check if we are done and can terminate
if (remainingIndexTasks.get() == 0 && resultCandidateQueue.isEmpty()) {
return false;
}
}
else {
stallTime.addAndGet(System.currentTimeMillis() - start);
resultHeap.addAll(
resultValuator.rankResults(parameters, false, rankingContext, resultIds)
);
}
return true; // keep going
}
}
}
}

View File

@@ -0,0 +1,228 @@
package nu.marginalia.index;
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.model.ResultRankingContext;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.model.SearchTerms;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.results.IndexResultRankingService;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.skiplist.SkipListConstants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicLong;
/** Performs an index query */
public class IndexQueryExecution {
private static final int indexValuationThreads = Integer.getInteger("index.valuationThreads", 16);
private static final int indexPreparationThreads = Integer.getInteger("index.preparationThreads", 4);
// Since most NVMe drives have a maximum read size of 128 KB, and most small reads are 512B
// this should probably be 128*1024 / 512 = 256 to reduce queue depth and optimize tail latency
private static final int evaluationBatchSize = 256;
// This should probably be SkipListConstants.BLOCK_SIZE / 16 in order to reduce the number of unnecessary read
// operations per lookup and again optimize tail latency
private static final int lookupBatchSize = SkipListConstants.BLOCK_SIZE / 16;
private static final AtomicLong lookupTime = new AtomicLong();
private static final AtomicLong prepTime = new AtomicLong();
private static final AtomicLong valuationTime = new AtomicLong();
private static final ExecutorService threadPool = new ThreadPoolExecutor(indexValuationThreads, Integer.MAX_VALUE, 60L, TimeUnit.SECONDS, new SynchronousQueue<>());
private static final Logger log = LoggerFactory.getLogger(IndexQueryExecution.class);
private final IndexResultRankingService rankingService;
private final ResultRankingContext rankingContext;
private final List<IndexQuery> queries;
private final IndexSearchBudget budget;
private final ResultPriorityQueue resultHeap;
private final CountDownLatch lookupCountdown;
private final CountDownLatch preparationCountdown;
private final CountDownLatch rankingCountdown;
private final ArrayBlockingQueue<CombinedDocIdList> fullPreparationQueue = new ArrayBlockingQueue<>(8, true);
private final ArrayBlockingQueue<CombinedDocIdList> priorityPreparationQueue = new ArrayBlockingQueue<>(8, true);
private final ArrayBlockingQueue<IndexResultRankingService.RankingData> fullEvaluationQueue = new ArrayBlockingQueue<>(8, true);
private final ArrayBlockingQueue<IndexResultRankingService.RankingData> priorityEvaluationQueue = new ArrayBlockingQueue<>(8, true);
private final int limitTotal;
private final int limitByDomain;
static {
Thread.ofPlatform().daemon().start(() -> {
for (;;) {
try {
TimeUnit.SECONDS.sleep(10);
}
catch (InterruptedException e) {
e.printStackTrace();
break;
}
log.info("Lookup: {}, Valuation: {}, Prep Time: {}", lookupTime.get() / 1_000_000_000., valuationTime.get() / 1_000_000_000., prepTime.get() / 1_000_000_000.);
}
});
}
public IndexQueryExecution(SearchParameters params,
IndexResultRankingService rankingService,
CombinedIndexReader currentIndex) {
this.rankingService = rankingService;
resultHeap = new ResultPriorityQueue(params.fetchSize);
budget = params.budget;
limitByDomain = params.limitByDomain;
limitTotal = params.limitTotal;
rankingContext = ResultRankingContext.create(currentIndex, params);
queries = currentIndex.createQueries(new SearchTerms(params.query, params.compiledQueryIds), params.queryParams, budget);
lookupCountdown = new CountDownLatch(queries.size());
preparationCountdown = new CountDownLatch(indexPreparationThreads * 2);
rankingCountdown = new CountDownLatch(indexValuationThreads * 2);
}
public List<RpcDecoratedResultItem> run() throws InterruptedException, SQLException {
for (IndexQuery query : queries) {
threadPool.submit(() -> lookup(query));
}
for (int i = 0; i < indexPreparationThreads; i++) {
threadPool.submit(() -> prepare(priorityPreparationQueue, priorityEvaluationQueue));
threadPool.submit(() -> prepare(fullPreparationQueue, fullEvaluationQueue));
}
// Spawn ranking tasks for the two evaluation queues
for (int i = 0; i < indexValuationThreads; i++) {
threadPool.submit(() -> evaluate(priorityEvaluationQueue));
threadPool.submit(() -> evaluate(fullEvaluationQueue));
}
// Await termination of all pipeline stages
lookupCountdown.await();
preparationCountdown.await();
rankingCountdown.await();
// Deallocate any leftover ranking data buffers
for (var data : priorityEvaluationQueue) {
data.close();
}
for (var data : fullEvaluationQueue) {
data.close();
}
// Final result selection
return rankingService.selectBestResults(limitByDomain, limitTotal, rankingContext, resultHeap.toList());
}
private void lookup(IndexQuery query) {
final LongQueryBuffer buffer = new LongQueryBuffer(lookupBatchSize);
try {
while (query.hasMore() && budget.hasTimeLeft()) {
buffer.zero();
long st = System.nanoTime();
query.getMoreResults(buffer);
long et = System.nanoTime();
lookupTime.addAndGet(et - st);
if (buffer.isEmpty())
continue;
var queue = query.isPrioritized() ? priorityPreparationQueue : fullPreparationQueue;
if (buffer.end <= evaluationBatchSize) {
var docIds = new CombinedDocIdList(buffer);
if (!queue.offer(docIds, Math.max(1, budget.timeLeft()), TimeUnit.MILLISECONDS))
break;
}
else {
long[] bufferData = buffer.copyData();
for (int start = 0; start < bufferData.length; start+= evaluationBatchSize) {
long[] slice = Arrays.copyOfRange(bufferData, start,
Math.min(start + evaluationBatchSize, bufferData.length));
var docIds = new CombinedDocIdList(slice);
if (!queue.offer(docIds, Math.max(1, budget.timeLeft()), TimeUnit.MILLISECONDS))
break;
}
}
}
} catch (RuntimeException | InterruptedException ex) {
log.error("Exception in lookup thread", ex);
} finally {
buffer.dispose();
lookupCountdown.countDown();
}
}
private void prepare(ArrayBlockingQueue<CombinedDocIdList> inputQueue, ArrayBlockingQueue<IndexResultRankingService.RankingData> outputQueue) {
try {
while (budget.hasTimeLeft() && (lookupCountdown.getCount() > 0 || !inputQueue.isEmpty())) {
var docIds = inputQueue.poll(Math.clamp(budget.timeLeft(), 1, 5), TimeUnit.MILLISECONDS);
if (docIds == null) continue;
long st = System.nanoTime();
var preparedData = rankingService.prepareRankingData(rankingContext, docIds, budget);
long et = System.nanoTime();
prepTime.addAndGet(et - st);
if (!outputQueue.offer(preparedData, Math.max(1, budget.timeLeft()), TimeUnit.MILLISECONDS))
preparedData.close();
}
} catch (TimeoutException ex) {
// This is normal
} catch (Exception ex) {
if (!(ex.getCause() instanceof InterruptedException)) {
log.error("Exception in lookup thread", ex);
} // suppress logging for interrupted ex
} finally {
preparationCountdown.countDown();
}
}
private void evaluate(ArrayBlockingQueue<IndexResultRankingService.RankingData> queue) {
try {
while (budget.hasTimeLeft() && (preparationCountdown.getCount() > 0 || !queue.isEmpty())) {
var rankingData = queue.poll(Math.clamp(budget.timeLeft(), 1, 5), TimeUnit.MILLISECONDS);
if (rankingData == null) continue;
try (rankingData) {
long st = System.nanoTime();
resultHeap.addAll(rankingService.rankResults(budget, rankingContext, rankingData, false));
long et = System.nanoTime();
valuationTime.addAndGet(et - st);
}
}
} catch (Exception ex) {
if (!(ex.getCause() instanceof InterruptedException)) {
log.error("Exception in lookup thread", ex);
} // suppress logging for interrupted ex
} finally {
rankingCountdown.countDown();
}
}
public int itemsProcessed() {
return resultHeap.getItemsProcessed();
}
}
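The prepare() and evaluate() stages above terminate with the standard two-part drain test for a bounded pipeline: keep polling while the upstream stage is still running or its queue still holds work, and treat a poll timeout as a cue to re-check rather than exit. In isolation (all names hypothetical):

    // Sketch of a pipeline stage's drain loop
    while (upstreamCountdown.getCount() > 0 || !inputQueue.isEmpty()) {
        var item = inputQueue.poll(5, TimeUnit.MILLISECONDS);
        if (item == null) continue;   // timed out; re-check the drain condition
        process(item);                // stand-in for the real work
    }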

View File

@@ -1,5 +1,6 @@
package nu.marginalia.index;
import com.google.common.collect.MinMaxPriorityQueue;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import org.jetbrains.annotations.NotNull;
@@ -9,108 +10,52 @@ import java.util.*;
/** A priority queue for search results. This class is not thread-safe,
* in general, except for concurrent use of the addAll method.
* <p></p>
* The class implements a subset of the Collection interface, and
* is intended to be used as a priority queue for search results,
* with a maximum size.
* <p></p>
* Since the expected use case is to add a large number of items
* and then iterate over the items, the class is optimized for
* this scenario, and does not implement other mutating methods
* than addAll().
*/
public class ResultPriorityQueue implements Iterable<SearchResultItem>,
Collection<SearchResultItem> {
private final int limit;
private final ArrayList<SearchResultItem> backingList = new ArrayList<>();
public class ResultPriorityQueue implements Iterable<SearchResultItem> {
private final LongOpenHashSet idsInSet = new LongOpenHashSet();
private final MinMaxPriorityQueue<SearchResultItem> queue;
private int itemsProcessed = 0;
public ResultPriorityQueue(int limit) {
this.limit = limit;
this.queue = MinMaxPriorityQueue.<SearchResultItem>orderedBy(Comparator.naturalOrder()).maximumSize(limit).create();
}
public Iterator<SearchResultItem> iterator() {
return backingList.iterator();
return queue.iterator();
}
@NotNull
@Override
public Object[] toArray() {
return backingList.toArray();
}
@NotNull
@Override
public <T> T[] toArray(@NotNull T[] a) {
return backingList.toArray(a);
}
@Override
public boolean add(SearchResultItem searchResultItem) {
throw new UnsupportedOperationException("Use addAll instead");
}
@Override
public boolean remove(Object o) {
throw new UnsupportedOperationException();
}
@Override
public boolean containsAll(@NotNull Collection<?> c) {
return idsInSet.containsAll(c);
}
/** Adds all items to the queue, skipping any whose document id is already present.
* This is a thread-safe operation.
*/
@Override
public synchronized boolean addAll(@NotNull Collection<? extends SearchResultItem> items) {
boolean itemsAdded = false;
for (var item: items) {
if (idsInSet.add(item.getDocumentId())) {
backingList.add(item);
itemsAdded = true;
}
}
if (!itemsAdded) {
return false;
}
itemsProcessed += items.size();
backingList.sort(Comparator.naturalOrder());
if (backingList.size() > limit) {
backingList.subList(limit, backingList.size()).clear();
for (var item : items) {
if (idsInSet.add(item.getDocumentId())) {
queue.add(item);
}
}
return true;
}
@Override
public boolean removeAll(@NotNull Collection<?> c) {
throw new UnsupportedOperationException();
}
@Override
public boolean retainAll(@NotNull Collection<?> c) {
throw new UnsupportedOperationException();
}
@Override
public void clear() {
backingList.clear();
idsInSet.clear();
public synchronized List<SearchResultItem> toList() {
return new ArrayList<>(queue);
}
public int size() {
return backingList.size();
return queue.size();
}
public int getItemsProcessed() {
return itemsProcessed;
}
@Override
public boolean isEmpty() {
return backingList.isEmpty();
}
@Override
public boolean contains(Object o) {
return backingList.contains(o);
return queue.isEmpty();
}
}
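A minimal usage sketch of the rewritten queue, assuming the API as shown above (a bounded MinMaxPriorityQueue plus document-id de-duplication); rankedBatchA and rankedBatchB stand in for collections of SearchResultItem produced by ranking threads:

// Several ranking threads may call addAll concurrently (it is synchronized);
// a single thread snapshots the winners afterwards.
ResultPriorityQueue heap = new ResultPriorityQueue(10);
heap.addAll(rankedBatchA);                    // duplicate document ids are dropped via idsInSet
heap.addAll(rankedBatchB);                    // the queue retains at most the 10 best items
List<SearchResultItem> best = heap.toList();  // synchronized snapshot of the current contents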

View File

@@ -5,14 +5,17 @@ import it.unimi.dsi.fastutil.longs.LongList;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.FullReverseIndexReader;
import nu.marginalia.index.PrioReverseIndexReader;
import nu.marginalia.index.forward.ForwardIndexReader;
import nu.marginalia.index.forward.spans.DocumentSpans;
import nu.marginalia.index.model.QueryParams;
import nu.marginalia.index.model.SearchTerms;
import nu.marginalia.index.positions.TermData;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexQueryBuilder;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
@@ -25,6 +28,7 @@ import org.slf4j.LoggerFactory;
import java.lang.foreign.Arena;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.TimeUnit;
@@ -55,20 +59,19 @@ public class CombinedIndexReader {
return new IndexQueryBuilderImpl(reverseIndexFullReader, query);
}
public QueryFilterStepIf hasWordFull(long termId) {
return reverseIndexFullReader.also(termId);
public QueryFilterStepIf hasWordFull(long termId, IndexSearchBudget budget) {
return reverseIndexFullReader.also(termId, budget);
}
/** Creates a query builder for terms in the priority index */
public IndexQueryBuilder findPriorityWord(long wordId) {
return newQueryBuilder(new IndexQuery(reverseIndexPriorityReader.documents(wordId)))
return newQueryBuilder(new IndexQuery(reverseIndexPriorityReader.documents(wordId), true))
.withSourceTerms(wordId);
}
/** Creates a query builder for terms in the full index */
public IndexQueryBuilder findFullWord(long wordId) {
return newQueryBuilder(
new IndexQuery(reverseIndexFullReader.documents(wordId)))
return newQueryBuilder(new IndexQuery(reverseIndexFullReader.documents(wordId), false))
.withSourceTerms(wordId);
}
@@ -82,7 +85,12 @@ public class CombinedIndexReader {
return reverseIndexFullReader.numDocuments(word);
}
public List<IndexQuery> createQueries(SearchTerms terms, QueryParams params) {
/** Reset caches and buffers */
public void reset() {
reverseIndexFullReader.reset();
}
public List<IndexQuery> createQueries(SearchTerms terms, QueryParams params, IndexSearchBudget budget) {
if (!isLoaded()) {
logger.warn("Index reader not ready");
@@ -123,7 +131,7 @@ public class CombinedIndexReader {
continue;
}
head.addInclusionFilter(hasWordFull(termId));
head.addInclusionFilter(hasWordFull(termId, budget));
}
queryHeads.add(head);
}
@@ -132,7 +140,7 @@ public class CombinedIndexReader {
if (paths.size() < 4) {
var prioHead = findPriorityWord(elements.getLong(0));
for (int i = 1; i < elements.size(); i++) {
prioHead.addInclusionFilter(hasWordFull(elements.getLong(i)));
prioHead.addInclusionFilter(hasWordFull(elements.getLong(i), budget));
}
queryHeads.add(prioHead);
}
@@ -143,11 +151,11 @@ public class CombinedIndexReader {
// Advice terms are a special case, mandatory but not ranked, and exempt from re-writing
for (long term : terms.advice()) {
query = query.also(term);
query = query.also(term, budget);
}
for (long term : terms.excludes()) {
query = query.not(term);
query = query.not(term, budget);
}
// Run these filter steps last, as they'll worst-case cause as many page faults as there are
@@ -178,6 +186,18 @@ public class CombinedIndexReader {
}
/** Retrieves the term metadata for the specified word for the provided documents */
public TermMetadataList[] getTermMetadata(Arena arena,
long[] wordIds,
CombinedDocIdList docIds)
{
TermData[] combinedTermData = reverseIndexFullReader.getTermData(arena, wordIds, docIds.array());
TermMetadataList[] ret = new TermMetadataList[wordIds.length];
for (int i = 0; i < wordIds.length; i++) {
ret[i] = new TermMetadataList(Arrays.copyOfRange(combinedTermData, i*docIds.size(), (i+1)*docIds.size()));
}
return ret;
}
public TermMetadataList getTermMetadata(Arena arena,
long wordId,
CombinedDocIdList docIds)
@@ -205,14 +225,19 @@ public class CombinedIndexReader {
return forwardIndexReader.getDocumentSize(docId);
}
/** Retrieves the document spans for the specified document */
public DocumentSpans getDocumentSpans(Arena arena, long docId) {
return forwardIndexReader.getDocumentSpans(arena, docId);
/** Retrieves the document spans for the specified documents */
public DocumentSpans[] getDocumentSpans(Arena arena, CombinedDocIdList docIds) {
long[] decodedIDs = docIds.array();
for (int i = 0; i < decodedIDs.length; i++) {
decodedIDs[i] = UrlIdCodec.removeRank(decodedIDs[i]);
}
return forwardIndexReader.getDocumentSpans(arena, decodedIDs);
}
/** Close the indexes (this is not done immediately)
* */
public void close() throws InterruptedException {
public void close() {
/* Delay the invocation of close method to allow for a clean shutdown of the service.
*
* This is especially important when using Unsafe-based LongArrays, since we have
@@ -227,7 +252,7 @@ public class CombinedIndexReader {
}
private void delayedCall(Runnable call, Duration delay) throws InterruptedException {
private void delayedCall(Runnable call, Duration delay) {
Thread.ofPlatform().start(() -> {
try {
TimeUnit.SECONDS.sleep(delay.toSeconds());
@@ -248,25 +273,47 @@ public class CombinedIndexReader {
class ParamMatchingQueryFilter implements QueryFilterStepIf {
private final QueryParams params;
private final ForwardIndexReader forwardIndexReader;
private final boolean imposesMetaConstraint;
public ParamMatchingQueryFilter(QueryParams params,
ForwardIndexReader forwardIndexReader)
{
this.params = params;
this.forwardIndexReader = forwardIndexReader;
this.imposesMetaConstraint = params.imposesDomainMetadataConstraint();
}
@Override
public void apply(LongQueryBuffer buffer) {
if (!imposesMetaConstraint && !params.searchSet().imposesConstraint()) {
return;
}
while (buffer.hasMore()) {
if (test(buffer.currentValue())) {
buffer.retainAndAdvance();
}
else {
buffer.rejectAndAdvance();
}
}
buffer.finalizeFiltering();
}
public boolean test(long combinedId) {
long docId = UrlIdCodec.removeRank(combinedId);
int domainId = UrlIdCodec.getDomainId(docId);
long meta = forwardIndexReader.getDocMeta(docId);
if (!validateDomain(domainId, meta)) {
if (!validateDomain(domainId)) {
return false;
}
if (!imposesMetaConstraint) {
return true;
}
long meta = forwardIndexReader.getDocMeta(docId);
if (!validateQuality(meta)) {
return false;
}
@@ -286,8 +333,8 @@ class ParamMatchingQueryFilter implements QueryFilterStepIf {
return true;
}
private boolean validateDomain(int domainId, long meta) {
return params.searchSet().contains(domainId, meta);
private boolean validateDomain(int domainId) {
return params.searchSet().contains(domainId);
}
private boolean validateQuality(long meta) {
@@ -338,4 +385,5 @@ class ParamMatchingQueryFilter implements QueryFilterStepIf {
public String describe() {
return getClass().getSimpleName();
}
}
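The batched getTermMetadata call above fetches the whole (term, document) matrix in a single read and slices it row-major, which is what the Arrays.copyOfRange bounds rely on. A small illustration of the layout, for two terms and three documents:

// combinedTermData layout for wordIds = {t0, t1} and docIds of size 3:
//   index:  0        1        2        3        4        5
//   entry:  (t0,d0)  (t0,d1)  (t0,d2)  (t1,d0)  (t1,d1)  (t1,d2)
// so ret[1] = combinedTermData[3..6) is the TermMetadataList for t1;
// in general the entry for term ti and document di sits at ti * docIds.size() + di.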

View File

@@ -1,11 +1,10 @@
package nu.marginalia.index.index;
import java.util.List;
import gnu.trove.set.hash.TLongHashSet;
import nu.marginalia.index.FullReverseIndexReader;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexQueryBuilder;
import nu.marginalia.index.query.filter.QueryFilterAnyOf;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
public class IndexQueryBuilderImpl implements IndexQueryBuilder {
@@ -32,18 +31,18 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder {
return this;
}
public IndexQueryBuilder also(long termId) {
public IndexQueryBuilder also(long termId, IndexSearchBudget budget) {
if (alreadyConsideredTerms.add(termId)) {
query.addInclusionFilter(reverseIndexFullReader.also(termId));
query.addInclusionFilter(reverseIndexFullReader.also(termId, budget));
}
return this;
}
public IndexQueryBuilder not(long termId) {
public IndexQueryBuilder not(long termId, IndexSearchBudget budget) {
query.addInclusionFilter(reverseIndexFullReader.not(termId));
query.addInclusionFilter(reverseIndexFullReader.not(termId, budget));
return this;
}
@@ -55,20 +54,6 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder {
return this;
}
public IndexQueryBuilder addInclusionFilterAny(List<QueryFilterStepIf> filterSteps) {
if (filterSteps.isEmpty())
return this;
if (filterSteps.size() == 1) {
query.addInclusionFilter(filterSteps.getFirst());
}
else {
query.addInclusionFilter(new QueryFilterAnyOf(filterSteps));
}
return this;
}
public IndexQuery build() {
return query;
}
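Usage of the budget-threaded builder then follows the pattern in the tests further down; a sketch, with kw() standing in for a term-id lookup as in CombinedIndexReaderTest:

IndexSearchBudget budget = new IndexSearchBudget(10_000);
IndexQuery query = reader.findFullWord(kw("hello"))
        .also(kw("world"), budget)   // each filter can now abort when the budget runs out
        .not(kw("goodbye"), budget)
        .build();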

View File

@@ -35,6 +35,13 @@ public class StatefulIndex {
this.eventLog = eventLog;
}
/** For use in testing only */
public StatefulIndex(CombinedIndexReader combinedIndexReader) {
this.combinedIndexReader = combinedIndexReader;
this.servicesFactory = null;
this.eventLog = null;
}
public void init() {
Lock lock = indexReplacementLock.writeLock();

View File

@@ -1,8 +1,9 @@
package nu.marginalia.index.model;
import nu.marginalia.index.searchset.SearchSet;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.index.searchset.SearchSet;
import java.util.Objects;
@@ -41,6 +42,13 @@ public final class QueryParams {
this.queryStrategy = queryStrategy;
}
public boolean imposesDomainMetadataConstraint() {
return qualityLimit.type() != SpecificationLimitType.NONE
|| year.type() != SpecificationLimitType.NONE
|| size.type() != SpecificationLimitType.NONE
|| rank.type() != SpecificationLimitType.NONE;
}
public SpecificationLimit qualityLimit() {
return qualityLimit;
}

View File

@@ -0,0 +1,106 @@
package nu.marginalia.index.model;
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.index.index.CombinedIndexReader;
import java.util.BitSet;
public class ResultRankingContext {
private final int docCount;
public final RpcResultRankingParameters params;
public final SearchQuery searchQuery;
public final QueryParams queryParams;
public final CompiledQuery<String> compiledQuery;
public final CompiledQueryLong compiledQueryIds;
public final BitSet regularMask;
public final BitSet ngramsMask;
/** CqDataInt associated with frequency information of the terms in the query
* in the full index. The dataset is indexed by the compiled query. */
public final CqDataInt fullCounts;
/** CqDataInt associated with frequency information of the terms in the query
* in the priority index. The dataset is indexed by the compiled query. */
public final CqDataInt priorityCounts;
public static ResultRankingContext create(CombinedIndexReader currentIndex, SearchParameters searchParameters) {
var compiledQueryIds = searchParameters.compiledQueryIds;
var compiledQuery = searchParameters.compiledQuery;
int[] full = new int[compiledQueryIds.size()];
int[] prio = new int[compiledQueryIds.size()];
BitSet ngramsMask = new BitSet(compiledQuery.size());
BitSet regularMask = new BitSet(compiledQuery.size());
for (int idx = 0; idx < compiledQueryIds.size(); idx++) {
long id = compiledQueryIds.at(idx);
full[idx] = currentIndex.numHits(id);
prio[idx] = currentIndex.numHitsPrio(id);
if (compiledQuery.at(idx).contains("_")) {
ngramsMask.set(idx);
}
else {
regularMask.set(idx);
}
}
return new ResultRankingContext(currentIndex.totalDocCount(),
searchParameters,
compiledQuery,
compiledQueryIds,
ngramsMask,
regularMask,
new CqDataInt(full),
new CqDataInt(prio));
}
public ResultRankingContext(int docCount,
SearchParameters searchParameters,
CompiledQuery<String> compiledQuery,
CompiledQueryLong compiledQueryIds,
BitSet ngramsMask,
BitSet regularMask,
CqDataInt fullCounts,
CqDataInt prioCounts)
{
this.docCount = docCount;
this.searchQuery = searchParameters.query;
this.params = searchParameters.rankingParams;
this.queryParams = searchParameters.queryParams;
this.compiledQuery = compiledQuery;
this.compiledQueryIds = compiledQueryIds;
this.ngramsMask = ngramsMask;
this.regularMask = regularMask;
this.fullCounts = fullCounts;
this.priorityCounts = prioCounts;
}
public int termFreqDocCount() {
return docCount;
}
@Override
public String toString() {
return "ResultRankingContext{" +
"docCount=" + docCount +
", params=" + params +
", regularMask=" + regularMask +
", ngramsMask=" + ngramsMask +
", fullCounts=" + fullCounts +
", priorityCounts=" + priorityCounts +
'}';
}
}

View File

@@ -43,7 +43,7 @@ public class SearchParameters {
var limits = specsSet.queryLimits;
this.fetchSize = limits.getFetchSize();
this.budget = new IndexSearchBudget(limits.getTimeoutMs());
this.budget = new IndexSearchBudget(Math.max(limits.getTimeoutMs()/2, limits.getTimeoutMs()-50));
this.query = specsSet.query;
this.limitByDomain = limits.getResultsByDomain();
this.limitTotal = limits.getResultsTotal();
@@ -67,9 +67,7 @@ public class SearchParameters {
this.fetchSize = limits.getFetchSize();
// The time budget is trimmed (by 50 ms, or halved for very short timeouts)
// because this is the point when we start to wrap up the search and return the results.
this.budget = new IndexSearchBudget(limits.getTimeoutMs() / 2);
this.budget = new IndexSearchBudget(Math.max(limits.getTimeoutMs()/2, limits.getTimeoutMs()-50));
this.query = IndexProtobufCodec.convertRpcQuery(request.getQuery());
this.limitByDomain = limits.getResultsByDomain();
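The new expression keeps most of a generous timeout instead of always halving it, while still halving very short ones; a quick check of both regimes, with values in milliseconds:

// timeoutMs = 1000 -> max(500, 950) = 950 ms budget (was 500 ms)
// timeoutMs = 200  -> max(100, 150) = 150 ms
// timeoutMs = 80   -> max(40, 30)   = 40 ms (halving still wins below 100 ms)
long budgetMs = Math.max(timeoutMs / 2, timeoutMs - 50);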

View File

@@ -2,7 +2,7 @@ package nu.marginalia.index.results;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.index.model.ResultRankingContext;
import java.util.BitSet;
import java.util.List;

View File

@@ -12,13 +12,14 @@ import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
import nu.marginalia.index.forward.spans.DocumentSpans;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.model.ResultRankingContext;
import nu.marginalia.index.model.SearchTermsUtil;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.results.model.PhraseConstraintGroupList;
import nu.marginalia.index.results.model.QuerySearchTerms;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
@@ -30,9 +31,15 @@ import nu.marginalia.sequence.CodedSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.lang.foreign.Arena;
import java.sql.SQLException;
import java.util.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
@Singleton
public class IndexResultRankingService {
@@ -52,96 +59,142 @@ public class IndexResultRankingService {
this.domainRankingOverrides = domainRankingOverrides;
}
public List<SearchResultItem> rankResults(SearchParameters params,
boolean exportDebugData,
ResultRankingContext rankingContext,
CombinedDocIdList resultIds)
{
if (resultIds.isEmpty())
return List.of();
public RankingData prepareRankingData(ResultRankingContext rankingContext, CombinedDocIdList resultIds, @Nullable IndexSearchBudget budget) throws TimeoutException {
return new RankingData(rankingContext, resultIds, budget);
}
IndexResultScoreCalculator resultRanker = new IndexResultScoreCalculator(statefulIndex, domainRankingOverrides, rankingContext, params);
public final class RankingData implements AutoCloseable {
final Arena arena;
List<SearchResultItem> results = new ArrayList<>(resultIds.size());
private final TermMetadataList[] termsForDocs;
private final DocumentSpans[] documentSpans;
private final long[] flags;
private final CodedSequence[] positions;
private final CombinedDocIdList resultIds;
private final QuerySearchTerms searchTerms;
private final AtomicBoolean closed = new AtomicBoolean(false);
int pos = -1;
// Get the current index reader, which is the one we'll use for this calculation,
// this may change during the calculation, but we don't want to switch over mid-calculation
final CombinedIndexReader currentIndex = statefulIndex.get();
public RankingData(ResultRankingContext rankingContext, CombinedDocIdList resultIds, @Nullable IndexSearchBudget budget) throws TimeoutException {
this.resultIds = resultIds;
this.arena = Arena.ofShared();
final QuerySearchTerms searchTerms = getSearchTerms(params.compiledQuery, params.query);
final int termCount = searchTerms.termIdsAll.size();
this.searchTerms = getSearchTerms(rankingContext.compiledQuery, rankingContext.searchQuery);
final int termCount = searchTerms.termIdsAll.size();
// We use an arena for the position data to avoid gc pressure
// from the gamma coded sequences, which can be large and have a lifetime
// that lasts until this RankingData is closed
try (var arena = Arena.ofConfined()) {
this.flags = new long[termCount];
this.positions = new CodedSequence[termCount];
TermMetadataList[] termsForDocs = new TermMetadataList[termCount];
for (int ti = 0; ti < termCount; ti++) {
termsForDocs[ti] = currentIndex.getTermMetadata(arena, searchTerms.termIdsAll.at(ti), resultIds);
}
// Get the current index reader, which is the one we'll use for this calculation,
// this may change during the calculation, but we don't want to switch over mid-calculation
// Data for the document. We arrange this in arrays outside the calculation function to avoid
// hash lookups in the inner loop, as it's hot code, and we don't want unnecessary cpu cache
// thrashing in there; out here we can rely on implicit array ordering to match up the data.
final CombinedIndexReader currentIndex = statefulIndex.get();
long[] flags = new long[termCount];
CodedSequence[] positions = new CodedSequence[termCount];
// Perform expensive I/O operations
// Iterate over documents by their index in the combinedDocIds, as we need the index for the
// term data arrays as well
this.termsForDocs = currentIndex.getTermMetadata(arena, searchTerms.termIdsAll.array, resultIds);
if (budget != null && !budget.hasTimeLeft())
throw new TimeoutException();
this.documentSpans = currentIndex.getDocumentSpans(arena, resultIds);
}
for (int i = 0; i < resultIds.size(); i++) {
public CodedSequence[] positions() {
return positions;
}
public long[] flags() {
return flags;
}
public long resultId() {
return resultIds.at(pos);
}
public DocumentSpans documentSpans() {
return documentSpans[pos];
}
// Prepare term-level data for the document
public boolean next() {
if (++pos < resultIds.size()) {
for (int ti = 0; ti < flags.length; ti++) {
var tfd = termsForDocs[ti];
assert tfd != null : "No term data for term " + ti;
flags[ti] = tfd.flag(i);
positions[ti] = tfd.position(i);
flags[ti] = tfd.flag(pos);
positions[ti] = tfd.position(pos);
}
return true;
}
return false;
}
// Ignore documents that don't match the mandatory constraints
if (!searchTerms.phraseConstraints.testMandatory(positions)) {
continue;
}
public int size() {
return resultIds.size();
}
if (!exportDebugData) {
var score = resultRanker.calculateScore(arena, null, resultIds.at(i), searchTerms, flags, positions);
if (score != null) {
results.add(score);
}
}
else {
var rankingFactors = new DebugRankingFactors();
var score = resultRanker.calculateScore(arena, rankingFactors, resultIds.at(i), searchTerms, flags, positions);
if (score != null) {
score.debugRankingFactors = rankingFactors;
results.add(score);
}
}
public void close() {
if (closed.compareAndSet(false, true)) {
arena.close();
}
}
}
public List<SearchResultItem> rankResults(
IndexSearchBudget budget,
ResultRankingContext rankingContext,
RankingData rankingData,
boolean exportDebugData)
{
IndexResultScoreCalculator resultRanker = new IndexResultScoreCalculator(statefulIndex, domainRankingOverrides, rankingContext);
List<SearchResultItem> results = new ArrayList<>(rankingData.size());
// Iterate over documents by their index in the combinedDocIds, as we need the index for the
// term data arrays as well
var searchTerms = rankingData.searchTerms;
while (rankingData.next() && budget.hasTimeLeft()) {
// Ignore documents that don't match the mandatory constraints
if (!searchTerms.phraseConstraints.testMandatory(rankingData.positions())) {
continue;
}
return results;
if (!exportDebugData) {
var score = resultRanker.calculateScore(null, rankingData.resultId(), searchTerms, rankingData.flags(), rankingData.positions(), rankingData.documentSpans());
if (score != null) {
results.add(score);
}
}
else {
var rankingFactors = new DebugRankingFactors();
var score = resultRanker.calculateScore( rankingFactors, rankingData.resultId(), searchTerms, rankingData.flags(), rankingData.positions(), rankingData.documentSpans());
if (score != null) {
score.debugRankingFactors = rankingFactors;
results.add(score);
}
}
}
return results;
}
public List<RpcDecoratedResultItem> selectBestResults(SearchParameters params,
public List<RpcDecoratedResultItem> selectBestResults(int limitByDomain,
int limitTotal,
ResultRankingContext resultRankingContext,
Collection<SearchResultItem> results) throws SQLException {
List<SearchResultItem> results) throws SQLException {
var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain);
var domainCountFilter = new IndexResultDomainDeduplicator(limitByDomain);
List<SearchResultItem> resultsList = new ArrayList<>(results.size());
TLongList idsList = new TLongArrayList(params.limitTotal);
TLongList idsList = new TLongArrayList(limitTotal);
for (var item : results) {
if (domainCountFilter.test(item)) {
if (resultsList.size() < params.limitTotal) {
if (resultsList.size() < limitTotal) {
resultsList.add(item);
idsList.add(item.getDocumentId());
}
@@ -159,19 +212,26 @@ public class IndexResultRankingService {
// for the selected results, as this would be comically expensive to do for all the results we
// discard along the way
if (params.rankingParams.getExportDebugData()) {
if (resultRankingContext.params.getExportDebugData()) {
var combinedIdsList = new LongArrayList(resultsList.size());
for (var item : resultsList) {
combinedIdsList.add(item.combinedId);
}
resultsList.clear();
resultsList.addAll(this.rankResults(
params,
true,
resultRankingContext,
new CombinedDocIdList(combinedIdsList))
);
IndexSearchBudget budget = new IndexSearchBudget(10000);
try (var data = prepareRankingData(resultRankingContext, new CombinedDocIdList(combinedIdsList), null)) {
resultsList.addAll(this.rankResults(
budget,
resultRankingContext,
data,
true)
);
}
catch (TimeoutException ex) {
// this won't happen since we passed null for budget
}
}
// Fetch the document details for the selected results in one go, from the local document database
@@ -247,7 +307,7 @@ public class IndexResultRankingService {
var termOutputs = RpcResultTermRankingOutputs.newBuilder();
CqDataLong termIds = params.compiledQueryIds.data;;
CqDataLong termIds = resultRankingContext.compiledQueryIds.data;
for (var entry : debugFactors.getTermFactors()) {
String term = "[ERROR IN LOOKUP]";
@@ -255,7 +315,7 @@ public class IndexResultRankingService {
// CURSED: This is a linear search, but the number of terms is small, and it's in a debug path
for (int i = 0; i < termIds.size(); i++) {
if (termIds.get(i) == entry.termId()) {
term = params.compiledQuery.at(i);
term = resultRankingContext.compiledQuery.at(i);
break;
}
}
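The split between prepareRankingData and rankResults separates the I/O phase from the scoring phase; the intended call sequence is the same as in the debug path above:

// Sketch: prepare performs the arena-backed reads, rankResults does the CPU work,
// and try-with-resources releases the arena via RankingData.close().
try (var data = rankingService.prepareRankingData(rankingContext, docIds, budget)) {
    resultHeap.addAll(rankingService.rankResults(budget, rankingContext, data, false));
} catch (TimeoutException ex) {
    // the budget expired during the I/O phase; drop this batch
}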

View File

@@ -6,14 +6,13 @@ import nu.marginalia.api.searchquery.RpcResultRankingParameters;
import nu.marginalia.api.searchquery.RpcTemporalBias;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
import nu.marginalia.index.forward.spans.DocumentSpans;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.QueryParams;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.model.ResultRankingContext;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.results.model.PhraseConstraintGroupList;
import nu.marginalia.index.results.model.QuerySearchTerms;
@@ -28,7 +27,6 @@ import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.SequenceOperations;
import javax.annotation.Nullable;
import java.lang.foreign.Arena;
import java.util.BitSet;
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.booleanAggregate;
@@ -47,24 +45,23 @@ public class IndexResultScoreCalculator {
public IndexResultScoreCalculator(StatefulIndex statefulIndex,
DomainRankingOverrides domainRankingOverrides,
ResultRankingContext rankingContext,
SearchParameters params)
ResultRankingContext rankingContext)
{
this.index = statefulIndex.get();
this.domainRankingOverrides = domainRankingOverrides;
this.rankingContext = rankingContext;
this.queryParams = params.queryParams;
this.compiledQuery = params.compiledQuery;
this.queryParams = rankingContext.queryParams;
this.compiledQuery = rankingContext.compiledQuery;
}
@Nullable
public SearchResultItem calculateScore(Arena arena,
@Nullable DebugRankingFactors debugRankingFactors,
public SearchResultItem calculateScore(@Nullable DebugRankingFactors debugRankingFactors,
long combinedId,
QuerySearchTerms searchTerms,
long[] wordFlags,
CodedSequence[] positions)
CodedSequence[] positions,
DocumentSpans spans)
{
CompiledQuery<CodedSequence> positionsQuery = compiledQuery.root.newQuery(positions);
@@ -92,8 +89,6 @@ public class IndexResultScoreCalculator {
int docSize = index.getDocumentSize(docId);
if (docSize <= 0) docSize = 5000;
DocumentSpans spans = index.getDocumentSpans(arena, docId);
if (debugRankingFactors != null) {
debugRankingFactors.addDocumentFactor("doc.docId", Long.toString(combinedId));
debugRankingFactors.addDocumentFactor("doc.combinedId", Long.toString(docId));
@@ -235,7 +230,7 @@ public class IndexResultScoreCalculator {
long result = 0;
int bit = 0;
IntIterator intersection = phraseConstraints.getFullGroup().findIntersections(positions).intIterator();
IntIterator intersection = phraseConstraints.getFullGroup().findIntersections(positions, 64).intIterator();
while (intersection.hasNext() && bit < 64) {
bit = (int) (Math.sqrt(intersection.nextInt()));

View File

@@ -3,7 +3,7 @@ package nu.marginalia.index.results;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.index.model.ResultRankingContext;
import nu.marginalia.model.idx.WordFlags;
import java.util.List;

View File

@@ -58,6 +58,7 @@ public class PhraseConstraintGroupList {
private final int[] offsets;
private final BitSet present;
private final BitSet termIdsMask;
private final int presentCardinality;
public final int size;
public PhraseConstraintGroup(List<String> terms, TermIdList termIdsAll) {
@@ -85,6 +86,8 @@ public class PhraseConstraintGroupList {
termIdsMask.set(idx);
}
}
presentCardinality = present.cardinality();
}
/** Returns true if the term with index termIdx in the query is in the group */
@@ -93,7 +96,7 @@ public class PhraseConstraintGroupList {
}
public boolean test(CodedSequence[] positions) {
IntIterator[] sequences = new IntIterator[present.cardinality()];
IntIterator[] sequences = new IntIterator[presentCardinality];
for (int oi = 0, si = 0; oi < offsets.length; oi++) {
if (!present.get(oi)) {
@@ -120,7 +123,7 @@ public class PhraseConstraintGroupList {
public IntList findIntersections(IntList[] positions) {
IntList[] sequences = new IntList[present.cardinality()];
IntList[] sequences = new IntList[presentCardinality];
int[] iterOffsets = new int[sequences.length];
for (int oi = 0, si = 0; oi < offsets.length; oi++) {
@@ -144,12 +147,41 @@ public class PhraseConstraintGroupList {
iterOffsets[si - 1] = -oi;
}
return SequenceOperations.findIntersections(sequences, iterOffsets);
return SequenceOperations.findIntersections(sequences, iterOffsets, Integer.MAX_VALUE);
}
public IntList findIntersections(IntList[] positions, int n) {
IntList[] sequences = new IntList[presentCardinality];
int[] iterOffsets = new int[sequences.length];
for (int oi = 0, si = 0; oi < offsets.length; oi++) {
if (!present.get(oi)) {
continue;
}
int offset = offsets[oi];
if (offset < 0)
return IntList.of();
// Create iterators that are offset by their relative position in the
// sequence. This is done by subtracting the index from the offset,
// so that when we intersect them, an overlap means that the terms are
// in the correct order. Note the offset is negative!
var posForTerm = positions[offset];
if (posForTerm == null) {
return IntList.of();
}
sequences[si++] = posForTerm;
iterOffsets[si - 1] = -oi;
}
return SequenceOperations.findIntersections(sequences, iterOffsets, n);
}
public int minDistance(IntList[] positions) {
List<IntList> sequences = new ArrayList<>(present.cardinality());
IntList iterOffsets = new IntArrayList(present.cardinality());
List<IntList> sequences = new ArrayList<>(presentCardinality);
IntList iterOffsets = new IntArrayList(presentCardinality);
for (int oi = 0; oi < offsets.length; oi++) {
if (!present.get(oi)) {
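The offset trick in findIntersections is easiest to see on a two-term phrase; a worked example with made-up positions:

// Phrase "foo bar": positions(foo) = {10, 50}, positions(bar) = {11, 40}.
// bar is term index 1, so its iterator is shifted by -1, giving {10, 39};
// intersecting with foo's {10, 50} yields {10}: the phrase occurs once,
// with "foo" at position 10 and "bar" immediately after it.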

View File

@@ -1,7 +1,7 @@
package nu.marginalia.index.results.model.ids;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import org.roaringbitmap.longlong.Roaring64Bitmap;
import nu.marginalia.array.page.LongQueryBuffer;
import java.util.Arrays;
import java.util.stream.LongStream;
@@ -17,17 +17,23 @@ public final class CombinedDocIdList {
public CombinedDocIdList(long... data) {
this.data = Arrays.copyOf(data, data.length);
}
public CombinedDocIdList(LongQueryBuffer buffer) {
this.data = buffer.copyData();
}
public CombinedDocIdList(LongArrayList data) {
this.data = data.toLongArray();
}
public CombinedDocIdList(Roaring64Bitmap data) {
this.data = data.toArray();
}
public CombinedDocIdList() {
this.data = new long[0];
}
public static CombinedDocIdList combineLists(CombinedDocIdList one, CombinedDocIdList other) {
long[] data = new long[one.size() + other.size()];
System.arraycopy(one.data, 0, data, 0, one.data.length);
System.arraycopy(other.data, 0, data, one.data.length, other.data.length);
return new CombinedDocIdList(data);
}
public int size() {
return data.length;
}

View File

@@ -6,7 +6,7 @@ import java.util.Arrays;
import java.util.stream.LongStream;
public final class TermIdList {
private final long[] array;
public final long[] array;
public TermIdList(long[] array) {
this.array = array;

View File

@@ -59,7 +59,7 @@ public class RankingSearchSet implements SearchSet {
}
@Override
public boolean contains(int domainId, long documentMetadata) {
public boolean contains(int domainId) {
// This is the main check
if (set.contains(domainId) || set.isEmpty()) {

View File

@@ -7,6 +7,10 @@ public interface SearchSet {
* or if the documentMetadata vibes with the set
*
*/
boolean contains(int domainId, long documentMetadata);
boolean contains(int domainId);
default boolean imposesConstraint() {
return true;
}
}

View File

@@ -2,7 +2,7 @@ package nu.marginalia.index.searchset;
public class SearchSetAny implements SearchSet {
@Override
public boolean contains(int domainId, long meta) {
public boolean contains(int domainId) {
return true;
}
@@ -10,4 +10,9 @@ public class SearchSetAny implements SearchSet {
public String toString() {
return getClass().getSimpleName();
}
@Override
public boolean imposesConstraint() {
return false;
}
}

View File

@@ -14,7 +14,7 @@ public class SmallSearchSet implements SearchSet {
}
@Override
public boolean contains(int domainId, long meta) {
public boolean contains(int domainId) {
return entries.contains(domainId);
}

View File

@@ -1,7 +1,7 @@
package nu.marginalia.index.query;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import java.util.ArrayList;
import java.util.List;
@@ -18,15 +18,16 @@ import java.util.stream.Collectors;
public class IndexQuery {
private final List<EntrySource> sources;
private final List<QueryFilterStepIf> inclusionFilter = new ArrayList<>(10);
private boolean prioritize = false;
public IndexQuery(List<EntrySource> sources)
public IndexQuery(EntrySource source, boolean prioritize)
{
this.sources = sources;
this.sources = List.of(source);
this.prioritize = prioritize;
}
public IndexQuery(EntrySource... sources)
{
this.sources = List.of(sources);
public boolean isPrioritized() {
return prioritize;
}
/** Adds a filter to the query. The filter will be applied to the results
* after they are read from the sources.
@@ -60,6 +61,7 @@ public class IndexQuery {
if (!fillBuffer(dest))
return;
for (var filter : inclusionFilter) {
filter.apply(dest);
@@ -73,6 +75,8 @@ public class IndexQuery {
private boolean fillBuffer(LongQueryBuffer dest) {
for (;;) {
dest.zero();
EntrySource source = sources.get(si);
source.read(dest);
@@ -102,6 +106,7 @@ public class IndexQuery {
return sb.toString();
}
}

View File

@@ -2,8 +2,6 @@ package nu.marginalia.index.query;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import java.util.List;
/** Builds a query.
* <p />
* Note: The query builder may omit predicates that are deemed redundant.
@@ -11,14 +9,13 @@ import java.util.List;
public interface IndexQueryBuilder {
/** Filters documents that also contain termId, within the full index.
*/
IndexQueryBuilder also(long termId);
IndexQueryBuilder also(long termId, IndexSearchBudget budget);
/** Excludes documents that contain termId, within the full index
*/
IndexQueryBuilder not(long termId);
IndexQueryBuilder not(long termId, IndexSearchBudget budget);
IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep);
IndexQueryBuilder addInclusionFilterAny(List<QueryFilterStepIf> filterStep);
IndexQuery build();
}

View File

@@ -10,5 +10,5 @@ public class IndexSearchBudget {
}
public boolean hasTimeLeft() { return System.currentTimeMillis() < timeout; }
public long timeLeft() { return timeout - System.currentTimeMillis(); }
public long timeLeft() { return Math.max(0, timeout - System.currentTimeMillis()); }
}
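Clamping timeLeft() at zero matters because callers feed the value straight into queue waits; a small check, assuming the constructor takes a millisecond allowance as in SearchParameters above:

IndexSearchBudget budget = new IndexSearchBudget(100);
Thread.sleep(150);
assert budget.timeLeft() == 0;   // previously this could go negative, e.g. -50
assert !budget.hasTimeLeft();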

View File

@@ -1,71 +0,0 @@
package nu.marginalia.index.query.filter;
import nu.marginalia.array.page.LongQueryBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.StringJoiner;
public class QueryFilterAllOf implements QueryFilterStepIf {
private final List<QueryFilterStepIf> steps;
public QueryFilterAllOf(List<? extends QueryFilterStepIf> steps) {
this.steps = new ArrayList<>(steps.size());
for (var step : steps) {
if (step instanceof QueryFilterAllOf allOf) {
this.steps.addAll(allOf.steps);
}
else {
this.steps.add(step);
}
}
}
public QueryFilterAllOf(QueryFilterStepIf... steps) {
this(List.of(steps));
}
public double cost() {
double prod = 1.;
for (var step : steps) {
double cost = step.cost();
if (cost > 1.0) {
prod *= Math.log(cost);
}
else {
prod += cost;
}
}
return prod;
}
@Override
public boolean test(long value) {
for (var step : steps) {
if (!step.test(value))
return false;
}
return true;
}
public void apply(LongQueryBuffer buffer) {
if (steps.isEmpty())
return;
for (var step : steps) {
step.apply(buffer);
}
}
public String describe() {
StringJoiner sj = new StringJoiner(",", "[All Of: ", "]");
for (var step : steps) {
sj.add(step.describe());
}
return sj.toString();
}
}

View File

@@ -1,86 +0,0 @@
package nu.marginalia.index.query.filter;
import nu.marginalia.array.page.LongQueryBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.StringJoiner;
public class QueryFilterAnyOf implements QueryFilterStepIf {
private final List<QueryFilterStepIf> steps;
public QueryFilterAnyOf(List<? extends QueryFilterStepIf> steps) {
this.steps = new ArrayList<>(steps.size());
for (var step : steps) {
if (step instanceof QueryFilterAnyOf anyOf) {
this.steps.addAll(anyOf.steps);
} else {
this.steps.add(step);
}
}
}
public QueryFilterAnyOf(QueryFilterStepIf... steps) {
this(List.of(steps));
}
public double cost() {
return steps.stream().mapToDouble(QueryFilterStepIf::cost).sum();
}
@Override
public boolean test(long value) {
for (var step : steps) {
if (step.test(value))
return true;
}
return false;
}
public void apply(LongQueryBuffer buffer) {
if (steps.isEmpty())
return;
if (steps.size() == 1) {
steps.getFirst().apply(buffer);
return;
}
int start = 0;
final int endOfValidData = buffer.end; // End of valid data range
// The filters act as a partitioning function, where anything before buffer.end
// is "in", and is guaranteed to be sorted; and anything after buffer.end is "out"
// but no sorting guarantee is provided.
// To provide a conditional filter, we re-sort the "out" range, slice it and apply filtering to the slice
for (var step : steps)
{
var slice = buffer.slice(start, endOfValidData);
slice.data.sort(0, slice.size());
step.apply(slice);
start += slice.end;
}
// After we're done, read and write pointers should be 0 and "end" should be the length of valid data,
// normally done through buffer.finalizeFiltering(); but that won't work here
buffer.reset();
buffer.end = start;
// After all filters have been applied, we must re-sort all the retained data
// to uphold the sortedness contract
buffer.data.sort(0, buffer.end);
}
public String describe() {
StringJoiner sj = new StringJoiner(",", "[Any Of: ", "]");
for (var step : steps) {
sj.add(step.describe());
}
return sj.toString();
}
}

View File

@@ -4,11 +4,6 @@ import nu.marginalia.array.page.LongQueryBuffer;
public class QueryFilterLetThrough implements QueryFilterStepIf {
@Override
public boolean test(long value) {
return true;
}
@Override
public void apply(LongQueryBuffer buffer) {
buffer.retainAll();

View File

@@ -5,11 +5,6 @@ import nu.marginalia.array.page.LongQueryBuffer;
public class QueryFilterNoPass implements QueryFilterStepIf {
static final QueryFilterStepIf instance = new QueryFilterNoPass();
@Override
public boolean test(long value) {
return false;
}
public void apply(LongQueryBuffer buffer) {
buffer.finalizeFiltering();
}

View File

@@ -1,27 +0,0 @@
package nu.marginalia.index.query.filter;
import java.util.function.LongPredicate;
public class QueryFilterStepExcludeFromPredicate implements QueryFilterStepIf {
private final LongPredicate pred;
public QueryFilterStepExcludeFromPredicate(LongPredicate pred) {
this.pred = pred;
}
@Override
public boolean test(long value) {
return !pred.test(value);
}
@Override
public double cost() {
return 1;
}
@Override
public String describe() {
return "[!Predicate]";
}
}

View File

@@ -1,27 +0,0 @@
package nu.marginalia.index.query.filter;
import java.util.function.LongPredicate;
public class QueryFilterStepFromPredicate implements QueryFilterStepIf {
private final LongPredicate pred;
public QueryFilterStepFromPredicate(LongPredicate pred) {
this.pred = pred;
}
@Override
public boolean test(long value) {
return pred.test(value);
}
@Override
public double cost() {
return 1;
}
@Override
public String describe() {
return "[Predicate]";
}
}

View File

@@ -3,8 +3,6 @@ package nu.marginalia.index.query.filter;
import nu.marginalia.array.page.LongQueryBuffer;
public interface QueryFilterStepIf extends Comparable<QueryFilterStepIf> {
boolean test(long value);
double cost();
default int compareTo(QueryFilterStepIf other) {
@@ -22,17 +20,7 @@ public interface QueryFilterStepIf extends Comparable<QueryFilterStepIf> {
*
* <p>ASSUMPTION: buffer is sorted up until end.</p>
*/
default void apply(LongQueryBuffer buffer) {
while (buffer.hasMore()) {
if (test(buffer.currentValue())) {
buffer.retainAndAdvance();
}
else {
buffer.rejectAndAdvance();
}
}
buffer.finalizeFiltering();
}
void apply(LongQueryBuffer buffer);
}
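With the default apply() gone, each filter now supplies its own buffer loop; a minimal implementation sketch of the retain/reject idiom the interface expects (compare ParamMatchingQueryFilter above):

// Hypothetical filter, for illustration only: retains even document ids.
class EvenDocIdFilter implements QueryFilterStepIf {
    public boolean test(long value) { return (value & 1) == 0; }
    public double cost() { return 1.0; }
    public String describe() { return "[EvenDocId]"; }

    public void apply(LongQueryBuffer buffer) {
        while (buffer.hasMore()) {
            if (test(buffer.currentValue())) buffer.retainAndAdvance();
            else buffer.rejectAndAdvance();
        }
        buffer.finalizeFiltering();
    }
}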

View File

@@ -1,93 +0,0 @@
package nu.marginalia.index.query.filter;
import nu.marginalia.array.page.LongQueryBuffer;
import org.junit.jupiter.api.Test;
import java.util.List;
import static org.junit.jupiter.api.Assertions.*;
class QueryFilterStepIfTest {
private LongQueryBuffer createBuffer(long... data) {
return new LongQueryBuffer(data, data.length);
}
@Test
public void testPassThrough() {
var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
var filter = new QueryFilterLetThrough();
filter.apply(buffer);
assertArrayEquals(new long[]{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, buffer.copyData());
}
@Test
public void testNoPass() {
var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
var filter = new QueryFilterNoPass();
filter.apply(buffer);
assertArrayEquals(new long[]{}, buffer.copyData());
}
@Test
public void testIncludePredicate() {
var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
var filter = new QueryFilterStepFromPredicate(value -> value % 2 == 0);
filter.apply(buffer);
assertArrayEquals(new long[]{2, 4, 6, 8, 10}, buffer.copyData());
}
@Test
public void testExcludePredicate() {
var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
var filter = new QueryFilterStepExcludeFromPredicate(value -> value % 2 == 1);
filter.apply(buffer);
assertArrayEquals(new long[]{2, 4, 6, 8, 10}, buffer.copyData());
}
@Test
public void testSuccessiveApplication() {
var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
var filter1 = new QueryFilterStepFromPredicate(value -> value % 2 == 0);
var filter2 = new QueryFilterStepExcludeFromPredicate(value -> value <= 6);
filter1.apply(buffer);
filter2.apply(buffer);
assertArrayEquals(new long[]{8, 10}, buffer.copyData());
}
@Test
public void testSuccessiveApplicationWithAllOf() {
var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
var filter1 = new QueryFilterStepFromPredicate(value -> value % 2 == 0);
var filter2 = new QueryFilterStepExcludeFromPredicate(value -> value <= 6);
new QueryFilterAllOf(List.of(filter1, filter2)).apply(buffer);
assertArrayEquals(new long[]{8, 10}, buffer.copyData());
}
@Test
public void testCombinedOrAnd() {
var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
var filter1 = new QueryFilterStepFromPredicate(value -> value % 2 == 0);
var filter2 = new QueryFilterStepFromPredicate(value -> value <= 5);
var filter1_2 = new QueryFilterAllOf(List.of(filter1, filter2));
var filter3 = new QueryFilterStepFromPredicate(value -> value % 2 == 1);
var filter4 = new QueryFilterStepFromPredicate(value -> value > 5);
var filter3_4 = new QueryFilterAllOf(List.of(filter3, filter4));
var filter12_34 = new QueryFilterAnyOf(List.of(filter1_2, filter3_4));
filter12_34.apply(buffer);
assertArrayEquals(new long[]{2, 4, 7, 9}, buffer.copyData());
}
@Test
public void testCombinedApplication() {
var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
var filter1 = new QueryFilterStepFromPredicate(value -> value % 3 == 0);
var filter2 = new QueryFilterStepFromPredicate(value -> value % 5 == 0);
var filter = new QueryFilterAnyOf(List.of(filter1, filter2));
filter.apply(buffer);
assertArrayEquals(new long[]{3, 5, 6, 9, 10}, buffer.copyData());
}
}

View File

@@ -16,6 +16,7 @@ import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.journal.IndexJournalSlopWriter;
import nu.marginalia.index.positions.TermData;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.linkdb.docs.DocumentDbWriter;
@@ -156,7 +157,7 @@ public class CombinedIndexReaderTest {
var reader = indexFactory.getCombinedIndexReader();
var query = reader
.findFullWord(kw("hello"))
.also(kw("world"))
.also(kw("world"), new IndexSearchBudget(10_000))
.build();
var buffer = new LongQueryBuffer(32);
@@ -198,8 +199,8 @@ public class CombinedIndexReaderTest {
var reader = indexFactory.getCombinedIndexReader();
var query = reader.findFullWord(kw("hello"))
.also(kw("world"))
.not(kw("goodbye"))
.also(kw("world"), new IndexSearchBudget(10_000))
.not(kw("goodbye"), new IndexSearchBudget(10_000))
.build();
var buffer = new LongQueryBuffer(32);
@@ -255,18 +256,19 @@ public class CombinedIndexReaderTest {
Path outputFileDocs = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.NEXT);
Path outputFileWords = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.NEXT);
Path outputFilePositions = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.NEXT);
Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
Path tmpDir = workDir.resolve("tmp");
if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
var constructor = new FullIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,
DocIdRewriter.identity(),
tmpDir);
var constructor =
new FullIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,
DocIdRewriter.identity(),
tmpDir);
constructor.createReverseIndex(new FakeProcessHeartbeat(), "name", workDir);
}

View File

@@ -411,8 +411,6 @@ public class IndexQueryServiceIntegrationSmokeTest {
.filter(v -> (id % v) == 0)
.toArray();
System.out.println("id:" + id + " factors: " + Arrays.toString(factors));
long fullId = fullId(id);
ldbw.add(new DocdbUrlDetail(

View File

@@ -25,10 +25,10 @@ class RankingSearchSetTest {
set.write();
RankingSearchSet set2 = new RankingSearchSet("ACADEMIA", p);
assertTrue(set2.contains(1, 0));
assertTrue(set2.contains(5, 0));
assertTrue(set2.contains(7, 0));
assertTrue(set2.contains(9, 0));
assertTrue(set2.contains(1));
assertTrue(set2.contains(5));
assertTrue(set2.contains(7));
assertTrue(set2.contains(9));
Files.delete(p);

View File

@@ -21,7 +21,7 @@ dependencies {
implementation libs.lz4
implementation libs.guava
implementation project(':code:libraries:array:cpp')
implementation project(':code:libraries:native')
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit

View File

@@ -1,26 +0,0 @@
plugins {
id 'java'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
dependencies {
implementation libs.bundles.slf4j
}
apply from: "$rootProject.projectDir/srcsets.gradle"
// We use a custom task to compile the C++ code into a shared library
// with a shellscript as gradle's c++ tasks are kind of insufferable
tasks.register('compileCpp', Exec) {
inputs.files('compile.sh', 'src/main/cpp/cpphelpers.cpp', 'src/main/public/cpphelpers.h')
outputs.file 'resources/libcpp.so'
commandLine 'sh', 'compile.sh'
}
processResources.dependsOn('compileCpp')

View File

@@ -1,10 +0,0 @@
#!/usr/bin/env sh
CXX=${CXX:-g++}
if ! which ${CXX} > /dev/null; then
echo "g++ not found, skipping compilation"
exit 0
fi
${CXX} -O3 -march=native -std=c++14 -shared -Isrc/main/public src/main/cpp/*.cpp -o resources/libcpp.so

View File

@@ -1,8 +0,0 @@
#include <stdint.h>
#pragma once
extern "C" {
void ms_sort_64(int64_t* area, uint64_t start, uint64_t end);
void ms_sort_128(int64_t* area, uint64_t start, uint64_t end);
}

View File

@@ -0,0 +1,55 @@
package nu.marginalia.array;
import nu.marginalia.ffi.LinuxSystemCalls;
import java.io.IOException;
import java.lang.foreign.MemorySegment;
import java.nio.file.Path;
public class DirectFileReader implements AutoCloseable {
int fd;
public DirectFileReader(Path filename) throws IOException {
fd = LinuxSystemCalls.openDirect(filename);
if (fd < 0) {
throw new IOException("Error opening direct file: " + filename);
}
}
public void readAligned(LongArray dest, long offset) throws IOException {
readAligned(dest.getMemorySegment(), offset);
}
public void readAligned(MemorySegment segment, long offset) throws IOException {
if (LinuxSystemCalls.readAt(fd, segment, offset) != segment.byteSize()) {
throw new IOException("Failed to read data at " + offset);
}
}
public void readUnaligned(MemorySegment dest, MemorySegment alignedBuffer, long fileOffset) throws IOException {
int destOffset = 0;
for (long totalBytesToCopy = dest.byteSize(); totalBytesToCopy > 0; ) {
long alignedPageAddress = fileOffset & -4096L;
long srcPageOffset = fileOffset & 4095L;
long srcPageEnd = Math.min(srcPageOffset + totalBytesToCopy, 4096);
// wrapper for O_DIRECT pread
if (LinuxSystemCalls.readAt(fd, alignedBuffer, alignedPageAddress) != alignedBuffer.byteSize()) {
throw new IOException("Failed to read data at " + alignedPageAddress + " of size " + dest.byteSize());
}
int bytesToCopy = (int) (srcPageEnd - srcPageOffset);
MemorySegment.copy(alignedBuffer, srcPageOffset, dest, destOffset, bytesToCopy);
destOffset += bytesToCopy;
fileOffset += bytesToCopy;
totalBytesToCopy -= bytesToCopy;
}
}
public void close() {
LinuxSystemCalls.closeFd(fd);
}
}
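Since O_DIRECT requires block-aligned reads, readUnaligned bounces every page through an aligned scratch buffer; a usage sketch, assuming a 4096-byte block size as in the code above and a hypothetical file docs.dat:

try (Arena arena = Arena.ofConfined();
     DirectFileReader reader = new DirectFileReader(Path.of("docs.dat"))) {
    MemorySegment aligned = arena.allocate(4096, 4096); // one page, 4096-byte aligned
    MemorySegment dest = arena.allocate(100);           // arbitrary-sized destination
    reader.readUnaligned(dest, aligned, 5000);          // page 4096, in-page offset 904
}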

View File

@@ -5,6 +5,7 @@ import nu.marginalia.array.page.UnsafeLongArray;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -25,6 +26,13 @@ public class LongArrayFactory {
return SegmentLongArray.onHeap(Arena.ofShared(), size);
}
public static LongArray onHeapManaged(Arena arena, long size) {
if (useUnsafe)
return UnsafeLongArray.wrap(arena.allocate(8 * size));
else
return SegmentLongArray.wrap(arena.allocate(8 * size));
}
public static LongArray mmapForReadingConfined(Path filename) throws IOException {
if (useUnsafe)
return UnsafeLongArray.fromMmapReadOnly(Arena.ofConfined(), filename, 0, Files.size(filename) / 8);
@@ -66,4 +74,13 @@ public class LongArrayFactory {
else
return SegmentLongArray.fromMmapReadWrite(Arena.ofShared(), filename, 0, size);
}
public static LongArray wrap(MemorySegment ms) {
if (useUnsafe) {
return UnsafeLongArray.wrap(ms);
}
else {
return SegmentLongArray.wrap(ms);
}
}
}

View File

@@ -20,6 +20,27 @@ public interface LongArraySearch extends LongArrayBase {
return fromIndex + low;
}
default long binarySearch2(long key, long fromIndex, long toIndex) {
long low = 0;
long len = toIndex - fromIndex;
while (len > 0) {
var half = len / 2;
long val = get(fromIndex + low + half);
if (val < key) {
low += len - half;
}
else if (val == key) {
low += half;
break;
}
len = half;
}
return fromIndex + low;
}
default long binarySearchN(int sz, long key, long fromIndex, long toIndex) {
long low = 0;
long high = (toIndex - fromIndex)/sz - 1;
@@ -33,6 +54,7 @@ public interface LongArraySearch extends LongArrayBase {
len = half;
}
return fromIndex + sz * low;
}
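binarySearch2 appears to behave as a lower-bound search with an early exit on an exact hit: it returns the index of the first element >= key, or toIndex when every element is smaller. A quick trace over a sorted array:

// values: [1, 3, 5, 7], fromIndex = 0, toIndex = 4
// binarySearch2(5, 0, 4) -> 2   exact hit, early exit
// binarySearch2(4, 0, 4) -> 2   insertion point, lands on 5
// binarySearch2(9, 0, 4) -> 4   == toIndex, key beyond the range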

View File

@@ -1,7 +1,7 @@
package nu.marginalia.array.algo;
import nu.marginalia.NativeAlgos;
import nu.marginalia.array.LongArray;
import nu.marginalia.ffi.NativeAlgos;
import java.io.IOException;
import java.nio.channels.FileChannel;

View File

@@ -3,6 +3,8 @@ package nu.marginalia.array.page;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.nio.ByteBuffer;
import java.util.Arrays;
@@ -36,13 +38,12 @@ public class LongQueryBuffer {
public LongQueryBuffer(int size) {
this.data = LongArrayFactory.onHeapConfined(size);
this.end = size;
this.end = 0;
}
public LongQueryBuffer(long[] data, int size) {
this.data = LongArrayFactory.onHeapConfined(size);
this.data.set(0, data);
this.end = size;
}
@@ -52,6 +53,26 @@ public class LongQueryBuffer {
return copy;
}
public long[] copyFilterData() {
long[] copy = new long[write];
data.forEach(0, write, (pos, val) -> copy[(int)pos]=val );
return copy;
}
public boolean fitsMore() {
return end < data.size();
}
public int addData(MemorySegment source, long sourceOffset, int nMax) {
int n = Math.min(nMax, (int) data.size() - end);
MemorySegment.copy(source, ValueLayout.JAVA_LONG, sourceOffset, data.getMemorySegment(), ValueLayout.JAVA_LONG, 8L * end, n);
end += n;
return n;
}
/** Dispose of the buffer and release resources */
public void dispose() {
data.close();
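addData appends up to nMax longs from a source segment and reports how many were actually copied, so a reader can loop until the buffer is full; a sketch, noting that the source offset appears to be a byte offset (the destination side advances by 8L * end):

MemorySegment src = Arena.ofAuto().allocate(8L * 1000, 8); // hypothetical source data
LongQueryBuffer buffer = new LongQueryBuffer(256);
long srcOffset = 0;
while (buffer.fitsMore()) {
    int copied = buffer.addData(src, srcOffset, 64); // at most 64 longs per bite
    if (copied == 0) break;
    srcOffset += 8L * copied;                        // advance in bytes
}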

View File

@@ -29,6 +29,10 @@ public class SegmentLongArray implements LongArray {
this.arena = arena;
}
public static SegmentLongArray wrap(MemorySegment segment) {
return new SegmentLongArray(segment, null);
}
public static SegmentLongArray onHeap(Arena arena, long size) {
return new SegmentLongArray(arena.allocate(WORD_SIZE*size, 16), arena);
}

View File

@@ -50,6 +50,10 @@ public class UnsafeLongArray implements LongArray {
this.channel = channel;
}
public static UnsafeLongArray wrap(MemorySegment ms) {
return new UnsafeLongArray(ms, null);
}
public static UnsafeLongArray onHeap(Arena arena, long size) {
return new UnsafeLongArray(arena.allocate(WORD_SIZE*size, 16), arena);
}
@@ -77,6 +81,10 @@ public class UnsafeLongArray implements LongArray {
@Override
public LongArray range(long start, long end) {
assert end >= start : end + "<" + start;
assert end <= size() : end + ">" + size();
return new UnsafeLongArray(
segment.asSlice(
start * JAVA_LONG.byteSize(),
@@ -93,6 +101,7 @@ public class UnsafeLongArray implements LongArray {
@Override
public long get(long at) {
try {
return unsafe.getLong(segment.address() + at * JAVA_LONG.byteSize());
}
@@ -120,6 +129,7 @@ public class UnsafeLongArray implements LongArray {
@Override
public void set(long start, long end, LongBuffer buffer, int bufferStart) {
System.out.println("setA@"+ start + "#" + hashCode() + "-" + Thread.currentThread().threadId());
for (int i = 0; i < end - start; i++) {
unsafe.putLong(segment.address() + (start + i) * JAVA_LONG.byteSize(), buffer.get(bufferStart + i));
}

View File

@@ -0,0 +1,6 @@
package nu.marginalia.array.pool;
public enum BufferEvictionPolicy {
READ_ONCE,
CACHE
}

View File

@@ -0,0 +1,220 @@
package nu.marginalia.array.pool;
import nu.marginalia.ffi.LinuxSystemCalls;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
public class BufferPool implements AutoCloseable {
private static final Logger logger = LoggerFactory.getLogger(BufferPool.class);
private final MemoryPage[] pages;
private final long fileSize;
private final Arena arena;
private final int fd;
private final int pageSizeBytes;
private PoolLru poolLru;
private final AtomicInteger diskReadCount = new AtomicInteger();
private final AtomicInteger cacheReadCount = new AtomicInteger();
private volatile boolean running = true;
/** Disassociate all buffers from their addresses, ensuring they will not be cacheable */
public synchronized void reset() {
for (var page : pages) {
page.pageAddress(-1);
}
try {
poolLru.stop();
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
poolLru = new PoolLru(pages);
}
public BufferPool(Path filename, int pageSizeBytes, int poolSize) {
this.fd = LinuxSystemCalls.openDirect(filename);
this.pageSizeBytes = pageSizeBytes;
try {
this.fileSize = Files.size(filename);
} catch (IOException e) {
throw new RuntimeException(e);
}
this.arena = Arena.ofShared();
this.pages = new MemoryPage[poolSize];
MemorySegment memoryArea = arena.allocate((long) pageSizeBytes*poolSize, 4096);
for (int i = 0; i < pages.length; i++) {
if (Boolean.getBoolean("system.noSunMiscUnsafe")) {
pages[i] = new SegmentMemoryPage(memoryArea.asSlice((long) i*pageSizeBytes, pageSizeBytes), i);
}
else {
pages[i] = new UnsafeMemoryPage(memoryArea.asSlice((long) i*pageSizeBytes, pageSizeBytes), i);
}
}
}
this.poolLru = new PoolLru(pages);
Thread.ofPlatform().start(() -> {
int diskReadOld = 0;
int cacheReadOld = 0;
while (running) {
try {
TimeUnit.SECONDS.sleep(30);
} catch (InterruptedException e) {
logger.info("Sleep interrupted", e);
break;
}
int diskRead = diskReadCount.get();
int cacheRead = cacheReadCount.get();
int heldCount = 0;
for (var page : pages) {
if (page.isHeld()) {
heldCount++;
}
}
if (diskRead != diskReadOld || cacheRead != cacheReadOld) {
logger.info("[#{}:{}] Disk/Cached: {}/{}, heldCount={}/{}, fqs={}, rcc={}", hashCode(), pageSizeBytes, diskRead, cacheRead, heldCount, pages.length, poolLru.getFreeQueueSize(), poolLru.getReclaimCycles());
}
}
});
}
public void close() {
running = false;
try {
poolLru.stop();
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
LinuxSystemCalls.closeFd(fd);
arena.close();
System.out.println("Disk read count: " + diskReadCount.get());
System.out.println("Cached read count: " + cacheReadCount.get());
}
@Nullable
public MemoryPage getExistingBufferForReading(long address) {
MemoryPage cachedBuffer = poolLru.get(address);
if (cachedBuffer != null && cachedBuffer.pageAddress() == address) {
// Try to acquire the page normally
if (cachedBuffer.acquireAsReader(address)) {
cacheReadCount.incrementAndGet();
return cachedBuffer;
}
if (cachedBuffer.pageAddress() != address)
return null;
// The page we are looking for is currently being written
waitForPageWrite(cachedBuffer);
if (cachedBuffer.acquireAsReader(address)) {
this.cacheReadCount.incrementAndGet();
return cachedBuffer;
}
}
return null;
}
public MemoryPage get(long address) {
// Look through available pages for the one we're looking for
MemoryPage buffer = getExistingBufferForReading(address);
if (buffer == null) {
buffer = read(address, true);
}
return buffer;
}
private MemoryPage read(long address, boolean acquire) {
// If the page is not available, read it from the caller's thread
if (address + pageSizeBytes > fileSize) {
throw new RuntimeException("Address " + address + " too large for page size " + pageSizeBytes + " and file size" + fileSize);
}
if ((address & 511) != 0) {
throw new RuntimeException("Address " + address + " not aligned");
}
MemoryPage buffer = acquireFreePage(address);
poolLru.register(buffer);
populateBuffer(buffer);
if (acquire) {
if (!buffer.pinCount().compareAndSet(-1, 1)) {
throw new IllegalStateException("Panic! Write lock was not held during write!");
}
}
else {
if (!buffer.pinCount().compareAndSet(-1, 0)) {
throw new IllegalStateException("Panic! Write lock was not held during write!");
}
}
diskReadCount.incrementAndGet();
return buffer;
}
private MemoryPage acquireFreePage(long address) {
for (;;) {
var free = poolLru.getFree();
if (free != null && free.acquireForWriting(address)) {
return free;
}
}
}
private void populateBuffer(MemoryPage buffer) {
if (getClass().desiredAssertionStatus()) {
buffer.getMemorySegment().set(ValueLayout.JAVA_INT, 0, 9999);
}
LinuxSystemCalls.readAt(fd, buffer.getMemorySegment(), buffer.pageAddress());
assert buffer.getMemorySegment().get(ValueLayout.JAVA_INT, 0) != 9999;
buffer.dirty(false);
if (buffer.pinCount().get() > 1) {
synchronized (buffer) {
buffer.notifyAll();
}
}
}
private void waitForPageWrite(MemoryPage page) {
if (!page.dirty()) {
return;
}
synchronized (page) {
while (page.dirty()) {
try {
page.wait(0, 1000);
}
catch (InterruptedException ex) {
throw new RuntimeException(ex);
}
}
}
}
}
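
A minimal usage sketch of the pool (the file name and sizes are illustrative assumptions; per the checks in read(), the address must be 512-byte aligned and fit within the file):

    try (var pool = new BufferPool(Path.of("/tmp/index.dat"), 8192, 256)) {
        try (MemoryPage page = pool.get(8192)) { // returned pinned as a reader
            long firstWord = page.getLong(0);    // read within the 8 KiB page
        }                                        // close() drops the pin; the page stays cached
    }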

View File

@@ -0,0 +1,32 @@
package nu.marginalia.array.pool;
import java.lang.foreign.MemorySegment;
import java.util.concurrent.atomic.AtomicInteger;
public interface MemoryPage extends AutoCloseable {
boolean isHeld();
MemorySegment getMemorySegment();
byte getByte(int offset);
int getInt(int offset);
long getLong(int offset);
int binarySearchLong(long key, int baseOffset, int fromIndex, int toIndex);
boolean acquireForWriting(long intendedAddress);
boolean acquireAsReader(long expectedAddress);
AtomicInteger pinCount();
void increaseClock(int val);
void touchClock(int val);
boolean decreaseClock();
long pageAddress();
void pageAddress(long address);
boolean dirty();
void dirty(boolean val);
void close();
}
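
The pin count doubles as the page's state word: 0 is free, -1 is held for writing, and positive values count readers. A sketch of the expected acquire/release discipline (tryRead and its -1 failure sentinel are hypothetical, not part of the interface):

    static long tryRead(MemoryPage page, long expectedAddress) {
        if (!page.acquireAsReader(expectedAddress)) // CAS pin 0..n -> n+1, re-checking the address
            return -1;                              // stand-in failure sentinel
        try {
            return page.getLong(0);
        }
        finally {
            page.close();                           // pin n+1 -> n; at 0 the page may be reclaimed
        }
    }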

View File

@@ -0,0 +1,186 @@
package nu.marginalia.array.pool;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.LinkedHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.LockSupport;
import java.util.concurrent.locks.StampedLock;
/** Clock-based LRU for pool buffers; a background reclaim thread
 * sweeps the pages and keeps the free queue topped up
 * */
public class PoolLru {
private static final Logger logger = LoggerFactory.getLogger(PoolLru.class);
private final int maxSize;
private final LinkedHashMap<Long, MemoryPage> backingMap;
private final MemoryPage[] pages;
private final int[] freeQueue;
private volatile long reclaimCycles;
private final AtomicLong clockWriteIdx;
private final AtomicLong clockReadIdx;
private final StampedLock lock = new StampedLock();
private final Thread reclaimThread;
private volatile boolean running = true;
public PoolLru(MemoryPage[] pages) {
backingMap = new LinkedHashMap<>(pages.length, 0.75f);
this.pages = pages;
// Pre-assign all entries with nonsense memory locations
for (int i = 0; i < pages.length; i++) {
backingMap.put(-i-1L, pages[i]);
}
maxSize = backingMap.size();
freeQueue = new int[pages.length];
for (int i = 0; i < freeQueue.length; i++) {
freeQueue[i] = i;
}
clockReadIdx = new AtomicLong();
clockWriteIdx = new AtomicLong(freeQueue.length);
reclaimThread = Thread.ofPlatform().start(this::reclaimThread);
}
public void stop() throws InterruptedException {
running = false;
reclaimThread.interrupt();
reclaimThread.join();
}
/** Attempt to get a buffer already associated with the address */
public MemoryPage get(long address) {
var res = getAssociatedItem(address);
if (res != null) {
res.increaseClock(1);
}
return res;
}
private MemoryPage getAssociatedItem(long address) {
long stamp = lock.tryOptimisticRead();
MemoryPage res = backingMap.get(address);
if (lock.validate(stamp)) {
return res;
}
stamp = lock.readLock();
try {
return backingMap.get(address);
}
finally {
lock.unlockRead(stamp);
}
}
/** Associate the buffer with an address */
public void register(MemoryPage buffer) {
long stamp = lock.writeLock();
try {
backingMap.put(buffer.pageAddress(), buffer);
buffer.touchClock(1);
// Evict the oldest entry if we've exceeded the max size
while (backingMap.size() >= maxSize) {
backingMap.pollFirstEntry();
}
}
finally {
lock.unlockWrite(stamp);
}
}
public void deregister(MemoryPage buffer) {
long stamp = lock.writeLock();
try {
backingMap.remove(buffer.pageAddress(), buffer);
}
finally {
lock.unlockWrite(stamp);
}
}
/** Attempt to get a free buffer from the pool
 *
 * @return An unheld buffer; blocks until one becomes available
 * */
public MemoryPage getFree() {
for (;;) {
var readIdx = clockReadIdx.get();
var writeIdx = clockWriteIdx.get();
if (writeIdx - readIdx == freeQueue.length / 4) {
LockSupport.unpark(reclaimThread);
} else if (readIdx == writeIdx) {
LockSupport.unpark(reclaimThread);
synchronized (this) {
try {
wait(0, 1000);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}
continue;
}
if (clockReadIdx.compareAndSet(readIdx, readIdx + 1)) {
return pages[freeQueue[(int) (readIdx % freeQueue.length)]];
}
}
}
private void reclaimThread() {
int pageIdx = 0;
while (running && !Thread.interrupted()) {
long readIdx = clockReadIdx.get();
long writeIdx = clockWriteIdx.get();
int queueSize = (int) (writeIdx - readIdx);
int targetQueueSize = freeQueue.length / 2;
if (queueSize >= targetQueueSize) {
LockSupport.parkNanos(100_000);
continue;
}
int toClaim = targetQueueSize - queueSize;
if (toClaim == 0)
continue;
++reclaimCycles;
do {
if (++pageIdx >= pages.length) {
pageIdx = 0;
}
var currentPage = pages[pageIdx];
if (currentPage.decreaseClock()) {
if (!currentPage.isHeld()) {
freeQueue[(int) (clockWriteIdx.getAndIncrement() % freeQueue.length)] = pageIdx;
deregister(pages[pageIdx]);
toClaim--;
}
else {
currentPage.touchClock(1);
}
}
} while (running && toClaim >= 0);
synchronized (this) {
notifyAll();
}
}
}
public int getFreeQueueSize() {
return (int) (clockWriteIdx.get() - clockReadIdx.get());
}
public long getReclaimCycles() {
return reclaimCycles;
}
}
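
The reclaim thread amounts to a second-chance (clock) sweep: get() heats a page's clock on every hit, decreaseClock() cools it by one, and only cold, unpinned pages go onto the free queue. The policy in miniature (a standalone illustration with plain arrays, not the pool's actual code):

    static int sweepForVictim(int[] clock, boolean[] held, int hand) {
        for (;;) {
            if (clock[hand] == 0 && !held[hand]) return hand; // cold and unpinned: reclaim it
            if (clock[hand] > 0) clock[hand]--;               // hot: cool it, give a second chance
            hand = (hand + 1) % clock.length;
        }
    }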

View File

@@ -0,0 +1,163 @@
package nu.marginalia.array.pool;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.util.concurrent.atomic.AtomicInteger;
/** MemoryPage implementation backed by the MemorySegment API,
 * used when sun.misc.Unsafe is unavailable.
 * */
@SuppressWarnings("preview")
public class SegmentMemoryPage implements MemoryPage, AutoCloseable {
private final MemorySegment segment;
public final int ord;
private volatile long pageAddress = -1;
private volatile boolean dirty = false;
/** Pin count is used as a read-write condition.
* <p></p>
* When the pin count is 0, the page is free.
* When it is -1, it is held for writing.
* When it is greater than 0, it is held for reading.
*/
private final AtomicInteger pinCount = new AtomicInteger(0);
private final AtomicInteger clock = new AtomicInteger();
public SegmentMemoryPage(MemorySegment segment, int ord) {
this.segment = segment;
this.ord = ord;
}
public int hashCode() {
return (int) segment.address();
}
public boolean equals(Object obj) {
return obj == this;
}
@Override
public void increaseClock(int val) {
clock.addAndGet(val);
}
@Override
public void touchClock(int val) {
clock.set(val);
}
@Override
public boolean decreaseClock() {
for (;;) {
int cv = clock.get();
if (cv == 0)
return true;
if (clock.compareAndSet(cv, cv-1)) {
return cv == 1;
}
}
}
@Override
public long pageAddress() {
return pageAddress;
}
@Override
public void pageAddress(long address) {
this.pageAddress = address;
}
@Override
public AtomicInteger pinCount() {
return pinCount;
}
@Override
public boolean dirty() {
return dirty;
}
@Override
public void dirty(boolean val) {
this.dirty = val;
}
@Override
public boolean isHeld() {
return 0 != this.pinCount.get();
}
@Override
public byte getByte(int offset) {
return segment.get(ValueLayout.JAVA_BYTE, offset);
}
@Override
public int getInt(int offset) {
return segment.get(ValueLayout.JAVA_INT, offset);
}
@Override
public long getLong(int offset) {
return segment.get(ValueLayout.JAVA_LONG, offset);
}
@Override
public int binarySearchLong(long key, int baseOffset, int fromIndex, int toIndex) {
int low = 0;
int len = toIndex - fromIndex;
while (len > 0) {
var half = len / 2;
long val = getLong(baseOffset + 8 * (fromIndex + low + half));
if (val < key) {
low += len - half;
} else if (val == key) {
low += half;
break;
}
len = half;
}
return fromIndex + low;
}
@Override
public boolean acquireForWriting(long intendedAddress) {
if (pinCount.compareAndSet(0, -1)) {
pageAddress = intendedAddress;
dirty = true;
return true;
}
return false;
}
@Override
public boolean acquireAsReader(long expectedAddress) {
int pinCountVal;
while ((pinCountVal = pinCount.get()) >= 0) {
if (pinCount.compareAndSet(pinCountVal, pinCountVal+1)) {
if (pageAddress != expectedAddress) {
pinCount.decrementAndGet();
return false;
}
return true;
}
}
return false;
}
/** Close releases this caller's pin; the buffer becomes reclaimable by the pool once no readers remain, but is not deallocated */
@Override
public void close() {
pinCount.decrementAndGet();
}
@Override
public MemorySegment getMemorySegment() {
return segment;
}
}

View File

@@ -0,0 +1,167 @@
package nu.marginalia.array.pool;
import nu.marginalia.array.page.UnsafeProvider;
import sun.misc.Unsafe;
import java.lang.foreign.MemorySegment;
import java.util.concurrent.atomic.AtomicInteger;
/** Variant of SegmentMemoryPage that uses sun.misc.Unsafe to access the memory.
 * */
@SuppressWarnings("preview")
public class UnsafeMemoryPage implements MemoryPage, AutoCloseable {
private static final Unsafe unsafe = UnsafeProvider.getUnsafe();
private final MemorySegment segment;
public final int ord;
private volatile long pageAddress = -1;
private volatile boolean dirty = false;
/** Pin count is used as a read-write condition.
* <p></p>
* When the pin count is 0, the page is free.
* When it is -1, it is held for writing.
* When it is greater than 0, it is held for reading.
*/
private final AtomicInteger pinCount = new AtomicInteger(0);
private final AtomicInteger clock = new AtomicInteger();
public UnsafeMemoryPage(MemorySegment segment, int ord) {
this.segment = segment;
this.ord = ord;
}
public int hashCode() {
return (int) segment.address();
}
public boolean equals(Object obj) {
return obj == this;
}
public void increaseClock(int val) {
clock.addAndGet(val);
}
public void touchClock(int val) {
clock.set(val);
}
public boolean decreaseClock() {
for (;;) {
int cv = clock.get();
if (cv == 0)
return true;
if (clock.compareAndSet(cv, cv-1)) {
return cv == 1;
}
}
}
@Override
public long pageAddress() {
return pageAddress;
}
@Override
public void pageAddress(long address) {
this.pageAddress = address;
}
@Override
public AtomicInteger pinCount() {
return pinCount;
}
@Override
public boolean dirty() {
return dirty;
}
@Override
public void dirty(boolean val) {
this.dirty = val;
}
@Override
public boolean isHeld() {
return 0 != this.pinCount.get();
}
public byte getByte(int offset) {
assert offset >= 0;
assert offset + 1 <= segment.byteSize();
return unsafe.getByte(segment.address() + offset);
}
public int getInt(int offset) {
assert offset >= 0;
assert offset + 4 <= segment.byteSize();
return unsafe.getInt(segment.address() + offset);
}
public long getLong(int offset) {
assert offset >= 0;
assert offset + 8 <= segment.byteSize();
return unsafe.getLong(segment.address() + offset);
}
public int binarySearchLong(long key, int baseOffset, int fromIndex, int toIndex) {
int low = 0;
int len = toIndex - fromIndex;
while (len > 0) {
var half = len / 2;
long val = getLong(baseOffset + 8 * (fromIndex + low + half));
if (val < key) {
low += len - half;
}
else if (val == key) {
low += half;
break;
}
len = half;
}
return fromIndex + low;
}
@Override
public boolean acquireForWriting(long intendedAddress) {
if (pinCount.compareAndSet(0, -1)) {
pageAddress = intendedAddress;
dirty = true;
return true;
}
return false;
}
@Override
public boolean acquireAsReader(long expectedAddress) {
int pinCountVal;
while ((pinCountVal = pinCount.get()) >= 0) {
if (pinCount.compareAndSet(pinCountVal, pinCountVal+1)) {
if (pageAddress != expectedAddress) {
pinCount.decrementAndGet();
return false;
}
return true;
}
}
return false;
}
/** Close releases this caller's pin; the buffer becomes reclaimable by the pool once no readers remain, but is not deallocated */
@Override
public void close() {
pinCount.decrementAndGet();
}
@Override
public MemorySegment getMemorySegment() {
return segment;
}
}

View File

@@ -1,8 +1,8 @@
package nu.marginalia.array.page;
-import nu.marginalia.NativeAlgos;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.algo.LongArraySort;
+import nu.marginalia.ffi.NativeAlgos;
import org.openjdk.jmh.annotations.*;
import java.lang.foreign.Arena;

View File

@@ -0,0 +1,72 @@
package nu.marginalia;
import nu.marginalia.array.DirectFileReader;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.ffi.LinuxSystemCalls;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.nio.file.Path;
public class NativeAlgosTest {
@Test
public void test() throws IOException {
LongArray array = LongArrayFactory.mmapForWritingShared(Path.of("/tmp/test"), 1024);
for (int i = 0; i < 1024; i++) {
array.set(i, i);
}
array.close();
var ms = Arena.global().allocate(512, 8);
int fd = LinuxSystemCalls.openDirect(Path.of("/tmp/test"));
int ret = LinuxSystemCalls.readAt(fd, ms, 512);
System.out.println(ret);
System.out.println(ms.byteSize());
LinuxSystemCalls.closeFd(fd);
var array2 = LongArrayFactory.wrap(ms);
for (int i = 0; i < array2.size(); i++) {
System.out.println(i + ": " + array2.get(i));
}
}
@Test
void testDirectFileReader() throws IOException {
LongArray array = LongArrayFactory.mmapForWritingShared(Path.of("/tmp/test"), 1024);
for (int i = 0; i < 1024; i++) {
array.set(i, i);
}
array.close();
try (var dfr = new DirectFileReader(Path.of("/tmp/test"))) {
LongArray array2 = LongArrayFactory.onHeapConfined(64);
dfr.readAligned(array2, 0);
for (int i = 0; i < array2.size(); i++) {
System.out.println(i + ": " + array2.get(i));
}
}
var alignedBuffer = Arena.ofAuto().allocate(4096, 4096);
try (var dfr = new DirectFileReader(Path.of("/tmp/test"))) {
MemorySegment dest = Arena.ofAuto().allocate(504, 1);
dfr.readUnaligned(dest, alignedBuffer, 8);
for (int i = 0; i < dest.byteSize(); i+=8) {
System.out.println(i + ": " + dest.get(ValueLayout.JAVA_LONG, i));
}
dfr.readUnaligned(dest, alignedBuffer, 4000);
for (int i = 0; i < dest.byteSize(); i+=8) {
System.out.println(i + ": " + dest.get(ValueLayout.JAVA_LONG, i));
}
}
}
}

View File

@@ -0,0 +1,95 @@
package nu.marginalia.array;
import nu.marginalia.ffi.LinuxSystemCalls;
import nu.marginalia.uring.UringFileReader;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.nio.file.Path;
import java.util.List;
public class NativeAlgosTest {
@Test
public void test() throws IOException {
LongArray array = LongArrayFactory.mmapForWritingShared(Path.of("/tmp/test"), 1024);
for (int i = 0; i < 1024; i++) {
array.set(i, i);
}
array.close();
var ms = Arena.global().allocate(512, 8);
int fd = LinuxSystemCalls.openDirect(Path.of("/tmp/test"));
int ret = LinuxSystemCalls.readAt(fd, ms, 512);
System.out.println(ret);
System.out.println(ms.byteSize());
LinuxSystemCalls.closeFd(fd);
var array2 = LongArrayFactory.wrap(ms);
for (int i = 0; i < array2.size(); i++) {
System.out.println(i + ": " + array2.get(i));
}
}
@Test
void testDirectFileReader() throws IOException {
LongArray array = LongArrayFactory.mmapForWritingShared(Path.of("/tmp/test"), 1024);
for (int i = 0; i < 1024; i++) {
array.set(i, i);
}
array.close();
try (var dfr = new DirectFileReader(Path.of("/tmp/test"))) {
LongArray array2 = LongArrayFactory.onHeapConfined(64);
dfr.readAligned(array2, 0);
for (int i = 0; i < array2.size(); i++) {
System.out.println(i + ": " + array2.get(i));
}
}
var alignedBuffer = Arena.ofAuto().allocate(4096, 4096);
try (var dfr = new DirectFileReader(Path.of("/tmp/test"))) {
MemorySegment dest = Arena.ofAuto().allocate(504, 1);
dfr.readUnaligned(dest, alignedBuffer, 8);
for (int i = 0; i < dest.byteSize(); i+=8) {
System.out.println(i + ": " + dest.get(ValueLayout.JAVA_LONG, i));
}
dfr.readUnaligned(dest, alignedBuffer, 4000);
for (int i = 0; i < dest.byteSize(); i+=8) {
System.out.println(i + ": " + dest.get(ValueLayout.JAVA_LONG, i));
}
}
}
@Test
void testAioFileReader() throws IOException {
LongArray array = LongArrayFactory.mmapForWritingShared(Path.of("/tmp/test"), 1024);
for (int i = 0; i < 1024; i++) {
array.set(i, i);
}
array.close();
try (var dfr = new UringFileReader(Path.of("/tmp/test"), false)) {
MemorySegment buf1 = Arena.ofAuto().allocate(32, 8);
MemorySegment buf2 = Arena.ofAuto().allocate(16, 8);
dfr.read(List.of(buf1, buf2), List.of(0L, 8L));
for (int i = 0; i < buf1.byteSize(); i+=8) {
System.out.println(buf1.get(ValueLayout.JAVA_LONG, i));
}
for (int i = 0; i < buf2.byteSize(); i+=8) {
System.out.println(buf2.get(ValueLayout.JAVA_LONG, i));
}
}
}
}

View File

@@ -32,9 +32,21 @@ class LongArraySearchTest {
@Test
public void testEmptyRange() {
assertTrue(segmentArray.binarySearchN(2, 0, 0, 0) <= 0);
assertTrue(segmentArray.binarySearch(0, 0, 0) <= 0);
}
@Test
public void testBinarySearchNCase() {
try (var array = LongArrayFactory.onHeapConfined(1024)) {
for (int i = 0; i < 64; i++) {
array.set(2*i, 3*i);
array.set(2*i+1, i);
System.out.println(i + ":" + array.get(i));
}
assertEquals(2, array.binarySearchN(2, 3, 0, 64));
}
}
void binarySearchTester(LongArray array) {
for (int i = 0; i < array.size() * 3; i++) {

View File

@@ -0,0 +1,36 @@
package nu.marginalia.array.pool;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import static java.lang.foreign.ValueLayout.JAVA_LONG;
class UnsafeMemoryPageTest {
@Test
void binarySearchLong() {
MemorySegment ms = Arena.ofAuto().allocate(8 * 9);
ms.setAtIndex(JAVA_LONG, 0, 2260);
ms.setAtIndex(JAVA_LONG, 1, 2513);
ms.setAtIndex(JAVA_LONG, 2, 3531);
ms.setAtIndex(JAVA_LONG, 3, 4637);
ms.setAtIndex(JAVA_LONG, 4, 4975);
ms.setAtIndex(JAVA_LONG, 5, 6647);
ms.setAtIndex(JAVA_LONG, 6, 7179);
ms.setAtIndex(JAVA_LONG, 7, 7509);
ms.setAtIndex(JAVA_LONG, 8, 8000);
UnsafeMemoryPage page = new UnsafeMemoryPage(ms, 1);
Assertions.assertEquals(0, page.binarySearchLong(2260, 0, 0, 9));
Assertions.assertEquals(1, page.binarySearchLong(2513, 0, 0, 9));
Assertions.assertEquals(2, page.binarySearchLong(3531, 0, 0, 9));
Assertions.assertEquals(3, page.binarySearchLong(4637, 0, 0, 9));
Assertions.assertEquals(4, page.binarySearchLong(4975, 0, 0, 9));
Assertions.assertEquals(5, page.binarySearchLong(6647, 0, 0, 9));
Assertions.assertEquals(6, page.binarySearchLong(7179, 0, 0, 9));
Assertions.assertEquals(7, page.binarySearchLong(7509, 0, 0, 9));
Assertions.assertEquals(8, page.binarySearchLong(8000, 0, 0, 9));
}
}

View File

@@ -14,6 +14,8 @@ dependencies {
implementation project(':code:libraries:array')
implementation libs.bundles.slf4j
implementation libs.fastutil
implementation libs.notnull
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit

View File

@@ -187,11 +187,8 @@ public class BTreeReader {
/** Move the pointer to the next layer in the direction of the provided key */
public void walkTowardChild(long key) {
final long searchStart = layerOffsets[layer] + pointerOffset;
final long nextLayerOffset = index.binarySearch(key, searchStart, searchStart + ctx.pageSize()) - searchStart;
layer --;
maxValueInBlock = index.get(searchStart + nextLayerOffset);
pointerOffset = ctx.pageSize() * (pointerOffset + nextLayerOffset);
@@ -250,21 +247,20 @@ public class BTreeReader {
long dataIndex = findData(buffer.currentValue());
if (dataIndex >= 0) {
buffer.retainAndAdvance();
if (buffer.hasMore() && buffer.currentValue() <= maxValueInBlock) {
long relOffsetInBlock = dataIndex - pointerOffset * ctx.entrySize;
long remainingTotal = dataBlockEnd - dataIndex;
long remainingBlock = ctx.pageSize() - relOffsetInBlock; // >= 0
long searchEnd = dataIndex + min(remainingTotal, remainingBlock);
data.retainN(buffer, ctx.entrySize, maxValueInBlock, dataIndex, searchEnd);
}
}
else {
buffer.rejectAndAdvance();
}
if (buffer.hasMore() && buffer.currentValue() <= maxValueInBlock) {
long relOffsetInBlock = dataIndex - pointerOffset * ctx.entrySize;
long remainingTotal = dataBlockEnd - dataIndex;
long remainingBlock = ctx.pageSize() - relOffsetInBlock; // >= 0
long searchEnd = dataIndex + min(remainingTotal, remainingBlock);
data.retainN(buffer, ctx.entrySize, maxValueInBlock, dataIndex, searchEnd);
}
}
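
The retain scan is thus clamped to whichever limit comes first, the end of the current page or the end of the data block. For example, assuming a 512-entry page starting at entry 1024 (pointerOffset * entrySize), with dataIndex = 1500 and dataBlockEnd = 2000: relOffsetInBlock = 476, remainingBlock = 36 and remainingTotal = 500, so searchEnd = 1536, the page boundary.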

Some files were not shown because too many files have changed in this diff.