mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

Compare commits

247 Commits

Author SHA1 Message Date
Viktor Lofgren
edd453531e (index) Partition keyword lexicons by language 2025-09-04 17:24:48 +02:00
Viktor Lofgren
096496ada1 (refac) Fold ft-anchor-keywords into converting-process 2025-09-03 13:04:30 +02:00
Viktor Lofgren
8ca6209260 (refac) Fold ft-anchor-keywords into converting-process 2025-09-03 13:03:38 +02:00
Viktor Lofgren
673c65d3c9 (refac) Fold term-frequency-dict into language-processing 2025-09-03 12:59:10 +02:00
Viktor Lofgren
acb9ec7b15 (refac) Consistently use 'languageIsoCode' for the language field 2025-09-03 12:54:18 +02:00
Viktor Lofgren
47079e05db (index) Store language information in the index journal 2025-09-03 12:33:24 +02:00
Viktor Lofgren
c93056e77f (refac) Clean up index code 2025-09-03 09:51:57 +02:00
Viktor Lofgren
6f7530e807 (refac) Clean up index code 2025-09-02 18:53:58 +02:00
Viktor Lofgren
87ce4a1b52 (refac) Clean up index code 2025-09-02 17:52:38 +02:00
Viktor Lofgren
52194cbe7a (refac) Clean up index code 2025-09-02 17:44:42 +02:00
Viktor Lofgren
fd1ac03c78 (refac) Clean up index code 2025-09-02 17:30:19 +02:00
Viktor Lofgren
5e5b86efb4 (refac) Clean up index code 2025-09-02 17:24:30 +02:00
Viktor Lofgren
f332ec6191 (refac) Clean up index code 2025-09-02 13:13:10 +02:00
Viktor Lofgren
c25c1af437 (refac) Clean up index code 2025-09-02 13:04:05 +02:00
Viktor Lofgren
eb0c911b45 (refac) Clean up index code 2025-09-02 12:50:07 +02:00
Viktor Lofgren
1979870ce4 (refac) Merge index-forward, index-reverse, index/query into index
The project has too many submodules, and it's a bit of a headache to navigate.
2025-09-02 12:30:42 +02:00
Viktor Lofgren
0ba2ea38e1 (index) Move reverse index into a distinct package 2025-09-02 11:59:56 +02:00
Viktor Lofgren
d6cfbceeea (index) Use a configurable hasher in the index 2025-09-01 13:44:28 +02:00
Viktor Lofgren
e369d200cc (refac) Simplify index data model by merging SearchParameters, SearchTerms and ResultRankingContext into a new object called SearchContext
The previous design was difficult to reason about, as similar data was stored in several places and different functions wanted nearly identical (but not quite identical) context objects.

This is in preparation for making the keyword hash function configurable, as we want to focus all the code that hashes keywords into one place.
2025-09-01 13:17:11 +02:00
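
A minimal sketch of what such a consolidation could look like, assuming illustrative field names rather than the actual ones in the repository:

import java.util.List;
import java.util.function.ToLongFunction;

// Hypothetical sketch of a consolidated per-query context object. All keyword
// hashing funnels through one place, which is what makes the hash algorithm
// easy to swap out later.
public record SearchContext(
        long budgetMs,                        // remaining time budget for the query
        List<String> searchTerms,             // query terms to look up
        ToLongFunction<String> keywordHasher  // configurable keyword hash function
) {
    public long hashKeyword(String keyword) {
        return keywordHasher.applyAsLong(keyword);
    }
}
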
Viktor Lofgren
946d64c8da (index) Make hash algorithm selection configurable, writer-side 2025-09-01 12:03:01 +02:00
Viktor Lofgren
42f043a60f (API) Add language parameter to the APIs 2025-09-01 09:33:39 +02:00
Viktor Lofgren
2f3950e0d5 (language) Roll KeywordExtractor into LanguageDefinition 2025-08-29 10:55:48 +02:00
Viktor Lofgren
61d803869e (language) Add support for languages with no POS-tagging
Clean up previous commit a bit.
2025-08-29 10:55:48 +02:00
Viktor Lofgren
df6434d177 (language) Add support for languages with no POS-tagging
This disables a lot of the smart keyword extraction,
which is mostly a crutch for helping English and similar
large languages to find relevant search results.

Smaller languages, where a POS-tag model may not be available, are probably fine with this disabled, as the search engine can likely just rawdog the entire results list.
2025-08-29 10:55:48 +02:00
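
A rough sketch of how a converter might branch on model availability; class and method names here are hypothetical, not the project's actual API:

import java.util.List;
import java.util.stream.IntStream;

// Hypothetical sketch: when a language ships without a POS-tagging model,
// fall back to plain distinct tokens instead of grammar-aware extraction.
class KeywordExtractionSketch {
    interface PosTagger { String[] tag(List<String> tokens); }

    List<String> extractKeywords(List<String> tokens, PosTagger taggerOrNull) {
        if (taggerOrNull == null) {
            // no model available: every distinct token is a candidate keyword
            return tokens.stream().distinct().toList();
        }
        String[] tags = taggerOrNull.tag(tokens);
        // grammar-aware path, heavily simplified: keep nouns only
        return IntStream.range(0, tokens.size())
                .filter(i -> tags[i].startsWith("N"))
                .mapToObj(tokens::get)
                .distinct()
                .toList();
    }
}
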
Viktor Lofgren
59519ed7c4 (language) Adjust languages.xml 2025-08-29 10:55:47 +02:00
Viktor Lofgren
874fc2d250 (language) Remove debug logging junk 2025-08-29 10:55:47 +02:00
Viktor Lofgren
69e8ec0eef (language) Fix subject keywords matcher with better rules and correct logic 2025-08-29 10:55:47 +02:00
Viktor Lofgren
a7eb5f54e6 (language) Clean up PosPattern, add tests 2025-08-29 10:55:47 +02:00
Viktor Lofgren
b29ba3e228 (language) Integrate new configurable POS patterns with keyword matchers 2025-08-29 10:55:47 +02:00
Viktor Lofgren
5fa5029c60 (language) Clean up UI 2025-08-29 10:55:47 +02:00
Viktor Lofgren
4257f60f00 (keywords) Fix logic error causing misidentification of some keywords 2025-08-29 10:55:47 +02:00
Viktor Lofgren
ce221d3a0e (language) Integrate old keyword extraction logic with new test tool 2025-08-29 10:55:47 +02:00
Viktor Lofgren
f0741142a3 (refac) Move keyword extraction into language processing 2025-08-29 10:55:47 +02:00
Viktor Lofgren
0899e4d895 (language) First version of the language processing debug tool 2025-08-29 10:55:47 +02:00
Viktor Lofgren
bbf7c5a1cb (language) Fix RDRPosTagger back to working order and integrate with SentenceExtractor 2025-08-29 10:55:47 +02:00
Viktor Lofgren
686a40e69b (language) Update modelling 2025-08-29 10:55:47 +02:00
Viktor Lofgren
8af254f44f (language) Parse PosPattern tags 2025-08-29 10:55:47 +02:00
Viktor Lofgren
2c21bd9287 (language) Add logging for unknown POS tags in PosPattern 2025-08-29 10:55:47 +02:00
Viktor Lofgren
f9645e2f00 (language) Enhance PosPattern to support wildcard variants in pattern matching 2025-08-29 10:55:47 +02:00
Viktor Lofgren
81e311b558 (language) POS-patterns WIP 2025-08-29 10:55:47 +02:00
Viktor Lofgren
507c09146a (language) Add support for downloadable resources, parsing POS tag configuration tags 2025-08-29 10:55:47 +02:00
Viktor Lofgren
f682425594 (language) Basic test for LanguageConfiguration 2025-08-29 10:55:47 +02:00
Viktor Lofgren
de67006c4f (language) Initial integration of new language configuration utility 2025-08-29 10:55:47 +02:00
Viktor Lofgren
eea32bb7b4 (language) Very basic language.xml loading off classpath 2025-08-29 10:55:47 +02:00
Viktor Lofgren
e976940a4e (config) Move slf4j config files to common:config 2025-08-29 10:55:47 +02:00
Viktor Lofgren
b564b33028 (language) Initial embryo for language configuration 2025-08-29 10:55:47 +02:00
Viktor Lofgren
1cca16a58e (language) Simplify language filters 2025-08-29 10:55:47 +02:00
Viktor Lofgren
70b4ed6d81 (ldb) Pipe language information into LDB database 2025-08-29 10:55:47 +02:00
Viktor Lofgren
45dc6412c1 (converter) Add language column to slop tables 2025-08-29 10:55:47 +02:00
Viktor Lofgren
b3b95edcb5 (converter) Bypass some of the grammar processing in the keyword extraction depending on language selection 2025-08-29 10:55:47 +02:00
Viktor Lofgren
338d300e1a (converter) Clean up spans-handling
This code was unnecessarily difficult to follow with repeated packing and re-packing of the same data.
2025-08-29 10:55:47 +02:00
Viktor Lofgren
fa685bf1f4 (converter) Add Language field to ProcessedDocumentDetails 2025-08-29 10:55:47 +02:00
Viktor Lofgren
d79a3e2b2a (converter) Tag documents by language in the index as a keyword 2025-08-29 10:55:47 +02:00
Viktor Lofgren
854382b2be (language-filter) Experimentally permit Swedish results to pass through the language filter 2025-08-29 10:55:47 +02:00
Viktor Lofgren
8710adbc2a (build) Reduce log noise during tests 2025-08-29 10:55:32 +02:00
Viktor Lofgren
acdf7b4785 (build) Add test-logger plugin to get better feedback during test execution 2025-08-29 10:41:35 +02:00
Viktor Lofgren
b5d27c1406 (search) Improve unicode support in displayTitle and displaySummary 2025-08-23 13:59:41 +02:00
Viktor Lofgren
55eb7dc116 (search) Improve unicode support in displayTitle and displaySummary 2025-08-23 13:57:51 +02:00
Viktor Lofgren
f0e8bc8baf (search) Improve unicode support in displayTitle and displaySummary 2025-08-23 13:56:19 +02:00
Viktor Lofgren
91a6ad2337 (search) Improve unicode support in displayTitle and displaySummary 2025-08-23 13:54:48 +02:00
Viktor Lofgren
9a182b9ddb (search) Use ADVERTISEMENT flag instead of TRACKING_ADVERTISEMENT when choosing to flag a result as having ads 2025-08-21 13:08:25 +02:00
Viktor Lofgren
fefbcf15ce (site) Make discord link point to chat.marginalia.nu and let nginx deal with figuring out which discord link to redirect to 2025-08-21 12:46:37 +02:00
Viktor Lofgren
9a789bf62d (array) Fix broken test 2025-08-18 09:10:58 +02:00
Viktor Lofgren
0525303b68 (index) Add upper limit to span lengths
Apparently outliers exist that are larger than SHORT_MAX.  This is probably not interesting, so we'll truncate at 8192 for now.

Adding a logging statement to get more information about which spans these are, so we can address the root cause down the line.
2025-08-17 08:44:57 +02:00
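
A small sketch of the clamp-and-log behaviour described above; the constant and log wording are illustrative:

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Clamp outlier span lengths to an upper bound and log enough context to
// chase down the root cause later.
class SpanLengthClamp {
    private static final Logger logger = LoggerFactory.getLogger(SpanLengthClamp.class);
    private static final int MAX_SPAN_LENGTH = 8192;

    static int clamp(int spanLength, String spanType) {
        if (spanLength > MAX_SPAN_LENGTH) {
            logger.warn("Truncating span of type {} with length {}", spanType, spanLength);
            return MAX_SPAN_LENGTH;
        }
        return spanLength;
    }
}
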
Viktor Lofgren
6953d65de5 (native) Register fixed fds for a nice io_uring speed boost 2025-08-16 13:48:11 +02:00
Viktor Lofgren
a7a18ced2e (native) Register fixed fds for a nice io_uring speed boost 2025-08-16 13:46:39 +02:00
Viktor Lofgren
7c94c941b2 (build) Correct rare scenario where root blocks could be generated with a negative size 2025-08-16 11:27:36 +02:00
Viktor Lofgren
ea99b62356 (build) Fix missing junit engine version 2025-08-16 11:01:32 +02:00
Viktor Lofgren
3dc21d34d8 (skiplist) Fix stability of getData fuzz test 2025-08-15 09:17:48 +02:00
Viktor Lofgren
51912e0176 (index) Tweak default values for IndexQueryExecution 2025-08-15 08:07:00 +02:00
Viktor Lofgren
de1b4d5372 (index) Make metrics make more sense by normalizing them by query budget 2025-08-15 03:16:22 +02:00
Viktor Lofgren
50ac926060 (index) Make metrics make more sense by normalizing them by query budget 2025-08-15 03:11:57 +02:00
Viktor Lofgren
d711ee75b5 (index) Add performance metrics 2025-08-15 00:48:52 +02:00
Viktor Lofgren
291ff0c4de (deps) Upgrade crawler commons to fix robots.txt-parser bug 2025-08-15 00:13:15 +02:00
Viktor
2fd2710355 Merge pull request #218 from MarginaliaSearch/o_direct_index
Replace document index btrees with a block-based skiplist, get rid of mmap and use O_DIRECT pread instead, use io_uring for positions reads
2025-08-14 23:57:09 +02:00
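
The readers in this pull request go through native io_uring helpers, but the O_DIRECT side can be approximated in plain Java; a sketch assuming a 4096-byte block size, not the repository's actual reader:

import com.sun.nio.file.ExtendedOpenOption;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

// O_DIRECT requires block-aligned buffers, offsets and lengths, which is why
// the on-disk data wants to be block aligned as well.
class DirectReadSketch {
    private static final int BLOCK_SIZE = 4096;

    static ByteBuffer readBlock(Path file, long blockIndex) throws IOException {
        try (FileChannel ch = FileChannel.open(file,
                StandardOpenOption.READ, ExtendedOpenOption.DIRECT)) {
            // over-allocate, then take a slice aligned to the block size
            ByteBuffer buf = ByteBuffer.allocateDirect(2 * BLOCK_SIZE)
                                       .alignedSlice(BLOCK_SIZE);
            ch.read(buf, blockIndex * (long) BLOCK_SIZE);
            return buf.flip();
        }
    }
}
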
Viktor Lofgren
e3b957063d (native) Add fallbacks and configuration options for building on systems lacking liburing 2025-08-14 23:36:13 +02:00
Viktor Lofgren
aee262e5f6 (index) Safeguard against arena-leaks during exceptions
The GC would catch these eventually, but it's nice to clean up after ourselves in a timely manner.
2025-08-14 19:28:31 +02:00
Viktor Lofgren
4a98a3c711 (skiplist) Move to a separate directory instead of in the btree module 2025-08-14 01:09:46 +02:00
Viktor Lofgren
68f52ca350 (test) Fix tests that works on my machine (TM) 2025-08-14 00:59:58 +02:00
Viktor Lofgren
2a2d951c2f (index) Fix unhinged default values for index.preparationThreads 2025-08-14 00:54:35 +02:00
Viktor Lofgren
379a1be074 (index) Add better timeout handling in UringQueue, fix slow memory leak on timeout exception 2025-08-14 00:52:50 +02:00
Viktor Lofgren
827aadafcd (uring) Reintroduce auto-slicing of excessively long read batches 2025-08-13 14:33:35 +02:00
Viktor Lofgren
aa7679d6ce (pool) Fix bug in exceptionally rare edge case leading to incorrect reads 2025-08-13 14:28:50 +02:00
Viktor Lofgren
6fe6de766d (pool) Fix SegmentMemoryPage storage 2025-08-13 13:17:14 +02:00
Viktor Lofgren
4245ac4c07 (doc) Update docs to reflect that we now need io_uring 2025-08-12 15:12:54 +02:00
Viktor Lofgren
1c49a0f5ad (index) Add system properties for toggling O_DIRECT mode for positions and spans 2025-08-12 15:11:13 +02:00
Viktor Lofgren
9a6e5f646d (docker) Add security_opt: seccomp:unconfined to docker-compose files
This is needed to access io_uring via docker.
2025-08-12 15:10:26 +02:00
Viktor Lofgren
fa92994a31 (uring) Fall back to simple I/O planning behavior when buffered mode is selected in UringFileReader 2025-08-11 23:44:38 +02:00
Viktor Lofgren
bc49406881 (build) Compatibility hack debian server 2025-08-11 23:26:53 +02:00
Viktor Lofgren
90325be447 (minor) Fix comments 2025-08-11 23:19:53 +02:00
Viktor Lofgren
dc89587af3 (index) Improve disk locality of the positions data 2025-08-11 21:17:12 +02:00
Viktor Lofgren
7b552afd6b (index) Improve disk locality of the positions data 2025-08-11 20:59:11 +02:00
Viktor Lofgren
73557edc67 (index) Improve disk locality of the positions data 2025-08-11 20:57:32 +02:00
Viktor Lofgren
83919e448a (index) Use O_DIRECT buffered reads for spans 2025-08-11 18:04:25 +02:00
Viktor Lofgren
6f5b75b84d (cleanup) Remove accidentally committed print stmt 2025-08-11 18:04:25 +02:00
Viktor Lofgren
db315e2813 (index) Use O_DIRECT position reads 2025-08-11 18:04:25 +02:00
Viktor Lofgren
e9977e08b7 (index) Block-align positions data
This will make reads more efficient, and possibly pave way for O_DIRECT reads of this data
2025-08-11 14:36:45 +02:00
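
A sketch of the alignment idea, assuming a 4096-byte block size and an illustrative padding strategy:

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;

// Pad the file out to the next block boundary after each record so that an
// O_DIRECT reader never has to fetch a block that only contains the tail of
// an unrelated record.
class BlockAlignedWriter {
    private static final int BLOCK_SIZE = 4096;

    static void writeAligned(FileChannel out, ByteBuffer record) throws IOException {
        out.write(record);
        int slack = (int) (out.position() % BLOCK_SIZE);
        if (slack != 0) {
            out.write(ByteBuffer.allocate(BLOCK_SIZE - slack)); // zero-fill padding
        }
    }
}
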
Viktor Lofgren
1df3757e5f (native) Clean up io_uring code and check in execution queue, currently unused but nifty 2025-08-11 13:54:05 +02:00
Viktor Lofgren
ca283f9684 (native) Clean up native helpers and break them into their own library 2025-08-10 20:55:34 +02:00
Viktor Lofgren
85360e61b2 (index) Grow span writer buffer size
Apparently outlier spans can grow considerably large.
2025-08-10 17:20:38 +02:00
Viktor Lofgren
e2ccff21bc (index) Wait until ranking is finished in query execution 2025-08-09 23:40:30 +02:00
Viktor Lofgren
c5b5b0c699 (index) Permit fast termination of rejection filter execution 2025-08-09 23:36:59 +02:00
Viktor Lofgren
9a65946e22 (uring) Reduce queue size to 2048 to avoid ENOMEM on systems with default ulimits 2025-08-09 20:41:24 +02:00
Viktor Lofgren
1d2ab21e27 (index) Aggregate termdata reads into a single io_uring operation instead of one for each term 2025-08-09 17:43:18 +02:00
Viktor Lofgren
0610cc19ad (index) Fix double close errors 2025-08-09 17:05:38 +02:00
Viktor Lofgren
a676306a7f (skiplist) Fix bugs in seek operations 2025-08-09 17:00:27 +02:00
Viktor Lofgren
8d68cd14fb (skiplist) Even more aggressive forward pointers 2025-08-09 16:11:41 +02:00
Viktor Lofgren
4773c5a52b (index) Backport some changes made during performance evaluations 2025-08-09 15:19:41 +02:00
Viktor Lofgren
74bd562ae4 (index) Move I/O to separate threads to hopefully reduce contention a bit 2025-08-09 15:19:41 +02:00
Viktor Lofgren
c9751287b0 (index) Boost the buffer size used in PrioIndexEntrySource 2025-08-09 01:46:12 +02:00
Viktor Lofgren
5da24e3fc4 (index) Segregate full and priority query ranking 2025-08-09 00:39:31 +02:00
Viktor Lofgren
20a4e86eec (index) Use a confined arena in IndexResultRankingService 2025-08-08 22:08:35 +02:00
Viktor Lofgren
477a184948 (experiment) Allow early termination of include conditions in lookups 2025-08-08 19:12:54 +02:00
Viktor Lofgren
8940ce99db (perf) More statistics in perf test 2025-08-08 18:57:25 +02:00
Viktor Lofgren
0ac0fa4dca (perf) More statistics in perf test 2025-08-08 18:56:17 +02:00
Viktor Lofgren
942f15ef14 (skiplist) Use a linear-quadratic forward pointer scheme instead of an exponential 2025-08-08 16:57:15 +02:00
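
An illustrative comparison of the two spacing schemes, with made-up constants:

// In a skip list, the k-th forward pointer of a block skips some number of
// blocks ahead. An exponential scheme doubles the stride at every level,
// while a linear-quadratic scheme grows more gently and keeps nearby skips
// denser.
class ForwardPointerSpacing {
    static long exponentialStride(int k) {
        return 1L << k;                   // 1, 2, 4, 8, 16, ...
    }

    static long linearQuadraticStride(int k, long a, long b) {
        return a * k + b * (long) k * k;  // e.g. a=1, b=1: 0, 2, 6, 12, 20, ...
    }
}
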
Viktor Lofgren
f668f33d5b (index) Tweaks and optimizations 2025-08-08 15:32:23 +02:00
Viktor Lofgren
6789975cd2 (index) Tweaks and optimizations 2025-08-08 15:30:48 +02:00
Viktor Lofgren
c3ba608776 (index) Split up evaluation tasks 2025-08-08 15:20:33 +02:00
Viktor Lofgren
733d2687fe (skiplist) Roll back the design change that segregated the values associated with documents into a separate file 2025-08-08 14:45:11 +02:00
Viktor Lofgren
f6daac8ed0 (index) MADVISE_RANDOM the index btrees 2025-08-07 21:14:28 +02:00
Viktor Lofgren
c2eeee4a06 (uring) Disable result set combination 2025-08-07 21:13:30 +02:00
Viktor Lofgren
3b0c701df4 (uring) Update uring timeout threshold 2025-08-07 20:13:25 +02:00
Viktor Lofgren
c6fb2db43b (index) Use a more SLA-aware execution scheduler 2025-08-07 20:13:15 +02:00
Viktor Lofgren
9bc8fe05ae (skiplist) Clean up search logic 2025-08-07 19:35:25 +02:00
Viktor Lofgren
440ffcf6f8 (skiplist) Fix bug in intersection-like algorithms 2025-08-07 02:18:14 +02:00
Viktor Lofgren
b07709cc72 (native) Disable expensive debug checks from uring code 2025-08-06 21:05:28 +02:00
Viktor Lofgren
9a6acdcbe0 (skiplist) Tag slow fuzz test as "slow" 2025-08-06 20:59:52 +02:00
Viktor Lofgren
23b9b0bf1b (index) Parametrize skip list block size and buffer pool sizes 2025-08-06 20:59:33 +02:00
Viktor Lofgren
749c8ed954 (pool) Correct buffer pool alignment 2025-08-06 20:56:34 +02:00
Viktor Lofgren
9f4b6939ca (skiplist) Fix condition for truncated block writing 2025-08-06 16:25:53 +02:00
Viktor Lofgren
1d08e44e8d (uring) Fadvise random access for uring buffered reads 2025-08-06 15:54:24 +02:00
Viktor Lofgren
fc2e156e78 (skiplist) Ensure docs file is a multiple BLOCK_SIZE bytes 2025-08-06 15:13:32 +02:00
Viktor Lofgren
5e68a89e9f (index) Improve error handling 2025-08-06 15:05:16 +02:00
Viktor Lofgren
d380661307 (index) Improve error handling 2025-08-06 14:31:06 +02:00
Viktor Lofgren
cccdf5c329 (pool) Check interrupt status in PoolLru's reclamation thread 2025-08-06 13:26:00 +02:00
Viktor Lofgren
f085b4ea12 (skiplist) Fix tests 2025-08-06 13:24:14 +02:00
Viktor Lofgren
e208f7d3ba (skiplist) Code cleanup and added validation 2025-08-06 12:55:04 +02:00
Viktor Lofgren
b577085cb2 (pool) Use one contiguous memory allocation to encourage a HugePage allocation and reduce TLB thrashing 2025-08-06 12:49:46 +02:00
Viktor Lofgren
b9240476f6 (pool) Use one contiguous memory allocation to encourage a HugePage allocation and reduce TLB thrashing 2025-08-06 12:48:14 +02:00
Viktor Lofgren
8f50f86d0b (index) Fix error handling 2025-08-05 22:19:23 +02:00
Viktor Lofgren
e3b7ead7a9 (skiplist) Fix aggressive forward pointering 2025-08-05 20:47:38 +02:00
Viktor Lofgren
9a845ba604 (skiplist) EXPERIMENTAL - Store data in a separate file from document ids 2025-08-05 19:10:58 +02:00
Viktor Lofgren
b9381f1603 (skiplist) EXPERIMENTAL - Store data in a separate file from document ids 2025-08-05 17:35:13 +02:00
Viktor Lofgren
6a60127267 (skiplist) EXPERIMENTAL - Store data in a separate file from document ids 2025-08-05 16:54:39 +02:00
Viktor Lofgren
e8ffcfbb19 (skiplist) Correct binary search implementation, fix intersection logic 2025-08-04 14:49:09 +02:00
Viktor Lofgren
caf0850f81 (index) Clean up code 2025-08-04 00:12:35 +02:00
Viktor Lofgren
62e3bb675e (btree) Remove O_DIRECT btree implementation 2025-08-03 23:43:31 +02:00
Viktor Lofgren
4dc3e7da7a (perf) Remove warmup from perf test, it's not doing much 2025-08-03 21:19:54 +02:00
Viktor Lofgren
92b09883ec (index) Switch from AIO to io_uring
Turns out AIO is just bad, especially with buffered I/O; io_uring performs strictly better in this scenario.
2025-08-03 21:19:54 +02:00
Viktor Lofgren
87082b4ef8 (index) Use AIO for reading spans and positions
This performs slightly worse in benchmarks, but that's likely caused by hitting the page cache.

AIO will tend to perform better when we see cache misses, which is the expected case in production on real-world data.
2025-08-03 21:19:54 +02:00
Viktor Lofgren
84d3f6087f (skiplist) Parametrize skip list block size, increase to 4K pages 2025-08-03 21:19:54 +02:00
Viktor Lofgren
f93ba371a5 (pool) Fix the LRU to not deadlock and be shit 2025-08-03 21:19:54 +02:00
Viktor Lofgren
5eec27c68d (pool) Fix for 32 bit rollover in clockHand for LRU 2025-08-03 21:19:54 +02:00
Viktor Lofgren
ab01576f91 (pool) Use one global buffer pool instead of many small ones, improved LRU with gclock reclamation, skip list optimization 2025-08-03 21:19:54 +02:00
Viktor Lofgren
054e5ccf44 (pool) Testing synchronized to see if I can find the deadlock 2025-08-03 21:19:54 +02:00
Viktor Lofgren
4351ea5128 (pool) Fix buffer leak 2025-08-03 21:19:54 +02:00
Viktor Lofgren
49cfa3a5e9 (pool) Decrease LQB size 2025-08-03 21:19:54 +02:00
Viktor Lofgren
683854b23f (pool) Fix logging 2025-08-03 21:19:54 +02:00
Viktor Lofgren
e880fa8945 (pool) Simplify locking in PoolLru 2025-08-03 21:19:54 +02:00
Viktor Lofgren
2482dc572e (pool) Grow free queue size 2025-08-03 21:19:54 +02:00
Viktor Lofgren
4589f11898 (pool) More stats 2025-08-03 21:19:54 +02:00
Viktor Lofgren
e43b6e610b (pool) Adjust pool reclamation strategy 2025-08-03 21:19:53 +02:00
Viktor Lofgren
4772117a1f (skiplist) First stab at a skiplist replacement for btrees in the documents lists 2025-08-03 21:19:53 +02:00
Viktor Lofgren
3fc7ea521c (pool) Remove readahead and simplify the code 2025-08-03 21:19:53 +02:00
Viktor Lofgren
4372f5af03 (pool) More performant LRU pool + better instructions queue 2025-08-03 21:19:53 +02:00
Viktor Lofgren
4ad89b6c75 (pool) More performant LRU pool 2025-08-03 21:19:53 +02:00
Viktor Lofgren
ad0519e031 (index) Optimizations 2025-08-03 21:19:53 +02:00
Viktor Lofgren
596ece1230 (pool) Fix deadlock during pool starvation 2025-08-03 21:19:53 +02:00
Viktor Lofgren
07b6e1585b (pool) Bump pool sizes 2025-08-03 21:19:53 +02:00
Viktor Lofgren
cb5e2778eb (pool) Align the buffers with 512b 2025-08-03 21:19:53 +02:00
Viktor Lofgren
8f5ea7896c (btree) More debug information on numEntries = 0 scenario 2025-08-03 21:19:53 +02:00
Viktor Lofgren
76c398e0b1 (index) Fix lingering issues with previous optimizations 2025-08-03 21:19:53 +02:00
Viktor Lofgren
4a94f04a8d (btree) Debug logging 2025-08-03 21:19:53 +02:00
Viktor Lofgren
df72f670d4 (btree) Fix queryData 2025-08-03 21:19:53 +02:00
Viktor Lofgren
eaa22c2f5a (*) Logging 2025-08-03 21:19:53 +02:00
Viktor Lofgren
7be173aeca (pool) Only dump statistics if they say anything 2025-08-03 21:19:53 +02:00
Viktor Lofgren
36685bdca7 (btree) Fix retain implementation 2025-08-03 21:19:53 +02:00
Viktor Lofgren
ad04057609 (btree) Add short circuits when retain/rejecting on an empty tree 2025-08-03 21:19:53 +02:00
Viktor Lofgren
eb76ae22e2 (perf) Use lqb size 512 in perf test 2025-08-03 21:19:53 +02:00
Viktor Lofgren
4b858ab341 (btree) Cache retain/reject reads 2025-08-03 21:19:53 +02:00
Viktor Lofgren
c6e3c8aa3b (index) Focus pools to try to increase reuse 2025-08-03 21:19:53 +02:00
Viktor Lofgren
9128d3907c (index) Periodically dump buffer metrics 2025-08-03 21:19:53 +02:00
Viktor Lofgren
4ef16d13d4 (index) O_DIRECT based buffer pool for index reads 2025-07-30 15:04:23 +02:00
Viktor Lofgren
838a5626ec (index) Reduce query buffer size 2025-07-27 21:42:04 +02:00
Viktor Lofgren
6b426209c7 (index) Restore threshold for work stealing in query execution 2025-07-27 21:41:46 +02:00
Viktor Lofgren
452b5731d9 (index) Lower threshold for work stealing in query execution 2025-07-27 21:35:11 +02:00
Viktor Lofgren
c91cf49630 (search) Disable scribe.rip substitution
It does not appear to work well
2025-07-27 19:40:58 +02:00
Viktor Lofgren
8503030f18 (search) Fix rare exception in scribe.rip substitution 2025-07-27 19:38:52 +02:00
Viktor Lofgren
744f7d3ef7 (search) Fix rare exception in scribe.rip substitution 2025-07-27 19:34:03 +02:00
Viktor Lofgren
215e12afe9 (index) Shrink query buffer size 2025-07-27 17:33:46 +02:00
Viktor Lofgren
2716bce918 (index) Adjust timeout logic for evaluation 2025-07-27 17:28:34 +02:00
Viktor Lofgren
caf2e6fbb7 (index) Adjust timeout logic for evaluation 2025-07-27 17:27:07 +02:00
Viktor Lofgren
233f0acfb1 (index) Further reduce query buffer size 2025-07-27 17:13:08 +02:00
Viktor Lofgren
e3a4ff02e9 (index) Abandon ongoing evaluation tasks if time is up 2025-07-27 17:04:01 +02:00
Viktor Lofgren
c786283ae1 (index) Reduce query buffer size 2025-07-27 16:57:55 +02:00
Viktor Lofgren
a3f65ac0e0 (deploy) Trigger index deployment 2025-07-27 16:50:23 +02:00
Viktor
aba1a32af0 Merge pull request #217 from MarginaliaSearch/uncompressed-spans-file
Index optimizations
2025-07-27 16:49:27 +02:00
Viktor Lofgren
c9c442345b (perf) Change execution test to use processing rate instead of count 2025-07-27 16:39:51 +02:00
Viktor Lofgren
2e126ba30e (perf) Change execution test to use processing rate instead of count 2025-07-27 16:37:20 +02:00
Viktor Lofgren
2087985f49 (index) Implement work stealing in IndexQueryExecution as a better approach to backpressure 2025-07-27 16:29:57 +02:00
Viktor Lofgren
2b13ebd18b (index) Tweak evaluation backlog handling 2025-07-27 16:08:16 +02:00
Viktor Lofgren
6d92c125fe (perf) Fix perf test 2025-07-27 15:50:28 +02:00
Viktor Lofgren
f638cfa39a (index) Avoid possibility of negative timeout 2025-07-27 15:39:12 +02:00
Viktor Lofgren
89447c12af (index) Avoid possibility of negative timeout 2025-07-27 15:24:47 +02:00
Viktor Lofgren
c71fc46f04 (perf) Update perf test with execution scenario 2025-07-27 15:22:07 +02:00
Viktor Lofgren
f96874d828 (sequence) Implement a largestValue abort condition for minDistance()
This is something like 3500% faster in certain common scenarios
2025-07-27 15:05:50 +02:00
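
A sketch of what such an abort condition can look like for a minimum-distance scan over two sorted position lists; this illustrates the idea rather than the repository's actual minDistance():

// Minimum absolute distance between any element of a and any element of b.
// Both arrays are assumed sorted ascending and non-empty.
class MinDistanceSketch {
    static int minDistance(int[] a, int[] b) {
        int best = Integer.MAX_VALUE;
        int largestB = b[b.length - 1];
        int i = 0, j = 0;
        while (i < a.length && j < b.length) {
            if (a[i] > largestB) {
                // abort: every remaining pair is at least this far apart
                best = Math.min(best, a[i] - largestB);
                break;
            }
            best = Math.min(best, Math.abs(a[i] - b[j]));
            if (a[i] < b[j]) i++; else j++;
        }
        return best;
    }
}
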
Viktor Lofgren
583a84d5a0 (index) Clean up of the index query execution logic 2025-07-27 15:05:50 +02:00
Viktor Lofgren
f65b946448 (index) Clean up code 2025-07-27 15:05:50 +02:00
Viktor Lofgren
3682815855 (index) Optimize sequence intersection for the n=1 case 2025-07-26 19:14:32 +02:00
Viktor Lofgren
3a94357660 (index) Perf test tool (WIP!) 2025-07-26 11:49:33 +02:00
Viktor Lofgren
673b0d3de1 (index) Perf test tool (WIP!) 2025-07-26 11:49:31 +02:00
Viktor Lofgren
ea942bc664 (spans) Add signature to the footer of the spans file, including a version byte so we can detect whether to use the old or new decoding logic 2025-07-25 12:07:18 +02:00
Viktor Lofgren
7ed5083c54 (index) Don't split results into chunks 2025-07-25 11:45:07 +02:00
Viktor Lofgren
08bb2c097b (refac) Clean up the data model used in the index service 2025-07-25 10:54:07 +02:00
Viktor Lofgren
495fb325be (sequence) Correct sequence intersection bug introduced in optimizations 2025-07-25 10:48:33 +02:00
Viktor Lofgren
05c25bbaec (chore) Clean up 2025-07-24 23:43:27 +02:00
Viktor Lofgren
2a028b84f3 (chore) Clean up 2025-07-24 20:12:56 +02:00
Viktor Lofgren
a091a23623 (ranking) Remove unnecessary metadata retrievals 2025-07-24 20:08:09 +02:00
Viktor Lofgren
e8897acb45 (ranking) Remove unnecessary metadata retrievals 2025-07-24 20:05:39 +02:00
Viktor Lofgren
b89ffcf2be (index) Evaluate hash based idx mapping in ForwardIndexReader 2025-07-24 19:47:27 +02:00
Viktor Lofgren
dbcc9055b0 (index) Evaluate using MinMaxPriorityQueue as guts of ResultPriorityQueue 2025-07-24 19:31:51 +02:00
Viktor Lofgren
d9740557f4 (sequence) Optimize intersection logic with a fast abort condition 2025-07-24 19:04:10 +02:00
Viktor Lofgren
0d6cd015fd (index) Evaluate reading all spans at once 2025-07-24 18:34:11 +02:00
Viktor Lofgren
c6034efcc8 (index) Cache value of bitset cardinality for speed 2025-07-24 17:24:55 +02:00
Viktor Lofgren
76068014ad (index) More spans optimizations 2025-07-24 15:03:43 +02:00
Viktor Lofgren
1c3ed67127 (index) Byte align document spans 2025-07-24 14:06:14 +02:00
Viktor Lofgren
fc0cb6bd9a (index) Reserve a larger size for IntArrayList in SeqenceOperations.findIntersections 2025-07-24 14:03:44 +02:00
Viktor Lofgren
c2601bac78 (converter) Remove unnecessary allocation of a 16 KB byte buffer 2025-07-24 13:25:37 +02:00
Viktor Lofgren
f5641b72e9 (index) Fix broken test 2025-07-24 13:21:05 +02:00
Viktor Lofgren
36efe2e219 (index) Optimize PositionsFileReader for concurrent reads
In benchmarks this is roughly twice as fast as the previous approach. The main caveat is that we need multiple file descriptors to avoid read instruction serialization by the kernel, which is undesirable since the reads are completely scattershot and can't be reordered by the kernel in a way that optimizes anything.
2025-07-24 13:20:54 +02:00
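
A sketch of the multiple-descriptor approach, spreading positional reads round-robin over several channels opened against the same file; the channel count and helper names are illustrative:

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.concurrent.atomic.AtomicInteger;

// Positional read() does not touch the channel's own position, so the same
// channel can be shared between threads; using several channels avoids the
// kernel serializing reads on a single descriptor.
class MultiChannelPositionsReader implements AutoCloseable {
    private final FileChannel[] channels;
    private final AtomicInteger next = new AtomicInteger();

    MultiChannelPositionsReader(Path file, int nChannels) throws IOException {
        channels = new FileChannel[nChannels];
        for (int i = 0; i < nChannels; i++) {
            channels[i] = FileChannel.open(file, StandardOpenOption.READ);
        }
    }

    byte[] read(long offset, int length) throws IOException {
        FileChannel ch = channels[Math.floorMod(next.getAndIncrement(), channels.length)];
        ByteBuffer buf = ByteBuffer.allocate(length);
        while (buf.hasRemaining()) {
            if (ch.read(buf, offset + buf.position()) < 0) break; // EOF
        }
        return buf.array();
    }

    @Override
    public void close() throws IOException {
        for (FileChannel ch : channels) ch.close();
    }
}
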
Viktor Lofgren
983fe3829e (spans) Evaluate uncompressed spans files
Span decompression appears to be somewhat of a performance bottleneck.  This change removes compression of the spans file.  The spans are still compressed in transit between the converter and index constructor at this stage.  The change is intentionally kept small to just evaluate the performance implications, change in file sizes, etc.
2025-07-23 18:10:41 +02:00
Viktor Lofgren
668c87aa86 (ssr) Drop Executor from SSR as it no longer exists 2025-07-23 13:55:41 +02:00
Viktor Lofgren
9d3f9adb05 Force redeploy of everything 2025-07-23 13:36:02 +02:00
Viktor
a43a1773f1 Merge pull request #216 from MarginaliaSearch/deprecate-executor
Architecture: Remove the separate executor service and roll it into the index service.
2025-07-23 13:32:42 +02:00
Viktor Lofgren
1e7a3a3c4f (docs) Update docs to reflect the change 2025-07-23 13:18:23 +02:00
Viktor Lofgren
62b696b1c3 (architecture) Remove the separate executor service and merge it into the index service
The primary motivation for this is that in production, the large number of partitioned services has led to intermittent exhaustion of available database connections, as each service has its own connection pool.

The decision to have a separate executor service dates back to when the index service was very slow to start and the executor didn't always spin off its memory-hungry tasks into separate processes, which meant the executor would sometimes OOM and crash; it was undesirable to bring the index down with it.
2025-07-23 12:57:13 +02:00
Viktor Lofgren
f1a900f383 (search) Clean up front page mobile design a bit 2025-07-23 12:20:40 +02:00
Viktor Lofgren
700364b86d (sample) Remove debug logging
The problem sat in the desk chair all along
2025-07-21 15:08:20 +02:00
Viktor Lofgren
7e725ddaed (sample) Remove debug logging
The problem sat in the desk chair all along
2025-07-21 14:41:59 +02:00
Viktor Lofgren
120209e138 (sample) Diagnosing compression errors 2025-07-21 14:34:08 +02:00
Viktor Lofgren
a771a5b6ce (sample) Test different approach to decoding 2025-07-21 14:19:01 +02:00
Viktor Lofgren
dac5b54128 (sample) Better logging for sample errors 2025-07-21 14:03:58 +02:00
Viktor Lofgren
6cfb143c15 (sample) Compress sample HTML data and introduce new API for only getting requests 2025-07-21 13:55:25 +02:00
Viktor Lofgren
23c818281b (converter) Reduce DomSample logging for NOT_FOUND 2025-07-21 13:37:55 +02:00
Viktor Lofgren
8aad253cf6 (converter) Add more logging around dom sample data retrieval errors 2025-07-21 13:26:38 +02:00
Viktor Lofgren
556d7af9dc Reapply "(grpc) Use grpc-netty instead of grpc-netty-shaded"
This reverts commit b7a5219ed3.
2025-07-21 13:23:32 +02:00
397 changed files with 10552 additions and 8038 deletions

View File

@@ -6,6 +6,7 @@ plugins {
// This is a workaround for a bug in the Jib plugin that causes it to stall randomly
// https://github.com/GoogleContainerTools/jib/issues/3347
id 'com.google.cloud.tools.jib' version '3.4.5' apply(false)
id 'com.adarshr.test-logger' version '4.0.0'
}
group 'marginalia'
@@ -31,7 +32,10 @@ subprojects.forEach {it ->
jvmArgs += ['--enable-preview']
}
it.tasks.withType(Test).configureEach {
jvmArgs += ['--enable-preview']
jvmArgs += ['--enable-preview',
'--enable-native-access=ALL-UNNAMED',
'--sun-misc-unsafe-memory-access=allow',
'-Dsystem.uringQueueCount=1']
}
// Enable reproducible builds for the entire project

View File

@@ -6,7 +6,6 @@ import com.google.inject.name.Named;
import gnu.trove.list.TLongList;
import nu.marginalia.linkdb.model.DocdbUrlDetail;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.UrlIdCodec;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -14,7 +13,6 @@ import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.sql.Connection;
import java.sql.DriverManager;
@@ -104,7 +102,7 @@ public class DocumentDbReader {
}
try (var stmt = connection.prepareStatement("""
SELECT ID, URL, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR
SELECT ID, URL, TITLE, DESCRIPTION, LANGUAGE, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR
FROM DOCUMENT WHERE ID = ?
""")) {
for (int i = 0; i < ids.size(); i++) {
@@ -118,6 +116,7 @@ public class DocumentDbReader {
url,
rs.getString("TITLE"),
rs.getString("DESCRIPTION"),
rs.getString("LANGUAGE"),
rs.getDouble("QUALITY"),
rs.getString("FORMAT"),
rs.getInt("FEATURES"),

View File

@@ -41,8 +41,8 @@ public class DocumentDbWriter {
public void add(List<DocdbUrlDetail> docdbUrlDetail) throws SQLException {
try (var stmt = connection.prepareStatement("""
INSERT OR IGNORE INTO DOCUMENT(ID, URL, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
INSERT OR IGNORE INTO DOCUMENT(ID, URL, TITLE, DESCRIPTION, LANGUAGE, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""")) {
int i = 0;
@@ -54,15 +54,16 @@ public class DocumentDbWriter {
stmt.setString(3, document.title());
stmt.setString(4, document.description());
stmt.setInt(5, document.wordsTotal());
stmt.setString(6, document.format());
stmt.setInt(7, document.features());
stmt.setLong(8, document.dataHash());
stmt.setDouble(9, document.urlQuality());
stmt.setString(5, document.language());
stmt.setInt(6, document.wordsTotal());
stmt.setString(7, document.format());
stmt.setInt(8, document.features());
stmt.setLong(9, document.dataHash());
stmt.setDouble(10, document.urlQuality());
if (document.pubYear() == null) {
stmt.setInt(10, 0);
stmt.setInt(11, 0);
} else {
stmt.setInt(10, document.pubYear());
stmt.setInt(11, document.pubYear());
}
stmt.addBatch();

View File

@@ -6,6 +6,7 @@ public record DocdbUrlDetail(long urlId,
EdgeUrl url,
String title,
String description,
String language,
double urlQuality,
String format,
int features,

View File

@@ -6,6 +6,7 @@ CREATE TABLE DOCUMENT (
STATE INT,
TITLE TEXT NOT NULL,
DESCRIPTION TEXT NOT NULL,
LANGUAGE TEXT NOT NULL,
WORDS_TOTAL INTEGER NOT NULL,
FORMAT TEXT NOT NULL,

View File

@@ -23,6 +23,7 @@ public class DocumentDbWriterTest {
new nu.marginalia.model.EdgeUrl("http", new EdgeDomain("example.com"), null, "/", null),
"Test",
"This is a test",
"en",
-4.,
"XHTML",
5,

View File

@@ -105,8 +105,6 @@ public enum HtmlFeature {
}
public int getFeatureBit() {
if (getClass().desiredAssertionStatus() && ordinal() >= 32)
throw new IllegalStateException("Attempting to extract feature bit of " + name() + ", with ordinal " + ordinal());
return (1<< ordinal());
}
}

View File

@@ -7,7 +7,6 @@ public enum ServiceId {
Search("search-service"),
Index("index-service"),
Query("query-service"),
Executor("executor-service"),
Control("control-service"),

View File

@@ -1,9 +1,9 @@
package nu.marginalia.service.server;
import io.grpc.Server;
import io.grpc.netty.shaded.io.grpc.netty.NettyServerBuilder;
import io.grpc.netty.shaded.io.netty.channel.nio.NioEventLoopGroup;
import io.grpc.netty.shaded.io.netty.channel.socket.nio.NioServerSocketChannel;
import io.grpc.netty.NettyServerBuilder;
import io.netty.channel.nio.NioEventLoopGroup;
import io.netty.channel.socket.nio.NioServerSocketChannel;
import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.discovery.property.ServiceKey;
import nu.marginalia.service.discovery.property.ServicePartition;

View File

@@ -189,7 +189,7 @@ public class ExecutorClient {
String uriPath = "/transfer/file/" + fileStorage.id();
String uriQuery = "path=" + URLEncoder.encode(path, StandardCharsets.UTF_8);
var endpoints = registry.getEndpoints(ServiceKey.forRest(ServiceId.Executor, fileStorage.node()));
var endpoints = registry.getEndpoints(ServiceKey.forRest(ServiceId.Index, fileStorage.node()));
if (endpoints.isEmpty()) {
throw new RuntimeException("No endpoints for node " + fileStorage.node());
}

View File

@@ -22,7 +22,6 @@ dependencies {
implementation project(':code:processes:ping-process')
implementation project(':code:processes:new-domain-process')
implementation project(':code:processes:converting-process')
implementation project(':code:processes:index-constructor-process')
implementation project(':code:common:config')
implementation project(':code:common:model')
@@ -34,7 +33,7 @@ dependencies {
implementation project(':third-party:commons-codec')
implementation project(':code:libraries:message-queue')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:libraries:language-processing')
implementation project(':code:functions:link-graph:api')
implementation project(':code:functions:live-capture:api')

View File

@@ -5,7 +5,6 @@ import com.google.inject.Singleton;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.ConverterMain;
import nu.marginalia.crawl.CrawlerMain;
import nu.marginalia.index.IndexConstructorMain;
import nu.marginalia.livecrawler.LiveCrawlerMain;
import nu.marginalia.loading.LoaderMain;
import nu.marginalia.ndp.NdpMain;
@@ -57,7 +56,7 @@ public class ProcessSpawnerService {
LIVE_CRAWLER(LiveCrawlerMain.class),
CONVERTER(ConverterMain.class),
LOADER(LoaderMain.class),
INDEX_CONSTRUCTOR(IndexConstructorMain.class),
INDEX_CONSTRUCTOR("nu.marginalia.index.IndexConstructorMain"),
NDP(NdpMain.class),
EXPORT_TASKS(ExportTasksMain.class),
;
@@ -66,6 +65,9 @@ public class ProcessSpawnerService {
ProcessId(Class<? extends ProcessMainClass> mainClass) {
this.mainClass = mainClass.getName();
}
ProcessId(String mainClassFullName) {
this.mainClass = mainClassFullName;
}
List<String> envOpts() {
String variable = switch (this) {

View File

@@ -1,4 +1,4 @@
package nu.marginalia.executor;
package nu.marginalia.svc;
import com.google.inject.Inject;
import nu.marginalia.storage.FileStorageService;

View File

@@ -1,5 +1,5 @@
The execution subsystem is responsible for the execution of long running tasks on each
index node. It lives in the [executor-service](../services-core/executor-service) module.
index node. It lives in the [index-service](../services-core/index-service) module.
It accomplishes this using the [message queue and actor library](../libraries/message-queue/),
which permits program state to survive crashes and reboots.

View File

@@ -1,4 +1,4 @@
package nu.marginalia.executor;
package nu.marginalia.svc;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;

View File

@@ -41,7 +41,22 @@ public class DomSampleClient {
}
catch (StatusRuntimeException sre) {
if (sre.getStatus() != Status.NOT_FOUND) {
logger.error("Failed to fetch DOM sample");
logger.error("Failed to fetch DOM sample", sre);
}
return Optional.empty();
}
}
public Optional<RpcDomainSampleRequests> getSampleRequests(String domainName) {
try {
var val = channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::getSampleRequests)
.run(RpcDomainName.newBuilder().setDomainName(domainName).build());
return Optional.of(val);
}
catch (StatusRuntimeException sre) {
if (sre.getStatus() != Status.NOT_FOUND) {
logger.error("Failed to fetch DOM sample", sre);
}
return Optional.empty();
}

View File

@@ -7,6 +7,7 @@ option java_multiple_files=true;
service DomSampleApi {
rpc getSample(RpcDomainName) returns (RpcDomainSample) {}
rpc getSampleRequests(RpcDomainName) returns (RpcDomainSampleRequests) {}
rpc hasSample(RpcDomainName) returns (RpcBooleanRsp) {}
rpc getAllSamples(RpcDomainName) returns (stream RpcDomainSample) {}
}
@@ -19,10 +20,16 @@ message RpcBooleanRsp {
bool answer = 1;
}
message RpcDomainSampleRequests {
string domainName = 1;
string url = 2;
repeated RpcOutgoingRequest outgoingRequests = 5;
}
message RpcDomainSample {
string domainName = 1;
string url = 2;
string htmlSample = 3;
bytes htmlSampleZstd = 3;
bool accepted_popover = 4;
repeated RpcOutgoingRequest outgoingRequests = 5;
}

View File

@@ -31,6 +31,7 @@ dependencies {
implementation libs.jsoup
implementation libs.opencsv
implementation libs.slop
implementation libs.zstd
implementation libs.sqlite
implementation libs.bundles.slf4j
implementation libs.commons.lang3

View File

@@ -1,6 +1,8 @@
package nu.marginalia.domsample;
import com.github.luben.zstd.Zstd;
import com.google.inject.Inject;
import com.google.protobuf.ByteString;
import io.grpc.Status;
import io.grpc.stub.StreamObserver;
import nu.marginalia.api.domsample.*;
@@ -9,6 +11,7 @@ import nu.marginalia.service.server.DiscoverableService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.charset.StandardCharsets;
import java.util.List;
public class DomSampleGrpcService
@@ -42,7 +45,36 @@ public class DomSampleGrpcService
}
// Grab the first sample
RpcDomainSample.Builder response = convert(dbRecords.getFirst());
RpcDomainSample.Builder response = convertFullSample(dbRecords.getFirst());
responseObserver.onNext(response.build());
responseObserver.onCompleted();
}
catch (Exception e) {
logger.error("Error in getSample()", e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@Override
public void getSampleRequests(RpcDomainName request, StreamObserver<RpcDomainSampleRequests> responseObserver) {
String domainName = request.getDomainName();
if (domainName.isBlank()) {
responseObserver.onError(Status.INVALID_ARGUMENT
.withDescription("Invalid domain name")
.asRuntimeException());
return;
}
try {
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
if (dbRecords.isEmpty()) {
responseObserver.onError(Status.NOT_FOUND.withDescription("No sample found").asRuntimeException());
return;
}
// Grab the first sample
RpcDomainSampleRequests.Builder response = convertRequestData(dbRecords.getFirst());
responseObserver.onNext(response.build());
responseObserver.onCompleted();
@@ -87,7 +119,7 @@ public class DomSampleGrpcService
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
for (var record : dbRecords) {
responseObserver.onNext(convert(record).build());
responseObserver.onNext(convertFullSample(record).build());
}
responseObserver.onCompleted();
@@ -98,12 +130,14 @@ public class DomSampleGrpcService
}
}
private RpcDomainSample.Builder convert(DomSampleDb.Sample dbSample) {
private RpcDomainSample.Builder convertFullSample(DomSampleDb.Sample dbSample) {
ByteString htmlZstd = ByteString.copyFrom(Zstd.compress(dbSample.sample().getBytes(StandardCharsets.UTF_8)));
var sampleBuilder = RpcDomainSample.newBuilder()
.setDomainName(dbSample.domain())
.setAcceptedPopover(dbSample.acceptedPopover())
.setHtmlSample(dbSample.sample());
.setHtmlSampleZstd(htmlZstd);
for (var req : dbSample.parseRequests()) {
sampleBuilder.addOutgoingRequestsBuilder()
@@ -120,4 +154,23 @@ public class DomSampleGrpcService
return sampleBuilder;
}
private RpcDomainSampleRequests.Builder convertRequestData(DomSampleDb.Sample dbSample) {
var sampleBuilder = RpcDomainSampleRequests.newBuilder()
.setDomainName(dbSample.domain());
for (var req : dbSample.parseRequests()) {
sampleBuilder.addOutgoingRequestsBuilder()
.setUrl(req.uri().toString())
.setMethod(switch (req.method().toUpperCase())
{
case "GET" -> RpcOutgoingRequest.RequestMethod.GET;
case "POST" -> RpcOutgoingRequest.RequestMethod.POST;
default -> RpcOutgoingRequest.RequestMethod.OTHER;
})
.setTimestamp(req.timestamp());
}
return sampleBuilder;
}
}
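
Since the sample HTML now travels as zstd-compressed bytes (htmlSampleZstd) rather than a plain string, a caller of getSample() has to decompress it; a sketch using zstd-jni (the library already used above), with the size-lookup helper assumed to be available in the version in use:

import com.github.luben.zstd.Zstd;
import java.nio.charset.StandardCharsets;

// Decode the compressed HTML sample back into a string on the client side.
class DomSampleDecoding {
    static String decodeHtmlSample(byte[] htmlSampleZstd) {
        // getFrameContentSize reads the uncompressed size from the zstd frame header
        int originalSize = (int) Zstd.getFrameContentSize(htmlSampleZstd);
        byte[] html = Zstd.decompress(htmlSampleZstd, originalSize);
        return new String(html, StandardCharsets.UTF_8);
    }
}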

View File

@@ -87,7 +87,7 @@ class FeedFetcherServiceTest extends AbstractModule {
bind(DomainCoordinator.class).to(LocalDomainCoordinator.class);
bind(HikariDataSource.class).toInstance(dataSource);
bind(ServiceRegistryIf.class).toInstance(Mockito.mock(ServiceRegistryIf.class));
bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration(ServiceId.Executor, 1, "", "", 0, UUID.randomUUID()));
bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration(ServiceId.Index, 1, "", "", 0, UUID.randomUUID()));
bind(Integer.class).annotatedWith(Names.named("wmsa-system-node")).toInstance(1);
}

View File

@@ -22,7 +22,6 @@ dependencies {
implementation project(':code:common:model')
implementation project(':code:common:config')
implementation project(':code:common:service')
implementation project(':code:index:query')
implementation project(':code:libraries:language-processing')
implementation libs.bundles.slf4j

View File

@@ -2,8 +2,8 @@ package nu.marginalia.api.searchquery;
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.api.searchquery.model.query.SpecificationLimit;
import nu.marginalia.api.searchquery.model.query.SpecificationLimitType;
import java.util.ArrayList;
import java.util.List;

View File

@@ -9,7 +9,7 @@ import nu.marginalia.api.searchquery.model.results.debug.DebugFactor;
import nu.marginalia.api.searchquery.model.results.debug.DebugFactorGroup;
import nu.marginalia.api.searchquery.model.results.debug.DebugTermFactorGroup;
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.api.searchquery.model.query.QueryStrategy;
import nu.marginalia.model.EdgeUrl;
import java.util.ArrayList;
@@ -28,6 +28,7 @@ public class QueryProtobufCodec {
builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
builder.setHumanQuery(request.getHumanQuery());
builder.setLangIsoCode(query.langIsoCode);
builder.setNsfwFilterTierValue(request.getNsfwFilterTierValue());
@@ -76,6 +77,7 @@ public class QueryProtobufCodec {
builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
builder.setHumanQuery(humanQuery);
builder.setLangIsoCode(query.langIsoCode);
builder.setNsfwFilterTier(RpcIndexQuery.NSFW_FILTER_TIER.DANGER);
@@ -114,6 +116,7 @@ public class QueryProtobufCodec {
QueryStrategy.valueOf(request.getQueryStrategy()),
RpcTemporalBias.Bias.valueOf(request.getTemporalBias().getBias().name()),
NsfwFilterTier.fromCodedValue(request.getNsfwFilterTierValue()),
request.getLangIsoCode(),
request.getPagination().getPage()
);
}
@@ -304,7 +307,6 @@ public class QueryProtobufCodec {
IndexProtobufCodec.convertRpcQuery(specs.getQuery()),
specs.getDomainsList(),
specs.getSearchSetIdentifier(),
specs.getHumanQuery(),
IndexProtobufCodec.convertSpecLimit(specs.getQuality()),
IndexProtobufCodec.convertSpecLimit(specs.getYear()),
IndexProtobufCodec.convertSpecLimit(specs.getSize()),
@@ -336,7 +338,8 @@ public class QueryProtobufCodec {
.setPagination(RpcQsQueryPagination.newBuilder()
.setPage(params.page())
.setPageSize(Math.min(100, params.limits().getResultsTotal()))
.build());
.build())
.setLangIsoCode(params.langIsoCode());
if (params.nearDomain() != null)
builder.setNearDomain(params.nearDomain());

View File

@@ -1,19 +1,24 @@
package nu.marginalia.api.searchquery.model.query;
import java.util.*;
import java.util.List;
public class ProcessedQuery {
public final SearchSpecification specs;
public final List<String> searchTermsHuman;
public final String domain;
public final String langIsoCode;
public ProcessedQuery(SearchSpecification specs, List<String> searchTermsHuman, String domain) {
public ProcessedQuery(SearchSpecification specs,
List<String> searchTermsHuman,
String domain,
String langIsoCode) {
this.specs = specs;
this.searchTermsHuman = searchTermsHuman;
this.domain = domain;
this.langIsoCode = langIsoCode;
}
public ProcessedQuery(SearchSpecification justSpecs) {
this(justSpecs, List.of(), null);
this(justSpecs, List.of(), null, "en");
}
}

View File

@@ -2,8 +2,6 @@ package nu.marginalia.api.searchquery.model.query;
import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.RpcTemporalBias;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import javax.annotation.Nullable;
import java.util.List;
@@ -26,10 +24,11 @@ public record QueryParams(
QueryStrategy queryStrategy,
RpcTemporalBias.Bias temporalBias,
NsfwFilterTier filterTier,
String langIsoCode,
int page
)
{
public QueryParams(String query, RpcQueryLimits limits, String identifier, NsfwFilterTier filterTier) {
public QueryParams(String query, RpcQueryLimits limits, String identifier, NsfwFilterTier filterTier, String langIsoCode) {
this(query, null,
List.of(),
List.of(),
@@ -45,6 +44,7 @@ public record QueryParams(
QueryStrategy.AUTO,
RpcTemporalBias.Bias.NONE,
filterTier,
langIsoCode,
1 // page
);
}

View File

@@ -1,4 +1,4 @@
package nu.marginalia.index.query.limit;
package nu.marginalia.api.searchquery.model.query;
public enum QueryStrategy {
SENTENCE,

View File

@@ -2,8 +2,6 @@ package nu.marginalia.api.searchquery.model.query;
import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import javax.annotation.Nullable;
import java.util.List;
@@ -18,8 +16,6 @@ public class SearchSpecification {
public String searchSetIdentifier;
public final String humanQuery;
public SpecificationLimit quality;
public SpecificationLimit year;
public SpecificationLimit size;
@@ -35,7 +31,6 @@ public class SearchSpecification {
public SearchSpecification(SearchQuery query,
List<Integer> domains,
String searchSetIdentifier,
String humanQuery,
SpecificationLimit quality,
SpecificationLimit year,
SpecificationLimit size,
@@ -47,7 +42,6 @@ public class SearchSpecification {
this.query = query;
this.domains = domains;
this.searchSetIdentifier = searchSetIdentifier;
this.humanQuery = humanQuery;
this.quality = quality;
this.year = year;
this.size = size;
@@ -73,10 +67,6 @@ public class SearchSpecification {
return this.searchSetIdentifier;
}
public String getHumanQuery() {
return this.humanQuery;
}
public SpecificationLimit getQuality() {
return this.quality;
}
@@ -106,14 +96,13 @@ public class SearchSpecification {
}
public String toString() {
return "SearchSpecification(query=" + this.getQuery() + ", domains=" + this.getDomains() + ", searchSetIdentifier=" + this.getSearchSetIdentifier() + ", humanQuery=" + this.getHumanQuery() + ", quality=" + this.getQuality() + ", year=" + this.getYear() + ", size=" + this.getSize() + ", rank=" + this.getRank() + ", queryLimits=" + this.getQueryLimits() + ", queryStrategy=" + this.getQueryStrategy() + ", rankingParams=" + this.getRankingParams() + ")";
return "SearchSpecification(query=" + this.getQuery() + ", domains=" + this.getDomains() + ", searchSetIdentifier=" + this.getSearchSetIdentifier() + ", quality=" + this.getQuality() + ", year=" + this.getYear() + ", size=" + this.getSize() + ", rank=" + this.getRank() + ", queryLimits=" + this.getQueryLimits() + ", queryStrategy=" + this.getQueryStrategy() + ", rankingParams=" + this.getRankingParams() + ")";
}
public static class SearchSpecificationBuilder {
private SearchQuery query;
private List<Integer> domains;
private String searchSetIdentifier;
private String humanQuery;
private SpecificationLimit quality$value;
private boolean quality$set;
private SpecificationLimit year$value;
@@ -144,11 +133,6 @@ public class SearchSpecification {
return this;
}
public SearchSpecificationBuilder humanQuery(String humanQuery) {
this.humanQuery = humanQuery;
return this;
}
public SearchSpecificationBuilder quality(SpecificationLimit quality) {
this.quality$value = quality;
this.quality$set = true;
@@ -205,11 +189,7 @@ public class SearchSpecification {
if (!this.rank$set) {
rank$value = SpecificationLimit.none();
}
return new SearchSpecification(this.query, this.domains, this.searchSetIdentifier, this.humanQuery, quality$value, year$value, size$value, rank$value, this.queryLimits, this.queryStrategy, this.rankingParams);
}
public String toString() {
return "SearchSpecification.SearchSpecificationBuilder(query=" + this.query + ", domains=" + this.domains + ", searchSetIdentifier=" + this.searchSetIdentifier + ", humanQuery=" + this.humanQuery + ", quality$value=" + this.quality$value + ", year$value=" + this.year$value + ", size$value=" + this.size$value + ", rank$value=" + this.rank$value + ", queryLimits=" + this.queryLimits + ", queryStrategy=" + this.queryStrategy + ", rankingParams=" + this.rankingParams + ")";
return new SearchSpecification(this.query, this.domains, this.searchSetIdentifier, quality$value, year$value, size$value, rank$value, this.queryLimits, this.queryStrategy, this.rankingParams);
}
}
}

View File

@@ -1,4 +1,4 @@
package nu.marginalia.index.query.limit;
package nu.marginalia.api.searchquery.model.query;
public record SpecificationLimit(SpecificationLimitType type, int value) {
public boolean isNone() {

View File

@@ -1,4 +1,4 @@
package nu.marginalia.index.query.limit;
package nu.marginalia.api.searchquery.model.query;
public enum SpecificationLimitType {
NONE,

View File

@@ -1,56 +0,0 @@
package nu.marginalia.api.searchquery.model.results;
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import java.util.BitSet;
public class ResultRankingContext {
private final int docCount;
public final RpcResultRankingParameters params;
public final BitSet regularMask;
public final BitSet ngramsMask;
/** CqDataInt associated with frequency information of the terms in the query
* in the full index. The dataset is indexed by the compiled query. */
public final CqDataInt fullCounts;
/** CqDataInt associated with frequency information of the terms in the query
* in the full index. The dataset is indexed by the compiled query. */
public final CqDataInt priorityCounts;
public ResultRankingContext(int docCount,
RpcResultRankingParameters params,
BitSet ngramsMask,
BitSet regularMask,
CqDataInt fullCounts,
CqDataInt prioCounts)
{
this.docCount = docCount;
this.params = params;
this.ngramsMask = ngramsMask;
this.regularMask = regularMask;
this.fullCounts = fullCounts;
this.priorityCounts = prioCounts;
}
public int termFreqDocCount() {
return docCount;
}
@Override
public String toString() {
return "ResultRankingContext{" +
"docCount=" + docCount +
", params=" + params +
", regularMask=" + regularMask +
", ngramsMask=" + ngramsMask +
", fullCounts=" + fullCounts +
", priorityCounts=" + priorityCounts +
'}';
}
}

View File

@@ -34,6 +34,7 @@ message RpcQsQuery {
RpcQsQueryPagination pagination = 17;
NSFW_FILTER_TIER nsfwFilterTier = 18;
string langIsoCode = 19;
enum NSFW_FILTER_TIER {
NONE = 0;
@@ -88,6 +89,7 @@ message RpcIndexQuery {
RpcResultRankingParameters parameters = 12;
NSFW_FILTER_TIER nsfwFilterTier = 13;
string langIsoCode = 14;
enum NSFW_FILTER_TIER {
NONE = 0;

View File

@@ -3,7 +3,7 @@ package nu.marginalia.index.client;
import nu.marginalia.api.searchquery.IndexProtobufCodec;
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.api.searchquery.model.query.SpecificationLimit;
import org.junit.jupiter.api.Test;
import java.util.List;

View File

@@ -22,18 +22,13 @@ dependencies {
implementation project(':code:functions:nsfw-domain-filter')
implementation project(':code:functions:search-query:api')
implementation project(':code:index:query')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':third-party:porterstemmer')
implementation project(':third-party:openzim')
implementation project(':third-party:commons-codec')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:processes:converting-process:ft-keyword-extraction')
implementation libs.bundles.slf4j

View File

@@ -8,8 +8,8 @@ import nu.marginalia.api.searchquery.model.query.*;
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
import nu.marginalia.functions.searchquery.query_parser.QueryParser;
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.api.searchquery.model.query.QueryStrategy;
import nu.marginalia.api.searchquery.model.query.SpecificationLimit;
import nu.marginalia.language.WordPatterns;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
@@ -34,8 +34,6 @@ public class QueryFactory {
this.queryExpansion = queryExpansion;
}
public ProcessedQuery createQuery(QueryParams params,
@Nullable RpcResultRankingParameters rankingParams) {
final var query = params.humanQuery();
@@ -153,7 +151,6 @@ public class QueryFactory {
var specsBuilder = SearchSpecification.builder()
.query(queryBuilder.build())
.humanQuery(query)
.quality(qualityLimit)
.year(year)
.size(size)
@@ -170,7 +167,7 @@ public class QueryFactory {
specs.query.searchTermsPriority.addAll(params.tacitPriority());
specs.query.searchTermsExclude.addAll(params.tacitExcludes());
return new ProcessedQuery(specs, searchTermsHuman, domain);
return new ProcessedQuery(specs, searchTermsHuman, domain, params.langIsoCode());
}
private void analyzeSearchTerm(List<String> problems, String str, String displayStr) {

View File

@@ -1,7 +1,7 @@
package nu.marginalia.functions.searchquery.query_parser;
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.api.searchquery.model.query.SpecificationLimit;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.util.transform_list.TransformList;

View File

@@ -1,7 +1,7 @@
package nu.marginalia.functions.searchquery.query_parser.token;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.api.searchquery.model.query.SpecificationLimit;
public sealed interface QueryToken {
String str();

View File

@@ -3,14 +3,9 @@ package nu.marginalia.query.svc;
import nu.marginalia.WmsaHome;
import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.RpcTemporalBias;
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.*;
import nu.marginalia.functions.searchquery.QueryFactory;
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.junit.jupiter.api.Assertions;
@@ -60,6 +55,7 @@ public class QueryFactoryTest {
QueryStrategy.AUTO,
RpcTemporalBias.Bias.NONE,
NsfwFilterTier.OFF,
"en",
0), null).specs;
}
@@ -216,6 +212,12 @@ public class QueryFactoryTest {
}
@Test
public void testExpansion10() {
var subquery = parseAndGetSpecs("when was captain james cook born");
System.out.println(subquery);
}
@Test
public void testContractionWordNum() {
var subquery = parseAndGetSpecs("glove 80");
@@ -241,7 +243,6 @@ public class QueryFactoryTest {
Assertions.assertTrue(subquery.query.compiledQuery.contains(" bob "));
Assertions.assertFalse(subquery.query.compiledQuery.contains(" bob's "));
Assertions.assertEquals("\"bob's cars\"", subquery.humanQuery);
}
@Test

View File

@@ -22,8 +22,13 @@ dependencies {
implementation project(':code:libraries:array')
implementation project(':code:libraries:btree')
implementation project(':code:libraries:skiplist')
implementation project(':code:libraries:native')
implementation project(':code:libraries:random-write-funnel')
implementation project(':code:libraries:coded-sequence')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:message-queue')
implementation project(':code:common:db')
implementation project(':code:common:config')
@@ -32,11 +37,9 @@ dependencies {
implementation project(':code:common:service')
implementation project(':code:processes:converting-process:model')
implementation project(':code:processes:process-mq-api')
implementation project(':code:functions:search-query:api')
implementation project(':code:index:index-forward')
implementation project(':code:index:index-reverse')
implementation project(':code:index:query')
implementation project(':code:index:index-journal')
@@ -74,7 +77,7 @@ dependencies {
testImplementation 'org.testcontainers:mariadb:1.17.4'
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
testImplementation project(':code:libraries:test-helpers')
testImplementation project(':code:libraries:term-frequency-dict')
testImplementation project(':code:libraries:language-processing')
testImplementation project(':code:libraries:braille-block-punch-cards')
testImplementation project(':code:libraries:test-helpers')
}

View File

@@ -1,38 +0,0 @@
plugins {
id 'java'
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:libraries:array')
implementation project(':code:libraries:btree')
implementation project(':code:libraries:coded-sequence')
implementation project(':code:libraries:language-processing')
implementation project(':code:index:query')
implementation project(':code:index:index-journal')
implementation project(':code:common:model')
implementation project(':code:common:service')
implementation project(':code:processes:converting-process:model')
implementation libs.bundles.slf4j
implementation libs.prometheus
implementation libs.roaringbitmap
implementation libs.fastutil
implementation libs.trove
implementation libs.slop
testImplementation project(':code:libraries:test-helpers')
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}

View File

@@ -1,33 +0,0 @@
package nu.marginalia.index.forward;
import java.nio.file.Path;
public class ForwardIndexFileNames {
public static Path resolve(Path basePath, FileIdentifier identifier, FileVersion version) {
return switch (identifier) {
case DOC_ID -> switch (version) {
case NEXT -> basePath.resolve("fwd-doc-id.dat.next");
case CURRENT -> basePath.resolve("fwd-doc-id.dat");
};
case DOC_DATA -> switch (version) {
case NEXT -> basePath.resolve("fwd-doc-data.dat.next");
case CURRENT -> basePath.resolve("fwd-doc-data.dat");
};
case SPANS_DATA -> switch (version) {
case NEXT -> basePath.resolve("fwd-spans.dat.next");
case CURRENT -> basePath.resolve("fwd-spans.dat");
};
};
}
public enum FileVersion {
CURRENT,
NEXT
}
public enum FileIdentifier {
DOC_DATA,
SPANS_DATA,
DOC_ID
}
}

View File

@@ -1,59 +0,0 @@
package nu.marginalia.index.forward.spans;
import nu.marginalia.sequence.VarintCodedSequence;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
@SuppressWarnings("preview")
public class ForwardIndexSpansReader implements AutoCloseable {
private final FileChannel spansFileChannel;
public ForwardIndexSpansReader(Path spansFile) throws IOException {
this.spansFileChannel = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ);
}
public DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException {
// Decode the size and offset from the encoded offset
long size = SpansCodec.decodeSize(encodedOffset);
long offset = SpansCodec.decodeStartOffset(encodedOffset);
// Allocate a buffer from the arena
var buffer = arena.allocate(size).asByteBuffer();
buffer.clear();
while (buffer.hasRemaining()) {
spansFileChannel.read(buffer, offset + buffer.position());
}
buffer.flip();
// Read the number of spans in the document
int count = buffer.get();
DocumentSpans ret = new DocumentSpans();
// Decode each span
while (count-- > 0) {
byte code = buffer.get();
short len = buffer.getShort();
ByteBuffer data = buffer.slice(buffer.position(), len);
ret.accept(code, new VarintCodedSequence(data));
// Reset the buffer position to the end of the span
buffer.position(buffer.position() + len);
}
return ret;
}
@Override
public void close() throws IOException {
spansFileChannel.close();
}
}

View File

@@ -1,52 +0,0 @@
package nu.marginalia.index.forward.spans;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
public class ForwardIndexSpansWriter implements AutoCloseable {
private final FileChannel outputChannel;
private final ByteBuffer work = ByteBuffer.allocate(32);
private long stateStartOffset = -1;
private int stateLength = -1;
public ForwardIndexSpansWriter(Path outputFileSpansData) throws IOException {
this.outputChannel = (FileChannel) Files.newByteChannel(outputFileSpansData, StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE);
}
public void beginRecord(int count) throws IOException {
stateStartOffset = outputChannel.position();
stateLength = 0;
work.clear();
work.put((byte) count);
work.flip();
while (work.hasRemaining())
stateLength += outputChannel.write(work);
}
public void writeSpan(byte spanCode, ByteBuffer sequenceData) throws IOException {
work.clear();
work.put(spanCode);
work.putShort((short) sequenceData.remaining());
work.flip();
while (work.hasRemaining() || sequenceData.hasRemaining()) {
stateLength += (int) outputChannel.write(new ByteBuffer[]{work, sequenceData});
}
}
public long endRecord() {
return SpansCodec.encode(stateStartOffset, stateLength);
}
@Override
public void close() throws IOException {
outputChannel.close();
}
}

View File

@@ -1,17 +0,0 @@
package nu.marginalia.index.forward.spans;
public class SpansCodec {
public static long encode(long startOffset, long size) {
assert size < 0x1000_0000L : "Size must be less than 2^28";
return startOffset << 28 | (size & 0xFFF_FFFFL);
}
public static long decodeStartOffset(long encoded) {
return encoded >>> 28;
}
public static long decodeSize(long encoded) {
return encoded & 0x0FFF_FFFFL;
}
}

View File

@@ -1,21 +0,0 @@
# Forward Index
The forward index contains a mapping from document id to various forms of document metadata.
In practice, the forward index consists of two files: an `id` file and a `data` file.
The `id` file contains a sorted list of document ids, and the `data` file contains a
fixed-size metadata record for each document id, in the same order as the `id` file.
Each record contains a binary encoded [DocumentMetadata](../../common/model/java/nu/marginalia/model/idx/DocumentMetadata.java) object,
as well as a [HtmlFeatures](../../common/model/java/nu/marginalia/model/crawl/HtmlFeature.java) bitmask.
Unlike the reverse index, the forward index is not split into two tiers. The data is kept in the
same order as in the source data, and the cardinality of the document IDs is assumed to fit in memory,
so the index is relatively easy to construct.
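To illustrate the lookup path this layout enables, here is a minimal sketch (not the actual `ForwardIndexReader`); the two-longs-per-record layout and field order are assumptions made for the example:

```java
// Illustrative sketch: binary-search the sorted id file, then read the fixed-size
// record at the same ordinal in the data file. The record layout is assumed here.
long lookupMetadata(long[] ids, java.nio.LongBuffer data, long docId) {
    int lo = 0, hi = ids.length - 1;
    while (lo <= hi) {
        int mid = (lo + hi) >>> 1;
        if (ids[mid] < docId) lo = mid + 1;
        else if (ids[mid] > docId) hi = mid - 1;
        else return data.get(2 * mid);   // assumed: DocumentMetadata word for this document
    }
    return -1; // document id not present
}
```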
## Central Classes
* [ForwardIndexConverter](java/nu/marginalia/index/forward/construction/ForwardIndexConverter.java) constructs the index.
* [ForwardIndexReader](java/nu/marginalia/index/forward/ForwardIndexReader.java) interrogates the index.

View File

@@ -14,6 +14,7 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:libraries:coded-sequence')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:array')
implementation project(':code:common:model')
implementation project(':code:processes:converting-process:model')

View File

@@ -2,11 +2,10 @@ package nu.marginalia.index.journal;
import nu.marginalia.slop.SlopTable;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.*;
public record IndexJournal(Path journalDir) {
@@ -47,4 +46,21 @@ public record IndexJournal(Path journalDir) {
return instances;
}
public Set<String> languages() {
try {
Set<String> languages = new HashSet<>();
for (var instance : pages()) {
try (var slopTable = new SlopTable(instance.baseDir(), instance.page())) {
languages.addAll(instance.openLanguageIsoCode(slopTable).getDictionary());
}
}
return languages;
}
catch (IOException ex) {
throw new RuntimeException("Failed to read langauges from index journal");
}
}
}
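For context, a minimal sketch (an assumption, not part of this change) of how the language set might be consumed downstream to build one word lexicon per language, following the per-language file naming used by the perf-test harness later in this diff (`rev-words-en.dat`):

static List<WordLexicon> lexiconsFor(IndexJournal journal, Path indexDir) {
    // One lexicon per language recorded in the journal; the file name pattern is an assumption.
    List<WordLexicon> lexicons = new ArrayList<>();
    for (String lang : journal.languages()) {
        lexicons.add(new WordLexicon(lang, indexDir.resolve("rev-words-" + lang + ".dat")));
    }
    return lexicons;
}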

View File

@@ -6,17 +6,22 @@ import nu.marginalia.slop.column.array.ByteArrayColumn;
import nu.marginalia.slop.column.array.LongArrayColumn;
import nu.marginalia.slop.column.primitive.IntColumn;
import nu.marginalia.slop.column.primitive.LongColumn;
import nu.marginalia.slop.column.string.EnumColumn;
import nu.marginalia.slop.desc.StorageType;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
public record IndexJournalPage(Path baseDir, int page) {
public static IntColumn features = new IntColumn("features", StorageType.PLAIN);
public static IntColumn size = new IntColumn("size", StorageType.PLAIN);
public static LongColumn combinedId = new LongColumn("combinedId", StorageType.PLAIN);
public static LongColumn documentMeta = new LongColumn("documentMeta", StorageType.PLAIN);
public static EnumColumn languageIsoCode = new EnumColumn("languageIsoCode", StandardCharsets.US_ASCII, StorageType.PLAIN);
public static LongArrayColumn termIds = new LongArrayColumn("termIds", StorageType.ZSTD);
public static ByteArrayColumn termMeta = new ByteArrayColumn("termMetadata", StorageType.ZSTD);
public static VarintCodedSequenceArrayColumn positions = new VarintCodedSequenceArrayColumn("termPositions", StorageType.ZSTD);
@@ -24,6 +29,7 @@ public record IndexJournalPage(Path baseDir, int page) {
public static ByteArrayColumn spanCodes = new ByteArrayColumn("spanCodes", StorageType.ZSTD);
public static VarintCodedSequenceArrayColumn spans = new VarintCodedSequenceArrayColumn("spans", StorageType.ZSTD);
public IndexJournalPage {
if (!baseDir.toFile().isDirectory()) {
throw new IllegalArgumentException("Invalid base directory: " + baseDir);
@@ -46,6 +52,9 @@ public record IndexJournalPage(Path baseDir, int page) {
return size.open(table);
}
public EnumColumn.Reader openLanguageIsoCode(SlopTable table) throws IOException {
return languageIsoCode.open(table);
}
public LongArrayColumn.Reader openTermIds(SlopTable table) throws IOException {
return termIds.open(table);

View File

@@ -1,6 +1,6 @@
package nu.marginalia.index.journal;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.language.keywords.KeywordHasher;
import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn;
import nu.marginalia.slop.SlopTable;
@@ -8,6 +8,7 @@ import nu.marginalia.slop.column.array.ByteArrayColumn;
import nu.marginalia.slop.column.array.LongArrayColumn;
import nu.marginalia.slop.column.primitive.IntColumn;
import nu.marginalia.slop.column.primitive.LongColumn;
import nu.marginalia.slop.column.string.EnumColumn;
import java.io.IOException;
import java.nio.file.Files;
@@ -27,8 +28,7 @@ public class IndexJournalSlopWriter extends SlopTable {
private final VarintCodedSequenceArrayColumn.Writer spansWriter;
private final ByteArrayColumn.Writer spanCodesWriter;
private static final MurmurHash3_128 hash = new MurmurHash3_128();
private final EnumColumn.Writer languagesWriter;
public IndexJournalSlopWriter(Path dir, int page) throws IOException {
@@ -50,14 +50,17 @@ public class IndexJournalSlopWriter extends SlopTable {
spanCodesWriter = IndexJournalPage.spanCodes.create(this);
spansWriter = IndexJournalPage.spans.create(this);
languagesWriter = IndexJournalPage.languageIsoCode.create(this);
}
public void put(long combinedId, SlopDocumentRecord.KeywordsProjection keywordsProjection) throws IOException {
public void put(long combinedId, SlopDocumentRecord.KeywordsProjection keywordsProjection, KeywordHasher hasher) throws IOException {
combinedIdWriter.put(combinedId);
featuresWriter.put(keywordsProjection.htmlFeatures());
sizeWriter.put(keywordsProjection.length());
documentMetaWriter.put(keywordsProjection.documentMetadata());
languagesWriter.put(keywordsProjection.languageIsoCode());
// -- write keyword data --
@@ -66,7 +69,7 @@ public class IndexJournalSlopWriter extends SlopTable {
// termIds are the special hashes of the keywords
long[] termIds = new long[keywordsProjection.words().size()];
for (int i = 0; i < termIds.length; i++) {
termIds[i] = hash.hashKeyword(keywords.get(i));
termIds[i] = hasher.hashKeyword(keywords.get(i));
}
termIdsWriter.put(termIds);
@@ -87,6 +90,7 @@ public class IndexJournalSlopWriter extends SlopTable {
termIdsWriter.close();
termMetadataWriter.close();
termPositionsWriter.close();
languagesWriter.close();
spansWriter.close();
spanCodesWriter.close();
}

View File

@@ -0,0 +1,51 @@
plugins {
id 'java'
id 'application'
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
application {
mainClass = 'nu.marginalia.index.perftest.PerfTestMain'
}
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:common:config')
implementation project(':code:common:db')
implementation project(':code:libraries:array')
implementation project(':code:libraries:native')
implementation project(':code:libraries:btree')
implementation project(':code:libraries:language-processing')
implementation project(':code:common:linkdb')
implementation project(':code:index')
implementation project(':third-party:commons-codec')
implementation project(':code:functions:search-query')
implementation project(':code:functions:search-query:api')
implementation libs.slop
implementation libs.roaringbitmap
implementation libs.bundles.slf4j
implementation libs.guava
libs.bundles.grpc.get().each {
implementation dependencies.create(it) {
exclude group: 'com.google.guava'
}
}
implementation libs.notnull
implementation libs.trove
implementation libs.fastutil
implementation libs.bundles.gson
implementation libs.bundles.mariadb
}

View File

@@ -0,0 +1,262 @@
package nu.marginalia.index.perftest;
import nu.marginalia.ffi.LinuxSystemCalls;
import nu.marginalia.uring.UringFileReader;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
import java.util.stream.LongStream;
public class IoPatternsMain {
static void testBuffered(int sz, int small, int large, int iters) {
try {
Path largeFile = Path.of("/home/vlofgren/largefile.dat");
long fileSize = Files.size(largeFile);
Random r = new Random();
List<MemorySegment> segments = new ArrayList<>();
for (int i = 0; i < sz; i++) {
if (small == large) {
segments.add(Arena.ofAuto().allocate(small));
}
else {
segments.add(Arena.ofAuto().allocate(r.nextInt(small, large)));
}
}
List<Long> offsets = new ArrayList<>();
long[] samples = new long[1000];
int si = 0;
try (UringFileReader reader = new UringFileReader(largeFile, false)) {
for (int iter = 0; iter < iters; ) {
if (si == samples.length) {
Arrays.sort(samples);
double p1 = samples[10] / 1_000.;
double p10 = samples[100] / 1_000.;
double p90 = samples[900] / 1_000.;
double p99 = samples[990] / 1_000.;
double avg = LongStream.of(samples).average().getAsDouble() / 1000.;
System.out.println("B"+"\t"+avg+"\t"+p1 + " " + p10 + " " + p90 + " " + p99);
si = 0;
iter++;
}
offsets.clear();
for (int i = 0; i < sz; i++) {
offsets.add(r.nextLong(0, fileSize - 256));
}
long st = System.nanoTime();
reader.read(segments, offsets);
long et = System.nanoTime();
samples[si++] = et - st;
}
}
}
catch (IOException e) {
e.printStackTrace();
}
}
static void testBufferedPread(int sz, int iters) {
try {
Path largeFile = Path.of("/home/vlofgren/largefile.dat");
long fileSize = Files.size(largeFile);
Random r = new Random();
List<MemorySegment> segments = new ArrayList<>();
for (int i = 0; i < sz; i++) {
segments.add(Arena.ofAuto().allocate(r.nextInt(24, 256)));
}
List<Long> offsets = new ArrayList<>();
long[] samples = new long[1000];
int si = 0;
int fd = -1;
try {
fd = LinuxSystemCalls.openBuffered(largeFile);
LinuxSystemCalls.fadviseRandom(fd);
for (int iter = 0; iter < iters; ) {
if (si == samples.length) {
Arrays.sort(samples);
double p1 = samples[10] / 1_000.;
double p10 = samples[100] / 1_000.;
double p90 = samples[900] / 1_000.;
double p99 = samples[990] / 1_000.;
double avg = LongStream.of(samples).average().getAsDouble() / 1000.;
System.out.println("BP"+"\t"+avg+"\t"+p1 + " " + p10 + " " + p90 + " " + p99);
si = 0;
iter++;
}
offsets.clear();
for (int i = 0; i < sz; i++) {
offsets.add(r.nextLong(0, fileSize - 256));
}
long st = System.nanoTime();
for (int i = 0; i < sz; i++) {
LinuxSystemCalls.readAt(fd, segments.get(i), offsets.get(i));
}
long et = System.nanoTime();
samples[si++] = et - st;
}
}
finally {
LinuxSystemCalls.closeFd(fd);
}
}
catch (IOException e) {
e.printStackTrace();
}
}
static void testDirect(int blockSize, int sz, int iters) {
try {
Path largeFile = Path.of("/home/vlofgren/largefile.dat");
int fileSizeBlocks = (int) ((Files.size(largeFile) & -blockSize) / blockSize);
Random r = new Random();
List<MemorySegment> segments = new ArrayList<>();
for (int i = 0; i < sz; i++) {
segments.add(Arena.ofAuto().allocate(blockSize, blockSize));
}
List<Long> offsets = new ArrayList<>();
long[] samples = new long[1000];
int si = 0;
try (UringFileReader reader = new UringFileReader(largeFile, true)) {
for (int iter = 0; iter < iters; ) {
if (si == samples.length) {
Arrays.sort(samples);
double p1 = samples[10] / 1_000.;
double p10 = samples[100] / 1_000.;
double p90 = samples[900] / 1_000.;
double p99 = samples[990] / 1_000.;
double avg = LongStream.of(samples).average().getAsDouble() / 1000.;
System.out.println("DN"+blockSize+"\t"+avg+"\t"+p1 + " " + p10 + " " + p90 + " " + p99);
si = 0;
iter++;
}
offsets.clear();
for (int i = 0; i < sz; i++) {
offsets.add(blockSize * r.nextLong(0, fileSizeBlocks));
}
long st = System.nanoTime();
reader.read(segments, offsets);
long et = System.nanoTime();
samples[si++] = et - st;
}
}
}
catch (IOException e) {
e.printStackTrace();
}
}
static void testDirect1(int blockSize, int iters) {
try {
Path largeFile = Path.of("/home/vlofgren/largefile.dat");
int fileSizeBlocks = (int) ((Files.size(largeFile) & -blockSize) / blockSize);
Random r = new Random();
MemorySegment segment = Arena.global().allocate(blockSize, blockSize);
long[] samples = new long[1000];
int si = 0;
int fd = LinuxSystemCalls.openDirect(largeFile);
if (fd < 0) {
throw new IOException("open failed");
}
try {
for (int iter = 0; iter < iters; ) {
if (si == samples.length) {
Arrays.sort(samples);
double p1 = samples[10] / 1_000.;
double p10 = samples[100] / 1_000.;
double p90 = samples[900] / 1_000.;
double p99 = samples[990] / 1_000.;
double avg = LongStream.of(samples).average().getAsDouble() / 1000.;
System.out.println("D1"+blockSize+"\t"+avg+"\t"+p1 + " " + p10 + " " + p90 + " " + p99);
si = 0;
iter++;
}
long st = System.nanoTime();
int ret;
long readOffset = blockSize * r.nextLong(0, fileSizeBlocks);
if (blockSize != (ret = LinuxSystemCalls.readAt(fd, segment, readOffset))) {
throw new IOException("pread failed: " + ret);
}
long et = System.nanoTime();
samples[si++] = et - st;
}
}
finally {
LinuxSystemCalls.closeFd(fd);
}
}
catch (IOException e) {
e.printStackTrace();
}
}
public static void main(String[] args) throws Exception {
// Thread.ofPlatform().start(() -> testBuffered(128, 32, 65536,1000));
Thread.ofPlatform().start(() -> testDirect(8192*4, 128,1000));
// Thread.ofPlatform().start(() -> testBuffered(128, 1000));
// Thread.ofPlatform().start(() -> testBuffered(128, 1000));
// Thread.ofPlatform().start(() -> testBuffered(128, 1000));
// Thread.ofPlatform().start(() -> testBufferedPread(128, 1000));
// Thread.ofPlatform().start(() -> testDirect1(1024, 1000));
// Thread.ofPlatform().start(() -> testDirect1(1024, 1000));
// Thread.ofPlatform().start(() -> testDirect1(1024, 1000));
// Thread.ofPlatform().start(() -> testDirect1(1024*1024, 1000));
// Thread.ofPlatform().start(() -> testDirect1(1024*1024, 1000));
// Thread.ofPlatform().start(() -> testDirect(512, 512,1000));
// Thread.ofPlatform().start(() -> testDirect(512, 512,1000));
// Thread.ofPlatform().start(() -> testDirect(512, 512,1000));
// Thread.ofPlatform().start(() -> testDirect(512, 100));
// Thread.ofPlatform().start(() -> testDirect(512, 100));
// Thread.ofPlatform().start(() -> testDirect(512, 100));
// Thread.ofPlatform().start(() -> testDirect(512, 100));
// Thread.ofPlatform().start(() -> testBuffered(512, 1000));
// Thread.ofPlatform().start(() -> testBuffered(512, 1000));
// Thread.ofPlatform().start(() -> testBuffered(512, 1000));
// Thread.ofPlatform().start(() -> testBuffered(512, 1000));
// Thread.ofPlatform().start(() -> testBuffered(100));
// Thread.ofPlatform().start(() -> testBuffered(100));
for (;;);
// testBuffered(100);
}
}

View File

@@ -0,0 +1,307 @@
package nu.marginalia.index.perftest;
import gnu.trove.list.array.TLongArrayList;
import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.functions.searchquery.QueryFactory;
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
import nu.marginalia.index.CombinedIndexReader;
import nu.marginalia.index.IndexQueryExecution;
import nu.marginalia.index.StatefulIndex;
import nu.marginalia.index.forward.ForwardIndexReader;
import nu.marginalia.index.model.CombinedDocIdList;
import nu.marginalia.index.model.SearchContext;
import nu.marginalia.index.results.DomainRankingOverrides;
import nu.marginalia.index.results.IndexResultRankingService;
import nu.marginalia.index.reverse.FullReverseIndexReader;
import nu.marginalia.index.reverse.PrioReverseIndexReader;
import nu.marginalia.index.reverse.WordLexicon;
import nu.marginalia.index.reverse.query.IndexQuery;
import nu.marginalia.index.searchset.SearchSetAny;
import nu.marginalia.language.keywords.KeywordHasher;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.SQLException;
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.TimeoutException;
public class PerfTestMain {
static Duration warmupTime = Duration.ofMinutes(1);
static Duration runTime = Duration.ofMinutes(10);
public static void main(String[] args) {
if (args.length != 4) {
System.err.println("Arguments: home-dir index-dir query");
System.exit(255);
}
try {
Path indexDir = Paths.get(args[0]);
if (!Files.isDirectory(indexDir)) {
System.err.println("Index directory is not a directory");
System.exit(255);
}
Path homeDir = Paths.get(args[1]);
String scenario = args[2];
String query = args[3];
switch (scenario) {
case "valuation" -> runValuation(indexDir, homeDir, query);
case "lookup" -> runLookup(indexDir, homeDir, query);
case "execution" -> runExecution(indexDir, homeDir, query);
}
System.exit(0);
}
catch (NumberFormatException e) {
System.err.println("Arguments: data-dir index-dir query");
System.exit(255);
}
catch (Exception ex) {
System.err.println("Error during testing");
ex.printStackTrace();
System.exit(255);
}
System.out.println(Arrays.toString(args));
}
private static CombinedIndexReader createCombinedIndexReader(Path indexDir) throws IOException {
return new CombinedIndexReader(
new ForwardIndexReader(
indexDir.resolve("ir/fwd-doc-id.dat"),
indexDir.resolve("ir/fwd-doc-data.dat"),
indexDir.resolve("ir/fwd-spans.dat")
),
new FullReverseIndexReader(
"full",
List.of(new WordLexicon("en", indexDir.resolve("ir/rev-words-en.dat"))),
indexDir.resolve("ir/rev-docs.dat"),
indexDir.resolve("ir/rev-positions.dat")
),
new PrioReverseIndexReader(
"prio",
List.of(new WordLexicon("en", indexDir.resolve("ir/rev-words-prio-en.dat"))),
indexDir.resolve("ir/rev-prio-docs.dat")
)
);
}
private static IndexResultRankingService createIndexResultRankingService(Path indexDir, CombinedIndexReader combinedIndexReader) throws IOException, SQLException {
return new IndexResultRankingService(
new DocumentDbReader(indexDir.resolve("ldbr/documents.db")),
new StatefulIndex(combinedIndexReader),
new DomainRankingOverrides(null, Path.of("xxxx"))
);
}
static QueryFactory createQueryFactory(Path homeDir) throws IOException {
return new QueryFactory(
new QueryExpansion(
new TermFrequencyDict(homeDir.resolve("model/tfreq-new-algo3.bin")),
new NgramLexicon()
)
);
}
public static void runValuation(Path homeDir,
Path indexDir,
String rawQuery) throws IOException, SQLException, TimeoutException {
CombinedIndexReader indexReader = createCombinedIndexReader(indexDir);
QueryFactory queryFactory = createQueryFactory(homeDir);
IndexResultRankingService rankingService = createIndexResultRankingService(indexDir, indexReader);
var queryLimits = RpcQueryLimits.newBuilder()
.setTimeoutMs(10_000)
.setResultsTotal(1000)
.setResultsByDomain(10)
.setFetchSize(4096)
.build();
SearchSpecification parsedQuery = queryFactory.createQuery(new QueryParams(rawQuery, queryLimits, "NONE", NsfwFilterTier.OFF, "en"), PrototypeRankingParameters.sensibleDefaults()).specs;
System.out.println("Query compiled to: " + parsedQuery.query.compiledQuery);
var rankingContext = SearchContext.create(indexReader, new KeywordHasher.AsciiIsh(), parsedQuery, new SearchSetAny());
List<IndexQuery> queries = indexReader.createQueries(rankingContext);
TLongArrayList allResults = new TLongArrayList();
LongQueryBuffer buffer = new LongQueryBuffer(512);
for (var query : queries) {
while (query.hasMore() && allResults.size() < 512 ) {
query.getMoreResults(buffer);
allResults.addAll(buffer.copyData());
}
if (allResults.size() >= 512)
break;
}
allResults.sort();
if (allResults.size() > 512) {
allResults.subList(512, allResults.size()).clear();
}
var rankingData = rankingService.prepareRankingData(rankingContext, new CombinedDocIdList(allResults.toArray()));
int sum = 0;
Instant runEndTime = Instant.now().plus(runTime);
Instant runStartTime = Instant.now();
int sum2 = 0;
List<Double> times = new ArrayList<>();
int iter;
for (iter = 0;; iter++) {
long start = System.nanoTime();
sum2 += rankingService.rankResults(rankingContext, rankingData, false).size();
long end = System.nanoTime();
times.add((end - start)/1_000_000.);
if ((iter % 100) == 0) {
if (Instant.now().isAfter(runEndTime)) {
break;
}
if (times.size() > 100) {
double[] timesSample = times.stream().mapToDouble(Double::doubleValue).skip(times.size() - 100).sorted().toArray();
System.out.format("P1: %f P10: %f, P90: %f, P99: %f\n", timesSample[1], timesSample[10], timesSample[90], timesSample[99]);
}
System.out.println(Duration.between(runStartTime, Instant.now()).toMillis() / 1000. + " best times: " + (allResults.size() / 512.) * times.stream().mapToDouble(Double::doubleValue).sorted().limit(3).average().orElse(-1));
}
}
System.out.println("Benchmark complete after " + iter + " iters!");
System.out.println("Best times: " + (allResults.size() / 512.) * times.stream().mapToDouble(Double::doubleValue).sorted().limit(3).average().orElse(-1));
System.out.println("Warmup sum: " + sum);
System.out.println("Main sum: " + sum2);
System.out.println(rankingData.size());
}
public static void runExecution(Path homeDir,
Path indexDir,
String rawQuery) throws IOException, SQLException, InterruptedException {
CombinedIndexReader indexReader = createCombinedIndexReader(indexDir);
QueryFactory queryFactory = createQueryFactory(homeDir);
IndexResultRankingService rankingService = createIndexResultRankingService(indexDir, indexReader);
var queryLimits = RpcQueryLimits.newBuilder()
.setTimeoutMs(50)
.setResultsTotal(1000)
.setResultsByDomain(10)
.setFetchSize(4096)
.build();
SearchSpecification parsedQuery = queryFactory.createQuery(new QueryParams(rawQuery, queryLimits, "NONE", NsfwFilterTier.OFF, "en"), PrototypeRankingParameters.sensibleDefaults()).specs;
System.out.println("Query compiled to: " + parsedQuery.query.compiledQuery);
System.out.println("Running warmup loop!");
int sum = 0;
Instant runEndTime = Instant.now().plus(runTime);
Instant runStartTime = Instant.now();
int sum2 = 0;
List<Double> rates = new ArrayList<>();
List<Double> times = new ArrayList<>();
int iter;
for (iter = 0;; iter++) {
var execution = new IndexQueryExecution(indexReader, rankingService, SearchContext.create(indexReader, new KeywordHasher.AsciiIsh(), parsedQuery, new SearchSetAny()), 1);
long start = System.nanoTime();
execution.run();
long end = System.nanoTime();
sum2 += execution.itemsProcessed();
rates.add(execution.itemsProcessed() / ((end - start)/1_000_000_000.));
times.add((end - start)/1_000_000.);
indexReader.reset();
if ((iter % 100) == 0) {
if (Instant.now().isAfter(runEndTime)) {
break;
}
if (times.size() > 100) {
double[] timesSample = times.stream().mapToDouble(Double::doubleValue).skip(times.size() - 100).sorted().toArray();
System.out.format("P1: %f P10: %f, P90: %f, P99: %f\n", timesSample[1], timesSample[10], timesSample[90], timesSample[99]);
}
System.out.println(Duration.between(runStartTime, Instant.now()).toMillis() / 1000. + " best rates: " + rates.stream().mapToDouble(Double::doubleValue).map(i -> -i).sorted().map(i -> -i).limit(3).average().orElse(-1));
}
}
System.out.println("Benchmark complete after " + iter + " iters!");
System.out.println("Best counts: " + rates.stream().mapToDouble(Double::doubleValue).map(i -> -i).sorted().map(i -> -i).limit(3).average().orElse(-1));
System.out.println("Warmup sum: " + sum);
System.out.println("Main sum: " + sum2);
}
public static void runLookup(Path homeDir,
Path indexDir,
String rawQuery) throws IOException, SQLException
{
CombinedIndexReader indexReader = createCombinedIndexReader(indexDir);
QueryFactory queryFactory = createQueryFactory(homeDir);
var queryLimits = RpcQueryLimits.newBuilder()
.setTimeoutMs(10_000)
.setResultsTotal(1000)
.setResultsByDomain(10)
.setFetchSize(4096)
.build();
SearchSpecification parsedQuery = queryFactory.createQuery(new QueryParams(rawQuery, queryLimits, "NONE", NsfwFilterTier.OFF, "en"), PrototypeRankingParameters.sensibleDefaults()).specs;
System.out.println("Query compiled to: " + parsedQuery.query.compiledQuery);
SearchContext searchContext = SearchContext.create(indexReader, new KeywordHasher.AsciiIsh(), parsedQuery, new SearchSetAny());
Instant runEndTime = Instant.now().plus(runTime);
LongQueryBuffer buffer = new LongQueryBuffer(512);
int sum1 = 0;
int iter;
Instant runStartTime = Instant.now();
int sum2 = 0;
List<Double> times = new ArrayList<>();
for (iter = 0;; iter++) {
indexReader.reset();
List<IndexQuery> queries = indexReader.createQueries(searchContext);
long start = System.nanoTime();
for (var query : queries) {
while (query.hasMore()) {
query.getMoreResults(buffer);
sum1 += buffer.end;
buffer.reset();
}
}
long end = System.nanoTime();
times.add((end - start)/1_000_000_000.);
if ((iter % 10) == 0) {
if (Instant.now().isAfter(runEndTime)) {
break;
}
if (times.size() > 100) {
double[] timesSample = times.stream().mapToDouble(Double::doubleValue).skip(times.size() - 100).sorted().toArray();
System.out.format("P1: %f P10: %f, P90: %f, P99: %f\n", timesSample[1], timesSample[10], timesSample[90], timesSample[99]);
}
System.out.println(Duration.between(runStartTime, Instant.now()).toMillis() / 1000. + " best times: " + times.stream().mapToDouble(Double::doubleValue).sorted().limit(3).average().orElse(-1));
}
}
System.out.println("Benchmark complete after " + iter + " iters!");
System.out.println("Best times: " + times.stream().mapToDouble(Double::doubleValue).sorted().limit(3).average().orElse(-1));
System.out.println("Warmup sum: " + sum1);
System.out.println("Main sum: " + sum2);
}
}

View File

@@ -1,41 +0,0 @@
plugins {
id 'java'
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:libraries:array')
implementation project(':code:libraries:btree')
implementation project(':code:libraries:coded-sequence')
implementation project(':code:libraries:random-write-funnel')
implementation project(':code:index:query')
implementation project(':code:index:index-journal')
implementation project(':code:common:model')
implementation project(':code:common:service')
implementation project(':code:processes:converting-process:model')
implementation project(':third-party:parquet-floor')
implementation project(':third-party:commons-codec')
implementation libs.bundles.slf4j
implementation libs.slop
implementation libs.fastutil
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
testImplementation project(':code:libraries:test-helpers')
}

View File

@@ -1,69 +0,0 @@
package nu.marginalia.index;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.query.EntrySource;
import static java.lang.Math.min;
public class FullIndexEntrySource implements EntrySource {
private final String name;
private final BTreeReader reader;
int pos;
int endOffset;
final int entrySize;
private final long wordId;
public FullIndexEntrySource(String name,
BTreeReader reader,
int entrySize,
long wordId) {
this.name = name;
this.reader = reader;
this.entrySize = entrySize;
this.wordId = wordId;
pos = 0;
endOffset = pos + entrySize * reader.numEntries();
}
@Override
public void skip(int n) {
pos += n;
}
@Override
public void read(LongQueryBuffer buffer) {
buffer.reset();
buffer.end = min(buffer.end, endOffset - pos);
reader.readData(buffer.data, buffer.end, pos);
pos += buffer.end;
destagger(buffer);
buffer.uniq();
}
private void destagger(LongQueryBuffer buffer) {
if (entrySize == 1)
return;
for (int ri = entrySize, wi=1; ri < buffer.end ; ri+=entrySize, wi++) {
buffer.data.set(wi, buffer.data.get(ri));
}
buffer.end /= entrySize;
}
@Override
public boolean hasMore() {
return pos < endOffset;
}
@Override
public String indexName() {
return name + ":" + Long.toHexString(wordId);
}
}

View File

@@ -1,188 +0,0 @@
package nu.marginalia.index;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.positions.TermData;
import nu.marginalia.index.positions.PositionsFileReader;
import nu.marginalia.index.query.EmptyEntrySource;
import nu.marginalia.index.query.EntrySource;
import nu.marginalia.index.query.ReverseIndexRejectFilter;
import nu.marginalia.index.query.ReverseIndexRetainFilter;
import nu.marginalia.index.query.filter.QueryFilterLetThrough;
import nu.marginalia.index.query.filter.QueryFilterNoPass;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.Executors;
public class FullReverseIndexReader {
private final LongArray words;
private final LongArray documents;
private final long wordsDataOffset;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final BTreeReader wordsBTreeReader;
private final String name;
private final PositionsFileReader positionsFileReader;
public FullReverseIndexReader(String name,
Path words,
Path documents,
PositionsFileReader positionsFileReader) throws IOException {
this.name = name;
this.positionsFileReader = positionsFileReader;
if (!Files.exists(words) || !Files.exists(documents)) {
this.words = null;
this.documents = null;
this.wordsBTreeReader = null;
this.wordsDataOffset = -1;
return;
}
logger.info("Switching reverse index");
this.words = LongArrayFactory.mmapForReadingShared(words);
this.documents = LongArrayFactory.mmapForReadingShared(documents);
wordsBTreeReader = new BTreeReader(this.words, ReverseIndexParameters.wordsBTreeContext, 0);
wordsDataOffset = wordsBTreeReader.getHeader().dataOffsetLongs();
if (getClass().desiredAssertionStatus()) {
if (Boolean.getBoolean("index-self-test")) {
Executors.newSingleThreadExecutor().execute(this::selfTest);
}
}
}
private void selfTest() {
logger.info("Running self test program");
long wordsDataSize = wordsBTreeReader.getHeader().numEntries() * 2L;
var wordsDataRange = words.range(wordsDataOffset, wordsDataOffset + wordsDataSize);
// ReverseIndexSelfTest.runSelfTest1(wordsDataRange, wordsDataSize);
// ReverseIndexSelfTest.runSelfTest2(wordsDataRange, documents);
// ReverseIndexSelfTest.runSelfTest3(wordsDataRange, wordsBTreeReader);
// ReverseIndexSelfTest.runSelfTest4(wordsDataRange, documents);
ReverseIndexSelfTest.runSelfTest5(wordsDataRange, wordsBTreeReader);
ReverseIndexSelfTest.runSelfTest6(wordsDataRange, documents);
}
/** Calculate the offset of the word in the documents.
* If the return-value is negative, the term does not exist
* in the index.
*/
long wordOffset(long termId) {
long idx = wordsBTreeReader.findEntry(termId);
if (idx < 0)
return -1L;
return words.get(wordsDataOffset + idx + 1);
}
public EntrySource documents(long termId) {
if (null == words) {
logger.warn("Reverse index is not ready, dropping query");
return new EmptyEntrySource();
}
long offset = wordOffset(termId);
if (offset < 0) // No documents
return new EmptyEntrySource();
return new FullIndexEntrySource(name, createReaderNew(offset), 2, termId);
}
/** Create a filter step requiring the specified termId to exist in the documents */
public QueryFilterStepIf also(long termId) {
long offset = wordOffset(termId);
if (offset < 0) // No documents
return new QueryFilterNoPass();
return new ReverseIndexRetainFilter(createReaderNew(offset), name, termId);
}
/** Create a filter step requiring the specified termId to be absent from the documents */
public QueryFilterStepIf not(long termId) {
long offset = wordOffset(termId);
if (offset < 0) // No documents
return new QueryFilterLetThrough();
return new ReverseIndexRejectFilter(createReaderNew(offset));
}
/** Return the number of documents with the termId in the index */
public int numDocuments(long termId) {
long offset = wordOffset(termId);
if (offset < 0)
return 0;
return createReaderNew(offset).numEntries();
}
/** Create a BTreeReader for the document offset associated with a termId */
private BTreeReader createReaderNew(long offset) {
return new BTreeReader(
documents,
ReverseIndexParameters.fullDocsBTreeContext,
offset);
}
public TermData[] getTermData(Arena arena,
long termId,
long[] docIds)
{
var ret = new TermData[docIds.length];
long offset = wordOffset(termId);
if (offset < 0) {
// This is likely a bug in the code, but we can't throw an exception here
logger.debug("Missing offset for word {}", termId);
return ret;
}
var reader = createReaderNew(offset);
// Read the size and offset of the position data
var offsets = reader.queryData(docIds, 1);
for (int i = 0; i < docIds.length; i++) {
if (offsets[i] == 0)
continue;
ret[i] = positionsFileReader.getTermData(arena, offsets[i]);
}
return ret;
}
public void close() {
if (documents != null)
documents.close();
if (words != null)
words.close();
if (positionsFileReader != null) {
try {
positionsFileReader.close();
} catch (IOException e) {
logger.error("Failed to close positions file reader", e);
}
}
}
}

View File

@@ -1,33 +0,0 @@
package nu.marginalia.index;
import java.nio.file.Path;
public class ReverseIndexFullFileNames {
public static Path resolve(Path basePath, FileIdentifier identifier, FileVersion version) {
return switch (identifier) {
case WORDS -> switch (version) {
case NEXT -> basePath.resolve("rev-words.dat.next");
case CURRENT -> basePath.resolve("rev-words.dat");
};
case DOCS -> switch (version) {
case NEXT -> basePath.resolve("rev-docs.dat.next");
case CURRENT -> basePath.resolve("rev-docs.dat");
};
case POSITIONS -> switch (version) {
case NEXT -> basePath.resolve("rev-positions.dat.next");
case CURRENT -> basePath.resolve("rev-positions.dat");
};
};
}
public enum FileVersion {
CURRENT,
NEXT,
}
public enum FileIdentifier {
WORDS,
DOCS,
POSITIONS,
}
}

View File

@@ -1,11 +0,0 @@
package nu.marginalia.index;
import nu.marginalia.btree.model.BTreeBlockSize;
import nu.marginalia.btree.model.BTreeContext;
public class ReverseIndexParameters
{
public static final BTreeContext prioDocsBTreeContext = new BTreeContext(5, 1, BTreeBlockSize.BS_2048);
public static final BTreeContext fullDocsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048);
public static final BTreeContext wordsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048);
}

View File

@@ -1,28 +0,0 @@
package nu.marginalia.index;
import java.nio.file.Path;
public class ReverseIndexPrioFileNames {
public static Path resolve(Path basePath, FileIdentifier identifier, FileVersion version) {
return switch (identifier) {
case WORDS -> switch (version) {
case NEXT -> basePath.resolve("rev-prio-words.dat.next");
case CURRENT -> basePath.resolve("rev-prio-words.dat");
};
case DOCS -> switch (version) {
case NEXT -> basePath.resolve("rev-prio-docs.dat.next");
case CURRENT -> basePath.resolve("rev-prio-docs.dat");
};
};
}
public enum FileVersion {
CURRENT,
NEXT
}
public enum FileIdentifier {
WORDS,
DOCS,
}
}

View File

@@ -1,109 +0,0 @@
package nu.marginalia.index;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import nu.marginalia.array.LongArray;
import nu.marginalia.btree.BTreeReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Random;
public class ReverseIndexSelfTest {
private static final Logger logger = LoggerFactory.getLogger(ReverseIndexSelfTest.class);
public static void runSelfTest1(LongArray wordsDataRange, long wordsDataSize) {
logger.info("Starting test 1");
if (!wordsDataRange.isSortedN(2, 0, wordsDataSize))
logger.error("Failed test 1: Words data is not sorted");
else
logger.info("Passed test 1");
}
public static void runSelfTest2(LongArray wordsDataRange, LongArray documents) {
logger.info("Starting test 2");
for (long i = 1; i < wordsDataRange.size(); i+=2) {
var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i));
var header = docsBTreeReader.getHeader();
var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L);
if (!docRange.isSortedN(2, 0, header.numEntries() * 2L)) {
logger.error("Failed test 2: numEntries={}, offset={}", header.numEntries(), header.dataOffsetLongs());
return;
}
}
logger.info("Passed test 2");
}
public static void runSelfTest3(LongArray wordsDataRange, BTreeReader reader) {
logger.info("Starting test 3");
for (long i = 0; i < wordsDataRange.size(); i+=2) {
if (reader.findEntry(wordsDataRange.get(i)) < 0) {
logger.error("Failed Test 3");
return;
}
}
logger.info("Passed test 3");
}
public static void runSelfTest4(LongArray wordsDataRange, LongArray documents) {
logger.info("Starting test 4");
for (long i = 1; i < wordsDataRange.size(); i+=2) {
var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i));
var header = docsBTreeReader.getHeader();
var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L);
for (int j = 0; j < docRange.size(); j+=2) {
if (docsBTreeReader.findEntry(docRange.get(j)) < 0) {
logger.info("Failed test 4");
return;
}
}
}
logger.info("Passed test 4");
}
public static void runSelfTest5(LongArray wordsDataRange, BTreeReader wordsBTreeReader) {
logger.info("Starting test 5");
LongOpenHashSet words = new LongOpenHashSet((int)wordsDataRange.size()/2);
for (int i = 0; i < wordsDataRange.size(); i+=2) {
words.add(wordsDataRange.get(i));
}
var random = new Random();
for (int i = 0; i < 100_000_000; i++) {
long v;
do {
v = random.nextLong();
} while (words.contains(v));
if (wordsBTreeReader.findEntry(v) >= 0) {
logger.error("Failed test 5 @ W{}", v);
return;
}
}
logger.info("Passed test 5");
}
public static void runSelfTest6(LongArray wordsDataRange, LongArray documents) {
logger.info("Starting test 6");
for (long i = 1; i < wordsDataRange.size(); i+=2) {
var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i));
var header = docsBTreeReader.getHeader();
var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L);
Long prev = null;
for (int j = 0; j < docRange.size(); j+=2) {
if (prev == null) {
prev = docRange.get(j);
continue;
}
long thisVal = prev + 1;
long nextVal = docRange.get(j);
while (thisVal < nextVal) {
if (docsBTreeReader.findEntry(thisVal) >= 0) {
logger.info("Failed test 6 @ W{}:D{}", wordsDataRange.get(i-1), thisVal);
return;
}
thisVal++;
}
}
}
logger.info("Passed test 6");
}
}

View File

@@ -1,76 +0,0 @@
package nu.marginalia.index.construction;
import nu.marginalia.index.positions.PositionCodec;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
/** A class for constructing a positions file. This class is thread-safe.
*
* <p></p>
*
* The positions data is concatenated in the file, with each term's metadata
* followed by its positions. The metadata is a single byte, and the positions
* are encoded using the Elias Gamma code, with zero padded bits at the end to
* get octet alignment.
*
* <p></p>
*
* It is the responsibility of the caller to keep track of the byte offset of
* each posting in the file.
*/
public class PositionsFileConstructor implements AutoCloseable {
private final ByteBuffer workBuffer = ByteBuffer.allocate(65536);
private final Path file;
private final FileChannel channel;
private long offset;
public PositionsFileConstructor(Path file) throws IOException {
this.file = file;
channel = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE);
}
/** Add a term to the positions file
* @param termMeta the term metadata
* @param positionsBuffer the positions of the term
* @return the offset of the term in the file, with the size of the data in the highest byte
*/
public long add(byte termMeta, ByteBuffer positionsBuffer) throws IOException {
synchronized (file) {
int size = 1 + positionsBuffer.remaining();
if (workBuffer.remaining() < size) {
workBuffer.flip();
channel.write(workBuffer);
workBuffer.clear();
}
workBuffer.put(termMeta);
workBuffer.put(positionsBuffer);
long ret = PositionCodec.encode(size, offset);
offset += size;
return ret;
}
}
public void close() throws IOException {
if (workBuffer.hasRemaining()) {
workBuffer.flip();
while (workBuffer.hasRemaining())
channel.write(workBuffer);
}
channel.force(false);
channel.close();
}
}

View File

@@ -1,46 +0,0 @@
package nu.marginalia.index.construction.full;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.algo.LongArrayTransformations;
import nu.marginalia.btree.BTreeWriter;
import nu.marginalia.btree.model.BTreeContext;
import java.io.IOException;
/** Constructs the BTrees in a reverse index */
public class FullIndexBTreeTransformer implements LongArrayTransformations.LongIOTransformer {
private final BTreeWriter writer;
private final int entrySize;
private final LongArray documentsArray;
long start = 0;
long writeOffset = 0;
public FullIndexBTreeTransformer(LongArray urlsFileMap,
int entrySize,
BTreeContext bTreeContext,
LongArray documentsArray) {
this.documentsArray = documentsArray;
this.writer = new BTreeWriter(urlsFileMap, bTreeContext);
this.entrySize = entrySize;
}
@Override
public long transform(long pos, long end) throws IOException {
final int size = (int) ((end - start) / entrySize);
if (size == 0) {
return -1;
}
final long offsetForBlock = writeOffset;
writeOffset += writer.write(writeOffset, size,
mapRegion -> mapRegion.transferFrom(documentsArray, start, 0, end - start)
);
start = end;
return offsetForBlock;
}
}

View File

@@ -1,43 +0,0 @@
package nu.marginalia.index.positions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
public class PositionsFileReader implements AutoCloseable {
private final FileChannel positions;
private static final Logger logger = LoggerFactory.getLogger(PositionsFileReader.class);
public PositionsFileReader(Path positionsFile) throws IOException {
this.positions = FileChannel.open(positionsFile, StandardOpenOption.READ);
}
/** Get the positions for a term in the index, as pointed out by the encoded offset;
* intermediate buffers are allocated from the provided arena allocator. */
public TermData getTermData(Arena arena, long sizeEncodedOffset) {
int length = PositionCodec.decodeSize(sizeEncodedOffset);
long offset = PositionCodec.decodeOffset(sizeEncodedOffset);
var segment = arena.allocate(length);
var buffer = segment.asByteBuffer();
try {
positions.read(buffer, offset);
} catch (IOException e) {
throw new RuntimeException(e);
}
return new TermData(buffer);
}
@Override
public void close() throws IOException {
positions.close();
}
}

View File

@@ -1,28 +0,0 @@
package nu.marginalia.index.query;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
public record ReverseIndexRejectFilter(BTreeReader range) implements QueryFilterStepIf {
@Override
public void apply(LongQueryBuffer buffer) {
range.rejectEntries(buffer);
buffer.finalizeFiltering();
}
public boolean test(long id) {
return range.findEntry(id) < 0;
}
@Override
public double cost() {
return range.numEntries();
}
@Override
public String describe() {
return "ReverseIndexRejectFilter[]";
}
}

View File

@@ -1,28 +0,0 @@
package nu.marginalia.index.query;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
public record ReverseIndexRetainFilter(BTreeReader range, String name, long wordId) implements QueryFilterStepIf {
@Override
public void apply(LongQueryBuffer buffer) {
range.retainEntries(buffer);
buffer.finalizeFiltering();
}
public boolean test(long id) {
return range.findEntry(id) >= 0;
}
@Override
public double cost() {
return range.numEntries();
}
@Override
public String describe() {
return "Retain:" + name + "/" + wordId;
}
}

View File

@@ -1,56 +0,0 @@
# Reverse Index
The reverse index contains a mapping from word to document id.
There are two tiers of this index.
* A priority index which only indexes terms that are flagged with priority flags<sup>1</sup>.
* A full index that indexes all terms.
The full index also provides access to term-level metadata, while the priority index is
a binary index that only offers information about which documents have a specific word.
The priority index is also compressed, while the full index at this point is not.
[1] See WordFlags in [common/model](../../common/model/) and
KeywordMetadata in [converting-process/ft-keyword-extraction](../../processes/converting-process/ft-keyword-extraction).
## Construction
The reverse index is constructed by first building a series of preindexes.
Preindexes consist of a Segment and a Documents object. The segment contains
information about which word identifiers are present and how many, and the
documents contain information about in which documents the words can be found.
![Memory layout illustrations](./preindex.svg)
These would typically not fit in RAM, so the index journal is paged
and the preindexes are constructed small enough to fit in memory, and
then merged. Merging sorted arrays is a very fast operation that does
not require additional RAM.
![Illustration of successively merged preindex files](./merging.svg)
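To make the merge step concrete, here is a minimal sketch of a two-way merge of sorted arrays (the production code performs the same operation while streaming between disk-backed arrays rather than allocating the output in memory):

```java
// Illustrative sketch: merging two sorted preindex arrays into one sorted output.
static long[] merge(long[] a, long[] b) {
    long[] out = new long[a.length + b.length];
    int i = 0, j = 0, k = 0;
    while (i < a.length && j < b.length)
        out[k++] = (a[i] <= b[j]) ? a[i++] : b[j++];
    while (i < a.length) out[k++] = a[i++];
    while (j < b.length) out[k++] = b[j++];
    return out;
}
```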
Once merged into one large preindex, indexes are added to the preindex data
to form a finalized reverse index.
![Illustration of the data layout of the finalized index](index.svg)
## Central Classes
Full index:
* [FullPreindex](java/nu/marginalia/index/construction/full/FullPreindex.java) intermediate reverse index state.
* [FullIndexConstructor](java/nu/marginalia/index/construction/full/FullIndexConstructor.java) constructs the index.
* [FullReverseIndexReader](java/nu/marginalia/index/FullReverseIndexReader.java) interrogates the index.
Prio index:
* [PrioPreindex](java/nu/marginalia/index/construction/prio/PrioPreindex.java) intermediate reverse index state.
* [PrioIndexConstructor](java/nu/marginalia/index/construction/prio/PrioIndexConstructor.java) constructs the index.
* [PrioReverseIndexReader](java/nu/marginalia/index/PrioReverseIndexReader.java) interrogates the index.
## See Also
* [index-journal](../index-journal)
* [index-forward](../index-forward)
* [libraries/btree](../../libraries/btree)
* [libraries/array](../../libraries/array)

View File

@@ -1,63 +0,0 @@
package nu.marginalia.index;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.positions.PositionsFileReader;
import nu.marginalia.index.positions.TermData;
import nu.marginalia.sequence.VarintCodedSequence;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import static org.junit.jupiter.api.Assertions.assertEquals;
class PositionsFileReaderTest {
Path file;
@BeforeEach
void setUp() throws IOException {
file = Files.createTempFile("positions", "dat");
}
@AfterEach
void tearDown() throws IOException {
Files.delete(file);
}
@Test
void getTermData() throws IOException {
ByteBuffer workArea = ByteBuffer.allocate(8192);
long key1, key2, key3;
try (PositionsFileConstructor constructor = new PositionsFileConstructor(file)) {
key1 = constructor.add((byte) 43, VarintCodedSequence.generate(1, 2, 3).buffer());
key2 = constructor.add((byte) 51, VarintCodedSequence.generate(2, 3, 5, 1000, 5000, 20241).buffer());
key3 = constructor.add((byte) 61, VarintCodedSequence.generate(3, 5, 7).buffer());
}
System.out.println("key1: " + Long.toHexString(key1));
System.out.println("key2: " + Long.toHexString(key2));
System.out.println("key3: " + Long.toHexString(key3));
try (Arena arena = Arena.ofConfined();
PositionsFileReader reader = new PositionsFileReader(file))
{
TermData data1 = reader.getTermData(arena, key1);
assertEquals(43, data1.flags());
assertEquals(IntList.of( 1, 2, 3), data1.positions().values());
TermData data2 = reader.getTermData(arena, key2);
assertEquals(51, data2.flags());
assertEquals(IntList.of(2, 3, 5, 1000, 5000, 20241), data2.positions().values());
TermData data3 = reader.getTermData(arena, key3);
assertEquals(61, data3.flags());
assertEquals(IntList.of(3, 5, 7), data3.positions().values());
}
}
}


@@ -1,49 +0,0 @@
package nu.marginalia.index;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.btree.BTreeReader;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.Path;
import static org.junit.jupiter.api.Assertions.*;
public class ReverseIndexDebugTest {
@Test
@Disabled // this is a debugging utility
public void debug() throws IOException {
long problemWord = -7909917549851025932L;
long problemDoc = 9079256848846028801L;
var words = LongArrayFactory.mmapForReadingConfined(Path.of("/home/vlofgren/Code/MarginaliaSearch/run/node-1/index/ir/rev-words.dat"));
var documents = LongArrayFactory.mmapForReadingConfined(Path.of("/home/vlofgren/Code/MarginaliaSearch/run/node-1/index/ir/rev-docs.dat"));
var wordsBTreeReader = new BTreeReader(words, ReverseIndexParameters.wordsBTreeContext, 0);
var wordsDataOffset = wordsBTreeReader.getHeader().dataOffsetLongs();
long wordOffset = wordsBTreeReader.findEntry(problemWord);
assertTrue(wordOffset >= 0);
var docsReader = new BTreeReader(documents, ReverseIndexParameters.prioDocsBTreeContext, wordOffset);
// We find problemDoc even though it doesn't exist in the document range
long docOffset = docsReader.findEntry(problemDoc);
assertTrue(docOffset < 0);
// We know it doesn't exist because when we check, we can't find it,
// either by iterating...
var dataRange = docsReader.data();
System.out.println(dataRange.size());
for (int i = 0; i < dataRange.size(); i+=2) {
assertNotEquals(problemDoc, dataRange.get(i));
}
// or by binary searching
assertTrue(dataRange.binarySearchN(2, problemDoc, 0, dataRange.size()) < 0);
}
}


@@ -1,149 +0,0 @@
package nu.marginalia.index.construction.full;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.btree.model.BTreeHeader;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import static nu.marginalia.index.construction.full.TestJournalFactory.EntryDataWithWordMeta;
import static nu.marginalia.index.construction.full.TestJournalFactory.wm;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
class FullPreindexFinalizeTest {
TestJournalFactory journalFactory;
Path positionsFile;
Path countsFile;
Path wordsIdFile;
Path docsFile;
Path tempDir;
@BeforeEach
public void setUp() throws IOException {
journalFactory = new TestJournalFactory();
positionsFile = Files.createTempFile("positions", ".dat");
countsFile = Files.createTempFile("counts", ".dat");
wordsIdFile = Files.createTempFile("words", ".dat");
docsFile = Files.createTempFile("docs", ".dat");
tempDir = Files.createTempDirectory("sort");
}
@AfterEach
public void tearDown() throws IOException {
journalFactory.clear();
Files.deleteIfExists(countsFile);
Files.deleteIfExists(wordsIdFile);
List<Path> contents = new ArrayList<>();
Files.list(tempDir).forEach(contents::add);
for (var tempFile : contents) {
Files.delete(tempFile);
}
Files.delete(tempDir);
}
MurmurHash3_128 hash = new MurmurHash3_128();
long termId(String keyword) {
return hash.hashKeyword(keyword);
}
@Test
public void testFinalizeSimple() throws IOException {
var reader = journalFactory.createReader(new EntryDataWithWordMeta(100, 101, wm(50, 51)));
var preindex = FullPreindex.constructPreindex(reader,
new PositionsFileConstructor(positionsFile),
DocIdRewriter.identity(), tempDir);
preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat"));
preindex.delete();
Path wordsFile = tempDir.resolve("words.dat");
Path docsFile = tempDir.resolve("docs.dat");
assertTrue(Files.exists(wordsFile));
assertTrue(Files.exists(docsFile));
System.out.println(Files.size(wordsFile));
System.out.println(Files.size(docsFile));
var docsArray = LongArrayFactory.mmapForReadingConfined(docsFile);
var wordsArray = LongArrayFactory.mmapForReadingConfined(wordsFile);
var docsHeader = new BTreeHeader(docsArray, 0);
var wordsHeader = new BTreeHeader(wordsArray, 0);
assertEquals(1, docsHeader.numEntries());
assertEquals(1, wordsHeader.numEntries());
assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
assertEquals(termId("50"), wordsArray.get(wordsHeader.dataOffsetLongs()));
}
@Test
public void testFinalizeSimple2x2() throws IOException {
var reader = journalFactory.createReader(
new EntryDataWithWordMeta(100, 101, wm(50, 51)),
new EntryDataWithWordMeta(101, 101, wm(51, 52))
);
var preindex = FullPreindex.constructPreindex(reader,
new PositionsFileConstructor(positionsFile),
DocIdRewriter.identity(), tempDir);
preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat"));
preindex.delete();
Path wordsFile = tempDir.resolve("words.dat");
Path docsFile = tempDir.resolve("docs.dat");
assertTrue(Files.exists(wordsFile));
assertTrue(Files.exists(docsFile));
System.out.println(Files.size(wordsFile));
System.out.println(Files.size(docsFile));
var docsArray = LongArrayFactory.mmapForReadingConfined(docsFile);
var wordsArray = LongArrayFactory.mmapForReadingConfined(wordsFile);
var wordsHeader = new BTreeHeader(wordsArray, 0);
System.out.println(wordsHeader);
assertEquals(2, wordsHeader.numEntries());
long offset1 = wordsArray.get(wordsHeader.dataOffsetLongs() + 1);
long offset2 = wordsArray.get(wordsHeader.dataOffsetLongs() + 3);
assertEquals(termId("50"), wordsArray.get(wordsHeader.dataOffsetLongs()));
assertEquals(termId("50"), wordsArray.get(wordsHeader.dataOffsetLongs()));
BTreeHeader docsHeader;
docsHeader = new BTreeHeader(docsArray, offset1);
System.out.println(docsHeader);
assertEquals(1, docsHeader.numEntries());
assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
docsHeader = new BTreeHeader(docsArray, offset2);
System.out.println(docsHeader);
assertEquals(1, docsHeader.numEntries());
assertEquals(101, docsArray.get(docsHeader.dataOffsetLongs() + 0));
}
}


(image file diff: 21 KiB before and after)


@@ -1,22 +1,24 @@
package nu.marginalia.index.index;
package nu.marginalia.index;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.longs.LongList;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.index.FullReverseIndexReader;
import nu.marginalia.index.PrioReverseIndexReader;
import nu.marginalia.api.searchquery.model.query.SpecificationLimitType;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.forward.ForwardIndexReader;
import nu.marginalia.index.forward.spans.DocumentSpans;
import nu.marginalia.index.model.CombinedDocIdList;
import nu.marginalia.index.model.QueryParams;
import nu.marginalia.index.model.SearchTerms;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexQueryBuilder;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.results.model.ids.TermMetadataList;
import nu.marginalia.index.model.SearchContext;
import nu.marginalia.index.model.TermMetadataList;
import nu.marginalia.index.reverse.FullReverseIndexReader;
import nu.marginalia.index.reverse.IndexLanguageContext;
import nu.marginalia.index.reverse.PrioReverseIndexReader;
import nu.marginalia.index.reverse.query.IndexQuery;
import nu.marginalia.index.reverse.query.IndexSearchBudget;
import nu.marginalia.index.reverse.query.filter.QueryFilterStepIf;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata;
import org.slf4j.Logger;
@@ -28,6 +30,7 @@ import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.function.Predicate;
/** A reader for the combined forward and reverse indexes.
@@ -51,25 +54,33 @@ public class CombinedIndexReader {
this.reverseIndexPriorityReader = reverseIndexPriorityReader;
}
public IndexQueryBuilderImpl newQueryBuilder(IndexQuery query) {
return new IndexQueryBuilderImpl(reverseIndexFullReader, query);
public IndexLanguageContext createLanguageContext(String languageIsoCode) {
return new IndexLanguageContext(languageIsoCode,
reverseIndexFullReader.getWordLexicon(languageIsoCode),
reverseIndexPriorityReader.getWordLexicon(languageIsoCode)
);
}
public QueryFilterStepIf hasWordFull(long termId) {
return reverseIndexFullReader.also(termId);
public IndexQueryBuilder newQueryBuilder(IndexLanguageContext context, IndexQuery query) {
return new IndexQueryBuilder(reverseIndexFullReader, context, query);
}
public QueryFilterStepIf hasWordFull(IndexLanguageContext languageContext, long termId, IndexSearchBudget budget) {
return reverseIndexFullReader.also(languageContext, termId, budget);
}
/** Creates a query builder for terms in the priority index */
public IndexQueryBuilder findPriorityWord(long wordId) {
return newQueryBuilder(new IndexQuery(reverseIndexPriorityReader.documents(wordId)))
.withSourceTerms(wordId);
public IndexQueryBuilder findPriorityWord(IndexLanguageContext languageContext, long wordId) {
IndexQuery query = new IndexQuery(reverseIndexPriorityReader.documents(languageContext, wordId), true);
return newQueryBuilder(languageContext, query).withSourceTerms(wordId);
}
/** Creates a query builder for terms in the full index */
public IndexQueryBuilder findFullWord(long wordId) {
return newQueryBuilder(
new IndexQuery(reverseIndexFullReader.documents(wordId)))
.withSourceTerms(wordId);
public IndexQueryBuilder findFullWord(IndexLanguageContext languageContext, long wordId) {
IndexQuery query = new IndexQuery(reverseIndexFullReader.documents(languageContext, wordId), false);
return newQueryBuilder(languageContext, query).withSourceTerms(wordId);
}
/** Creates a parameter matching filter step for the provided parameters */
@@ -78,21 +89,32 @@ public class CombinedIndexReader {
}
/** Returns the number of occurrences of the word in the full index */
public int numHits(long word) {
return reverseIndexFullReader.numDocuments(word);
public int numHits(IndexLanguageContext languageContext, long word) {
return reverseIndexFullReader.numDocuments(languageContext, word);
}
public List<IndexQuery> createQueries(SearchTerms terms, QueryParams params) {
/** Reset caches and buffers */
public void reset() {
reverseIndexFullReader.reset();
}
public List<IndexQuery> createQueries(SearchContext context) {
if (!isLoaded()) {
logger.warn("Index reader not ready");
return Collections.emptyList();
}
List<IndexQueryBuilder> queryHeads = new ArrayList<>(10);
final IndexLanguageContext languageContext = context.languageContext;
final long[] termPriority = context.sortedDistinctIncludes((a,b) -> {
return Long.compare(
numHits(languageContext, a),
numHits(languageContext, b)
);
});
final long[] termPriority = terms.sortedDistinctIncludes(this::compareKeywords);
List<LongSet> paths = CompiledQueryAggregates.queriesAggregate(terms.compiledQuery());
List<IndexQueryBuilder> queryHeads = new ArrayList<>(10);
List<LongSet> paths = CompiledQueryAggregates.queriesAggregate(context.compiledQueryIds);
// Remove any paths that do not contain all prioritized terms, as this means
// the term is missing from the index and can never be found
@@ -102,37 +124,27 @@ public class CombinedIndexReader {
LongList elements = new LongArrayList(path);
elements.sort((a, b) -> {
for (int i = 0; i < termPriority.length; i++) {
if (termPriority[i] == a)
for (long l : termPriority) {
if (l == a)
return -1;
if (termPriority[i] == b)
if (l == b)
return 1;
}
return 0;
});
if (!SearchTerms.stopWords.contains(elements.getLong(0))) {
var head = findFullWord(elements.getLong(0));
var head = findFullWord(languageContext, elements.getLong(0));
for (int i = 1; i < elements.size(); i++) {
long termId = elements.getLong(i);
// if a stop word is present in the query, skip the step of requiring it to be in the document,
// we'll assume it's there and save IO
if (SearchTerms.stopWords.contains(termId)) {
continue;
}
head.addInclusionFilter(hasWordFull(termId));
}
queryHeads.add(head);
for (int i = 1; i < elements.size(); i++) {
head.addInclusionFilter(hasWordFull(languageContext, elements.getLong(i), context.budget));
}
queryHeads.add(head);
// If there are few paths, we can afford to check the priority index as well
if (paths.size() < 4) {
var prioHead = findPriorityWord(elements.getLong(0));
var prioHead = findPriorityWord(languageContext, elements.getLong(0));
for (int i = 1; i < elements.size(); i++) {
prioHead.addInclusionFilter(hasWordFull(elements.getLong(i)));
prioHead.addInclusionFilter(hasWordFull(languageContext, elements.getLong(i), context.budget));
}
queryHeads.add(prioHead);
}
@@ -142,17 +154,17 @@ public class CombinedIndexReader {
for (var query : queryHeads) {
// Advice terms are a special case, mandatory but not ranked, and exempt from re-writing
for (long term : terms.advice()) {
query = query.also(term);
for (long term : context.termIdsAdvice) {
query = query.also(term, context.budget);
}
for (long term : terms.excludes()) {
query = query.not(term);
for (long term : context.termIdsExcludes) {
query = query.not(term, context.budget);
}
// Run these filter steps last, as they'll worst-case cause as many page faults as there are
// items in the buffer
query.addInclusionFilter(filterForParams(params));
query.addInclusionFilter(filterForParams(context.queryParams));
}
return queryHeads
@@ -166,23 +178,20 @@ public class CombinedIndexReader {
return permittedTerms::containsAll;
}
private int compareKeywords(long a, long b) {
return Long.compare(
numHits(a),
numHits(b)
);
}
/** Returns the number of occurrences of the word in the priority index */
public int numHitsPrio(long word) {
return reverseIndexPriorityReader.numDocuments(word);
public int numHitsPrio(IndexLanguageContext languageContext, long word) {
return reverseIndexPriorityReader.numDocuments(languageContext, word);
}
/** Retrieves the term metadata for the specified word for the provided documents */
public TermMetadataList getTermMetadata(Arena arena,
long wordId,
CombinedDocIdList docIds)
public TermMetadataList[] getTermMetadata(Arena arena,
IndexLanguageContext languageContext,
IndexSearchBudget budget,
long[] wordIds,
CombinedDocIdList docIds)
throws TimeoutException
{
return new TermMetadataList(reverseIndexFullReader.getTermData(arena, wordId, docIds.array()));
return reverseIndexFullReader.getTermData(arena, languageContext, budget, wordIds, docIds);
}
/** Retrieves the document metadata for the specified document */
@@ -205,14 +214,14 @@ public class CombinedIndexReader {
return forwardIndexReader.getDocumentSize(docId);
}
/** Retrieves the document spans for the specified document */
public DocumentSpans getDocumentSpans(Arena arena, long docId) {
return forwardIndexReader.getDocumentSpans(arena, docId);
/** Retrieves the document spans for the specified documents */
public DocumentSpans[] getDocumentSpans(Arena arena, IndexSearchBudget budget, CombinedDocIdList docIds) throws TimeoutException {
return forwardIndexReader.getDocumentSpans(arena, budget, docIds);
}
/** Close the indexes (this is not done immediately)
* */
public void close() throws InterruptedException {
public void close() {
/* Delay the invocation of close method to allow for a clean shutdown of the service.
*
* This is especially important when using Unsafe-based LongArrays, since we have
@@ -227,7 +236,7 @@ public class CombinedIndexReader {
}
private void delayedCall(Runnable call, Duration delay) throws InterruptedException {
private void delayedCall(Runnable call, Duration delay) {
Thread.ofPlatform().start(() -> {
try {
TimeUnit.SECONDS.sleep(delay.toSeconds());
@@ -248,25 +257,47 @@ public class CombinedIndexReader {
class ParamMatchingQueryFilter implements QueryFilterStepIf {
private final QueryParams params;
private final ForwardIndexReader forwardIndexReader;
private final boolean imposesMetaConstraint;
public ParamMatchingQueryFilter(QueryParams params,
ForwardIndexReader forwardIndexReader)
{
this.params = params;
this.forwardIndexReader = forwardIndexReader;
this.imposesMetaConstraint = params.imposesDomainMetadataConstraint();
}
@Override
public void apply(LongQueryBuffer buffer) {
if (!imposesMetaConstraint && !params.searchSet().imposesConstraint()) {
return;
}
while (buffer.hasMore()) {
if (test(buffer.currentValue())) {
buffer.retainAndAdvance();
}
else {
buffer.rejectAndAdvance();
}
}
buffer.finalizeFiltering();
}
public boolean test(long combinedId) {
long docId = UrlIdCodec.removeRank(combinedId);
int domainId = UrlIdCodec.getDomainId(docId);
long meta = forwardIndexReader.getDocMeta(docId);
if (!validateDomain(domainId, meta)) {
if (!validateDomain(domainId)) {
return false;
}
if (!imposesMetaConstraint) {
return true;
}
long meta = forwardIndexReader.getDocMeta(docId);
if (!validateQuality(meta)) {
return false;
}
@@ -286,8 +317,8 @@ class ParamMatchingQueryFilter implements QueryFilterStepIf {
return true;
}
private boolean validateDomain(int domainId, long meta) {
return params.searchSet().contains(domainId, meta);
private boolean validateDomain(int domainId) {
return params.searchSet().contains(domainId);
}
private boolean validateQuality(long meta) {
@@ -338,4 +369,5 @@ class ParamMatchingQueryFilter implements QueryFilterStepIf {
public String describe() {
return getClass().getSimpleName();
}
}


@@ -3,16 +3,18 @@ package nu.marginalia.index;
import com.google.inject.Guice;
import com.google.inject.Inject;
import nu.marginalia.IndexLocations;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.config.IndexFileName;
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.reverse.construction.full.FullIndexConstructor;
import nu.marginalia.index.reverse.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.searchset.DomainRankings;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.mq.MessageQueueFactory;
import nu.marginalia.mqapi.ProcessInboxNames;
import nu.marginalia.mqapi.index.CreateIndexRequest;
import nu.marginalia.mqapi.index.IndexName;
import nu.marginalia.process.ProcessConfiguration;
import nu.marginalia.process.ProcessConfigurationModule;
import nu.marginalia.process.ProcessMainClass;
@@ -25,11 +27,9 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import static nu.marginalia.mqapi.ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX;
public class IndexConstructorMain extends ProcessMainClass {
private final FileStorageService fileStorageService;
private final ProcessHeartbeatImpl heartbeat;
@@ -37,7 +37,7 @@ public class IndexConstructorMain extends ProcessMainClass {
private static final Logger logger = LoggerFactory.getLogger(IndexConstructorMain.class);
public static void main(String[] args) throws Exception {
static void main(String[] args) throws Exception {
Instructions<CreateIndexRequest> instructions = null;
try {
new org.mariadb.jdbc.Driver();
@@ -74,20 +74,20 @@ public class IndexConstructorMain extends ProcessMainClass {
ProcessConfiguration processConfiguration,
DomainRankings domainRankings) {
super(messageQueueFactory, processConfiguration, GsonFactory.get(), INDEX_CONSTRUCTOR_INBOX);
super(messageQueueFactory, processConfiguration, GsonFactory.get(), ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX);
this.fileStorageService = fileStorageService;
this.heartbeat = heartbeat;
this.domainRankings = domainRankings;
}
private void run(CreateIndexRequest instructions) throws SQLException, IOException {
private void run(CreateIndexRequest instructions) throws IOException {
heartbeat.start();
switch (instructions.indexName()) {
case FORWARD -> createForwardIndex();
case REVERSE_FULL -> createFullReverseIndex();
case REVERSE_PRIO -> createPrioReverseIndex();
case IndexName.FORWARD -> createForwardIndex();
case IndexName.REVERSE_FULL -> createFullReverseIndex();
case IndexName.REVERSE_PRIO -> createPrioReverseIndex();
}
heartbeat.shutDown();
@@ -95,50 +95,74 @@ public class IndexConstructorMain extends ProcessMainClass {
private void createFullReverseIndex() throws IOException {
Path outputFileDocs = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.NEXT);
Path outputFileWords = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.NEXT);
Path outputFilePositions = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.NEXT);
Path outputFileDocs = findNextFile(new IndexFileName.FullDocs());
Path outputFilePositions = findNextFile(new IndexFileName.FullPositions());
Files.deleteIfExists(outputFileDocs);
Files.deleteIfExists(outputFilePositions);
Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
Path tmpDir = workDir.resolve("tmp");
if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
var constructor = new FullIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,
this::addRankToIdEncoding,
tmpDir);
Set<String> languageIsoCodes = IndexJournal.findJournal(workDir)
.map(IndexJournal::languages)
.orElseGet(Set::of);
constructor.createReverseIndex(heartbeat, "createReverseIndexFull", workDir);
for (String languageIsoCode : languageIsoCodes) {
Path outputFileWords = findNextFile(new IndexFileName.FullWords(languageIsoCode));
FullIndexConstructor constructor = new FullIndexConstructor(
languageIsoCode,
outputFileDocs,
outputFileWords,
outputFilePositions,
this::addRankToIdEncoding,
tmpDir);
String processName = "createReverseIndexFull[%s]".formatted(languageIsoCode);
constructor.createReverseIndex(heartbeat, processName, workDir);
}
}
private void createPrioReverseIndex() throws IOException {
Path outputFileDocs = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.NEXT);
Path outputFileWords = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.NEXT);
Path outputFileDocs = findNextFile(new IndexFileName.PrioDocs());
Files.deleteIfExists(outputFileDocs);
Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
Path tmpDir = workDir.resolve("tmp");
var constructor = new PrioIndexConstructor(
outputFileDocs,
outputFileWords,
this::addRankToIdEncoding,
tmpDir);
Set<String> languageIsoCodes = IndexJournal.findJournal(workDir)
.map(IndexJournal::languages)
.orElseGet(Set::of);
constructor.createReverseIndex(heartbeat, "createReverseIndexPrio", workDir);
for (String languageIsoCode : languageIsoCodes) {
Path outputFileWords = findNextFile(new IndexFileName.PrioWords(languageIsoCode));
Files.deleteIfExists(outputFileWords);
PrioIndexConstructor constructor = new PrioIndexConstructor(
languageIsoCode,
outputFileDocs,
outputFileWords,
this::addRankToIdEncoding,
tmpDir);
String processName = "createReverseIndexPrio[%s]".formatted(languageIsoCode);
constructor.createReverseIndex(heartbeat, processName, workDir);
}
}
private void createForwardIndex() throws IOException {
Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
Path outputFileDocsId = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT);
Path outputFileDocsData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT);
Path outputFileSpansData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.NEXT);
Path outputFileDocsId = findNextFile(new IndexFileName.ForwardDocIds());
Path outputFileDocsData = findNextFile(new IndexFileName.ForwardDocData());
Path outputFileSpansData = findNextFile(new IndexFileName.ForwardSpansData());
ForwardIndexConverter converter = new ForwardIndexConverter(heartbeat,
outputFileDocsId,
@@ -151,6 +175,10 @@ public class IndexConstructorMain extends ProcessMainClass {
converter.convert();
}
private Path findNextFile(IndexFileName fileName) {
return IndexFileName.resolve(IndexLocations.getCurrentIndex(fileStorageService), fileName, IndexFileName.Version.NEXT);
}
/** Append the domain's ranking to the high bits of a document ID
* to ensure they're sorted in order of rank within the index.
*/


@@ -4,7 +4,7 @@ import com.google.inject.AbstractModule;
import com.google.inject.Provides;
import com.google.inject.Singleton;
import nu.marginalia.IndexLocations;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.searchset.DomainRankings;
import nu.marginalia.storage.FileStorageService;
public class IndexConstructorModule extends AbstractModule {


@@ -3,27 +3,34 @@ package nu.marginalia.index;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.IndexLocations;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.config.IndexFileName;
import nu.marginalia.index.forward.ForwardIndexReader;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.positions.PositionsFileReader;
import nu.marginalia.index.reverse.FullReverseIndexReader;
import nu.marginalia.index.reverse.PrioReverseIndexReader;
import nu.marginalia.index.reverse.WordLexicon;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.storage.FileStorageService;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.List;
@Singleton
public class IndexFactory {
private final FileStorageService fileStorageService;
private final Path liveStorage;
private final LanguageConfiguration languageConfiguration;
@Inject
public IndexFactory(FileStorageService fileStorageService) {
public IndexFactory(FileStorageService fileStorageService, LanguageConfiguration languageConfiguration) {
this.fileStorageService = fileStorageService;
this.liveStorage = IndexLocations.getCurrentIndex(fileStorageService);
this.languageConfiguration = languageConfiguration;
}
public CombinedIndexReader getCombinedIndexReader() throws IOException {
@@ -39,47 +46,78 @@ public class IndexFactory {
}
public FullReverseIndexReader getReverseIndexReader() throws IOException {
Path docsFile = getCurrentPath(new IndexFileName.FullDocs());
Path positionsFile = getCurrentPath(new IndexFileName.FullPositions());
List<WordLexicon> wordLexicons = new ArrayList<>();
for (LanguageDefinition languageDefinition : languageConfiguration.languages()) {
String languageIsoCode = languageDefinition.isoCode();
Path wordsFile = getCurrentPath(new IndexFileName.FullWords(languageIsoCode));
if (Files.exists(wordsFile)) {
wordLexicons.add(new WordLexicon(languageIsoCode, wordsFile));
}
}
return new FullReverseIndexReader("full",
ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.CURRENT),
ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.CURRENT),
new PositionsFileReader(ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.CURRENT))
wordLexicons,
docsFile,
positionsFile
);
}
public PrioReverseIndexReader getReverseIndexPrioReader() throws IOException {
return new PrioReverseIndexReader("prio",
ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.CURRENT),
ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.CURRENT)
);
List<WordLexicon> wordLexicons = new ArrayList<>();
for (LanguageDefinition languageDefinition : languageConfiguration.languages()) {
String languageIsoCode = languageDefinition.isoCode();
Path wordsFile = getCurrentPath(new IndexFileName.PrioWords(languageIsoCode));
if (Files.exists(wordsFile)) {
wordLexicons.add(new WordLexicon(languageIsoCode, wordsFile));
}
}
Path docsFile = getCurrentPath(new IndexFileName.PrioDocs());
return new PrioReverseIndexReader("prio", wordLexicons, docsFile);
}
public ForwardIndexReader getForwardIndexReader() throws IOException {
return new ForwardIndexReader(
ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.CURRENT),
ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.CURRENT),
ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.CURRENT)
);
Path docIdsFile = getCurrentPath(new IndexFileName.ForwardDocIds());
Path docDataFile = getCurrentPath(new IndexFileName.ForwardDocData());
Path spansFile = getCurrentPath(new IndexFileName.ForwardSpansData());
return new ForwardIndexReader(docIdsFile, docDataFile, spansFile);
}
private Path getCurrentPath(IndexFileName fileName) {
return IndexFileName.resolve(liveStorage, fileName, IndexFileName.Version.CURRENT);
}
/** Switches the current index to the next index */
public void switchFiles() throws IOException {
for (var file : ReverseIndexFullFileNames.FileIdentifier.values()) {
for (var file : IndexFileName.forwardIndexFiles()) {
switchFile(
ReverseIndexFullFileNames.resolve(liveStorage, file, ReverseIndexFullFileNames.FileVersion.NEXT),
ReverseIndexFullFileNames.resolve(liveStorage, file, ReverseIndexFullFileNames.FileVersion.CURRENT)
IndexFileName.resolve(liveStorage, file, IndexFileName.Version.NEXT),
IndexFileName.resolve(liveStorage, file, IndexFileName.Version.CURRENT)
);
}
for (var file : ReverseIndexPrioFileNames.FileIdentifier.values()) {
for (IndexFileName file : IndexFileName.revPrioIndexFiles(languageConfiguration)) {
switchFile(
ReverseIndexPrioFileNames.resolve(liveStorage, file, ReverseIndexPrioFileNames.FileVersion.NEXT),
ReverseIndexPrioFileNames.resolve(liveStorage, file, ReverseIndexPrioFileNames.FileVersion.CURRENT)
IndexFileName.resolve(liveStorage, file, IndexFileName.Version.NEXT),
IndexFileName.resolve(liveStorage, file, IndexFileName.Version.CURRENT)
);
}
for (var file : ForwardIndexFileNames.FileIdentifier.values()) {
for (IndexFileName file : IndexFileName.revFullIndexFiles(languageConfiguration)) {
switchFile(
ForwardIndexFileNames.resolve(liveStorage, file, ForwardIndexFileNames.FileVersion.NEXT),
ForwardIndexFileNames.resolve(liveStorage, file, ForwardIndexFileNames.FileVersion.CURRENT)
IndexFileName.resolve(liveStorage, file, IndexFileName.Version.NEXT),
IndexFileName.resolve(liveStorage, file, IndexFileName.Version.CURRENT)
);
}
}


@@ -5,29 +5,19 @@ import com.google.inject.Singleton;
import io.grpc.Status;
import io.grpc.stub.StreamObserver;
import io.prometheus.client.Counter;
import io.prometheus.client.Gauge;
import io.prometheus.client.Histogram;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import nu.marginalia.api.searchquery.IndexApiGrpc;
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
import nu.marginalia.api.searchquery.RpcIndexQuery;
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.model.SearchTerms;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.model.SearchContext;
import nu.marginalia.index.results.IndexResultRankingService;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.searchset.SearchSet;
import nu.marginalia.index.searchset.SearchSetsService;
import nu.marginalia.index.searchset.SmallSearchSet;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.keywords.KeywordHasher;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.service.server.DiscoverableService;
import org.slf4j.Logger;
@@ -35,14 +25,9 @@ import org.slf4j.LoggerFactory;
import org.slf4j.Marker;
import org.slf4j.MarkerFactory;
import java.util.BitSet;
import java.util.HashMap;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.Executor;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.Map;
@Singleton
public class IndexGrpcService
@@ -51,6 +36,7 @@ public class IndexGrpcService
{
private final Logger logger = LoggerFactory.getLogger(getClass());
private final Map<String, KeywordHasher> keywordHasherByLangIso;
// This marker is used to mark sensitive log messages that are related to queries
// so that they can be filtered out in the production logging configuration
@@ -61,11 +47,6 @@ public class IndexGrpcService
.help("Query timeout counter")
.labelNames("node", "api")
.register();
private static final Gauge wmsa_query_cost = Gauge.build()
.name("wmsa_index_query_cost")
.help("Computational cost of query")
.labelNames("node", "api")
.register();
private static final Histogram wmsa_query_time = Histogram.build()
.name("wmsa_index_query_time")
.linearBuckets(0.05, 0.05, 15)
@@ -73,55 +54,60 @@ public class IndexGrpcService
.help("Index-side query time")
.register();
private static final Gauge wmsa_index_query_exec_stall_time = Gauge.build()
.name("wmsa_index_query_exec_stall_time")
.help("Execution stall time")
.labelNames("node")
.register();
private static final Gauge wmsa_index_query_exec_block_time = Gauge.build()
.name("wmsa_index_query_exec_block_time")
.help("Execution stall time")
.labelNames("node")
.register();
private final StatefulIndex statefulIndex;
private final SearchSetsService searchSetsService;
private final IndexResultRankingService resultValuator;
private final IndexResultRankingService rankingService;
private final String nodeName;
private static final int indexValuationThreads = Integer.getInteger("index.valuationThreads", 16);
private final int nodeId;
@Inject
public IndexGrpcService(ServiceConfiguration serviceConfiguration,
LanguageConfiguration languageConfiguration,
StatefulIndex statefulIndex,
SearchSetsService searchSetsService,
IndexResultRankingService resultValuator)
IndexResultRankingService rankingService)
{
var nodeId = serviceConfiguration.node();
this.nodeId = serviceConfiguration.node();
this.nodeName = Integer.toString(nodeId);
this.statefulIndex = statefulIndex;
this.searchSetsService = searchSetsService;
this.resultValuator = resultValuator;
this.rankingService = rankingService;
this.keywordHasherByLangIso = new HashMap<>();
for (LanguageDefinition definition : languageConfiguration.languages()) {
keywordHasherByLangIso.put(definition.isoCode(), definition.keywordHasher());
}
}
// GRPC endpoint
public void query(RpcIndexQuery request,
StreamObserver<RpcDecoratedResultItem> responseObserver) {
try {
var params = new SearchParameters(request, getSearchSet(request));
long endTime = System.currentTimeMillis() + request.getQueryLimits().getTimeoutMs();
KeywordHasher hasher = findHasher(request);
List<RpcDecoratedResultItem> results = wmsa_query_time
.labels(nodeName, "GRPC")
.time(() -> {
// Perform the search
try {
return executeSearch(params);
if (!statefulIndex.isLoaded()) {
// Short-circuit if the index is not loaded, as we trivially know that there can be no results
return List.of();
}
CombinedIndexReader indexReader = statefulIndex.get();
SearchContext rankingContext =
SearchContext.create(indexReader, hasher, request, getSearchSet(request));
IndexQueryExecution queryExecution =
new IndexQueryExecution(indexReader, rankingService, rankingContext, nodeId);
return queryExecution.run();
}
catch (Exception ex) {
logger.error("Error in handling request", ex);
@@ -129,11 +115,6 @@ public class IndexGrpcService
}
});
// Prometheus bookkeeping
wmsa_query_cost
.labels(nodeName, "GRPC")
.set(params.getDataCost());
if (System.currentTimeMillis() >= endTime) {
wmsa_query_timeouts
.labels(nodeName, "GRPC")
@@ -153,11 +134,36 @@ public class IndexGrpcService
}
}
/** Keywords are translated to a numeric format via a 64 bit hash algorithm,
* which varies depending on the language.
*/
private KeywordHasher findHasher(RpcIndexQuery request) {
KeywordHasher hasher = keywordHasherByLangIso.get(request.getLangIsoCode());
if (hasher != null)
return hasher;
hasher = keywordHasherByLangIso.get("en");
if (hasher != null)
return hasher;
throw new IllegalStateException("Could not find fallback keyword hasher for iso code 'en'");
}
// exists for test access
public List<RpcDecoratedResultItem> justQuery(SearchSpecification specsSet) {
try {
return executeSearch(new SearchParameters(specsSet, getSearchSet(specsSet)));
if (!statefulIndex.isLoaded()) {
// Short-circuit if the index is not loaded, as we trivially know that there can be no results
return List.of();
}
CombinedIndexReader currentIndex = statefulIndex.get();
SearchContext context = SearchContext.create(currentIndex,
keywordHasherByLangIso.get("en"), specsSet, getSearchSet(specsSet));
return new IndexQueryExecution(currentIndex, rankingService, context, 1).run();
}
catch (Exception ex) {
logger.error("Error in handling request", ex);
@@ -183,262 +189,6 @@ public class IndexGrpcService
return searchSetsService.getSearchSetByName(request.getSearchSetIdentifier());
}
// accessible for tests
public List<RpcDecoratedResultItem> executeSearch(SearchParameters params) throws Exception {
if (!statefulIndex.isLoaded()) {
// Short-circuit if the index is not loaded, as we trivially know that there can be no results
return List.of();
}
ResultRankingContext rankingContext = createRankingContext(params.rankingParams,
params.compiledQuery,
params.compiledQueryIds);
var queryExecution = new QueryExecution(rankingContext, params.fetchSize);
List<RpcDecoratedResultItem> ret = queryExecution.run(params);
wmsa_index_query_exec_block_time
.labels(nodeName)
.set(queryExecution.getBlockTime() / 1000.);
wmsa_index_query_exec_stall_time
.labels(nodeName)
.set(queryExecution.getStallTime() / 1000.);
return ret;
}
/** This class is responsible for ranking the results and adding the best results to the
* resultHeap, which depending on the state of the indexLookup threads may or may not block
*/
private ResultRankingContext createRankingContext(RpcResultRankingParameters rankingParams,
CompiledQuery<String> compiledQuery,
CompiledQueryLong compiledQueryIds)
{
int[] full = new int[compiledQueryIds.size()];
int[] prio = new int[compiledQueryIds.size()];
BitSet ngramsMask = new BitSet(compiledQuery.size());
BitSet regularMask = new BitSet(compiledQuery.size());
var currentIndex = statefulIndex.get();
for (int idx = 0; idx < compiledQueryIds.size(); idx++) {
long id = compiledQueryIds.at(idx);
full[idx] = currentIndex.numHits(id);
prio[idx] = currentIndex.numHitsPrio(id);
if (compiledQuery.at(idx).contains("_")) {
ngramsMask.set(idx);
}
else {
regularMask.set(idx);
}
}
return new ResultRankingContext(currentIndex.totalDocCount(),
rankingParams,
ngramsMask,
regularMask,
new CqDataInt(full),
new CqDataInt(prio));
}
/** This class is responsible for executing a search query. It uses a thread pool to
* execute the subqueries and their valuation in parallel. The results are then combined
* into a bounded priority queue, and finally the best results are returned.
*/
private class QueryExecution {
private static final Executor workerPool = Executors.newCachedThreadPool();
/** The queue where the results from the index lookup threads are placed,
* pending ranking by the result ranker threads */
private final ArrayBlockingQueue<CombinedDocIdList> resultCandidateQueue
= new ArrayBlockingQueue<>(64);
private final ResultPriorityQueue resultHeap;
private final ResultRankingContext resultRankingContext;
private final AtomicInteger remainingIndexTasks = new AtomicInteger(0);
private final AtomicInteger remainingValuationTasks = new AtomicInteger(0);
private final AtomicLong blockTime = new AtomicLong(0);
private final AtomicLong stallTime = new AtomicLong(0);
public long getStallTime() {
return stallTime.get();
}
public long getBlockTime() {
return blockTime.get();
}
private QueryExecution(ResultRankingContext resultRankingContext, int maxResults) {
this.resultRankingContext = resultRankingContext;
this.resultHeap = new ResultPriorityQueue(maxResults);
}
/** Execute a search query */
public List<RpcDecoratedResultItem> run(SearchParameters parameters) throws Exception {
var terms = new SearchTerms(parameters.query, parameters.compiledQueryIds);
var currentIndex = statefulIndex.get();
for (var indexQuery : currentIndex.createQueries(terms, parameters.queryParams)) {
workerPool.execute(new IndexLookup(indexQuery, parameters.budget));
}
for (int i = 0; i < indexValuationThreads; i++) {
workerPool.execute(new ResultRanker(parameters, resultRankingContext));
}
// Wait for all tasks to complete
awaitCompletion();
// Return the best results
return resultValuator.selectBestResults(parameters, resultRankingContext, resultHeap);
}
/** Wait for all tasks to complete */
private void awaitCompletion() throws InterruptedException {
synchronized (remainingValuationTasks) {
while (remainingValuationTasks.get() > 0) {
remainingValuationTasks.wait(20);
}
}
}
/** This class is responsible for executing a subquery and adding the results to the
* resultCandidateQueue, which depending on the state of the valuator threads may
* or may not block */
class IndexLookup implements Runnable {
private final IndexQuery query;
private final IndexSearchBudget budget;
IndexLookup(IndexQuery query,
IndexSearchBudget budget) {
this.query = query;
this.budget = budget;
remainingIndexTasks.incrementAndGet();
}
public void run() {
try {
executeSearch();
}
catch (Exception ex) {
logger.error("Error in index lookup", ex);
}
finally {
synchronized (remainingIndexTasks) {
if (remainingIndexTasks.decrementAndGet() == 0) {
remainingIndexTasks.notifyAll();
}
}
}
}
private void executeSearch() {
final LongArrayList results = new LongArrayList(16);
// These queries are different indices for one subquery
final LongQueryBuffer buffer = new LongQueryBuffer(4096);
while (query.hasMore() && budget.hasTimeLeft())
{
buffer.reset();
query.getMoreResults(buffer);
for (int i = 0; i < buffer.end; i+=16) {
for (int j = 0; j < Math.min(buffer.end - i, 16); j++) {
results.add(buffer.data.get(i+j));
}
enqueueResults(new CombinedDocIdList(results));
results.clear();
}
}
buffer.dispose();
}
private void enqueueResults(CombinedDocIdList resultIds) {
long remainingTime = budget.timeLeft();
try {
if (!resultCandidateQueue.offer(resultIds)) {
long start = System.currentTimeMillis();
resultCandidateQueue.offer(resultIds, remainingTime, TimeUnit.MILLISECONDS);
blockTime.addAndGet(System.currentTimeMillis() - start);
}
}
catch (InterruptedException e) {
logger.warn("Interrupted while waiting to offer resultIds to queue", e);
}
}
}
class ResultRanker implements Runnable {
private final SearchParameters parameters;
private final ResultRankingContext rankingContext;
ResultRanker(SearchParameters parameters, ResultRankingContext rankingContext) {
this.parameters = parameters;
this.rankingContext = rankingContext;
remainingValuationTasks.incrementAndGet();
}
public void run() {
try {
while (parameters.budget.timeLeft() > 0 && execute());
}
catch (InterruptedException e) {
logger.warn("Interrupted while waiting to poll resultIds from queue", e);
}
catch (Exception e) {
logger.error("Exception while ranking results", e);
}
finally {
synchronized (remainingValuationTasks) {
if (remainingValuationTasks.decrementAndGet() == 0)
remainingValuationTasks.notifyAll();
}
}
}
private boolean execute() throws Exception {
long start = System.currentTimeMillis();
// Do a relatively short poll to ensure we terminate in a timely manner
// in the event all work is done
final long pollTime = Math.clamp(parameters.budget.timeLeft(), 1, 5);
CombinedDocIdList resultIds = resultCandidateQueue.poll(pollTime, TimeUnit.MILLISECONDS);
if (resultIds == null) {
// check if we are done and can terminate
if (remainingIndexTasks.get() == 0 && resultCandidateQueue.isEmpty()) {
return false;
}
}
else {
stallTime.addAndGet(System.currentTimeMillis() - start);
resultHeap.addAll(
resultValuator.rankResults(parameters, false, rankingContext, resultIds)
);
}
return true; // keep going
}
}
}
}


@@ -1,14 +1,14 @@
package nu.marginalia.index.index;
package nu.marginalia.index;
import java.util.List;
import gnu.trove.set.hash.TLongHashSet;
import nu.marginalia.index.FullReverseIndexReader;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexQueryBuilder;
import nu.marginalia.index.query.filter.QueryFilterAnyOf;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import nu.marginalia.index.reverse.FullReverseIndexReader;
import nu.marginalia.index.reverse.IndexLanguageContext;
import nu.marginalia.index.reverse.query.IndexQuery;
import nu.marginalia.index.reverse.query.IndexSearchBudget;
import nu.marginalia.index.reverse.query.filter.QueryFilterStepIf;
public class IndexQueryBuilderImpl implements IndexQueryBuilder {
public class IndexQueryBuilder {
private final IndexLanguageContext context;
private final IndexQuery query;
private final FullReverseIndexReader reverseIndexFullReader;
@@ -20,8 +20,9 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder {
* */
private final TLongHashSet alreadyConsideredTerms = new TLongHashSet();
IndexQueryBuilderImpl(FullReverseIndexReader reverseIndexFullReader, IndexQuery query)
IndexQueryBuilder(FullReverseIndexReader reverseIndexFullReader, IndexLanguageContext context, IndexQuery query)
{
this.context = context;
this.query = query;
this.reverseIndexFullReader = reverseIndexFullReader;
}
@@ -32,18 +33,18 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder {
return this;
}
public IndexQueryBuilder also(long termId) {
public IndexQueryBuilder also(long termId, IndexSearchBudget budget) {
if (alreadyConsideredTerms.add(termId)) {
query.addInclusionFilter(reverseIndexFullReader.also(termId));
query.addInclusionFilter(reverseIndexFullReader.also(context, termId, budget));
}
return this;
}
public IndexQueryBuilder not(long termId) {
public IndexQueryBuilder not(long termId, IndexSearchBudget budget) {
query.addInclusionFilter(reverseIndexFullReader.not(termId));
query.addInclusionFilter(reverseIndexFullReader.not(context, termId, budget));
return this;
}
@@ -55,20 +56,6 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder {
return this;
}
public IndexQueryBuilder addInclusionFilterAny(List<QueryFilterStepIf> filterSteps) {
if (filterSteps.isEmpty())
return this;
if (filterSteps.size() == 1) {
query.addInclusionFilter(filterSteps.getFirst());
}
else {
query.addInclusionFilter(new QueryFilterAnyOf(filterSteps));
}
return this;
}
public IndexQuery build() {
return query;
}


@@ -0,0 +1,252 @@
package nu.marginalia.index;
import io.prometheus.client.Gauge;
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.model.CombinedDocIdList;
import nu.marginalia.index.model.SearchContext;
import nu.marginalia.index.results.IndexResultRankingService;
import nu.marginalia.index.reverse.query.IndexQuery;
import nu.marginalia.index.reverse.query.IndexSearchBudget;
import nu.marginalia.skiplist.SkipListConstants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.*;
/** Performs an index query */
public class IndexQueryExecution {
private static final int indexValuationThreads = Integer.getInteger("index.valuationThreads", 8);
private static final int indexPreparationThreads = Integer.getInteger("index.preparationThreads", 2);
// Since most NVMe drives have a maximum read size of 128 KB, and most small reads are 512 B,
// this should probably be 128*1024 / 512 = 256 to reduce queue depth and optimize tail latency
private static final int evaluationBatchSize = 256;
// This should probably be SkipListConstants.BLOCK_SIZE / 16 in order to reduce the number of unnecessary read
// operations per lookup and again optimize tail latency
private static final int lookupBatchSize = SkipListConstants.BLOCK_SIZE / 16;
private static final ExecutorService threadPool =
new ThreadPoolExecutor(indexValuationThreads, 256,
60L, TimeUnit.SECONDS, new SynchronousQueue<>());
private static final Logger log = LoggerFactory.getLogger(IndexQueryExecution.class);
private final String nodeName;
private final IndexResultRankingService rankingService;
private final SearchContext rankingContext;
private final List<IndexQuery> queries;
private final IndexSearchBudget budget;
private final ResultPriorityQueue resultHeap;
private final CountDownLatch lookupCountdown;
private final CountDownLatch preparationCountdown;
private final CountDownLatch rankingCountdown;
private final ArrayBlockingQueue<CombinedDocIdList> fullPreparationQueue = new ArrayBlockingQueue<>(1);
private final ArrayBlockingQueue<CombinedDocIdList> priorityPreparationQueue = new ArrayBlockingQueue<>(1);
private final ArrayBlockingQueue<IndexResultRankingService.RankingData> fullEvaluationQueue = new ArrayBlockingQueue<>(32);
private final ArrayBlockingQueue<IndexResultRankingService.RankingData> priorityEvaluationQueue = new ArrayBlockingQueue<>(32);
private final int limitTotal;
private final int limitByDomain;
private static final Gauge metric_index_lookup_time_s = Gauge.build()
.labelNames("node")
.name("index_exec_lookup_time_s")
.help("Time in query spent on lookups")
.register();
private static final Gauge metric_index_prep_time_s = Gauge.build()
.labelNames("node")
.name("index_exec_prep_time_s")
.help("Time in query spent retrieving positions and spans")
.register();
private static final Gauge metric_index_rank_time_s = Gauge.build()
.labelNames("node")
.name("index_exec_ranking_time_s")
.help("Time in query spent on ranking")
.register();
private static final Gauge metric_index_documents_ranked = Gauge.build()
.labelNames("node")
.name("index_exec_documents_ranked")
.help("Number of documents ranked")
.register();
public IndexQueryExecution(CombinedIndexReader currentIndex,
IndexResultRankingService rankingService,
SearchContext rankingContext,
int serviceNode) {
this.nodeName = Integer.toString(serviceNode);
this.rankingService = rankingService;
this.rankingContext = rankingContext;
resultHeap = new ResultPriorityQueue(rankingContext.fetchSize);
budget = rankingContext.budget;
limitByDomain = rankingContext.limitByDomain;
limitTotal = rankingContext.limitTotal;
queries = currentIndex.createQueries(rankingContext);
lookupCountdown = new CountDownLatch(queries.size());
preparationCountdown = new CountDownLatch(indexPreparationThreads * 2);
rankingCountdown = new CountDownLatch(indexValuationThreads * 2);
}
public List<RpcDecoratedResultItem> run() throws InterruptedException, SQLException {
for (IndexQuery query : queries) {
threadPool.submit(() -> lookup(query));
}
for (int i = 0; i < indexPreparationThreads; i++) {
threadPool.submit(() -> prepare(priorityPreparationQueue, priorityEvaluationQueue));
threadPool.submit(() -> prepare(fullPreparationQueue, fullEvaluationQueue));
}
// Spawn lookup tasks for each query
for (int i = 0; i < indexValuationThreads; i++) {
threadPool.submit(() -> evaluate(priorityEvaluationQueue));
threadPool.submit(() -> evaluate(fullEvaluationQueue));
}
// Await lookup task termination
lookupCountdown.await();
preparationCountdown.await();
rankingCountdown.await();
// Deallocate any leftover ranking data buffers
for (var data : priorityEvaluationQueue) {
data.close();
}
for (var data : fullEvaluationQueue) {
data.close();
}
metric_index_documents_ranked
.labels(nodeName)
.inc(1000. * resultHeap.getItemsProcessed() / budget.getLimitTime());
// Final result selection
return rankingService.selectBestResults(limitByDomain, limitTotal, rankingContext, resultHeap.toList());
}
private List<Future<?>> lookup(IndexQuery query) {
final LongQueryBuffer buffer = new LongQueryBuffer(lookupBatchSize);
List<Future<?>> evaluationJobs = new ArrayList<>();
try {
while (query.hasMore() && budget.hasTimeLeft()) {
buffer.zero();
long st = System.nanoTime();
query.getMoreResults(buffer);
long et = System.nanoTime();
metric_index_lookup_time_s
.labels(nodeName)
.inc((et - st)/1_000_000_000.);
if (buffer.isEmpty())
continue;
var queue = query.isPrioritized() ? priorityPreparationQueue : fullPreparationQueue;
if (buffer.end <= evaluationBatchSize) {
var docIds = new CombinedDocIdList(buffer);
if (!queue.offer(docIds, Math.max(1, budget.timeLeft()), TimeUnit.MILLISECONDS))
break;
}
else {
long[] bufferData = buffer.copyData();
for (int start = 0; start < bufferData.length; start+= evaluationBatchSize) {
long[] slice = Arrays.copyOfRange(bufferData, start,
Math.min(start + evaluationBatchSize, bufferData.length));
var docIds = new CombinedDocIdList(slice);
if (!queue.offer(docIds, Math.max(1, budget.timeLeft()), TimeUnit.MILLISECONDS))
break;
}
}
}
} catch (RuntimeException | InterruptedException ex) {
log.error("Exception in lookup thread", ex);
} finally {
buffer.dispose();
lookupCountdown.countDown();
}
return evaluationJobs;
}
private void prepare(ArrayBlockingQueue<CombinedDocIdList> inputQueue, ArrayBlockingQueue<IndexResultRankingService.RankingData> outputQueue) {
try {
while (budget.hasTimeLeft() && (lookupCountdown.getCount() > 0 || !inputQueue.isEmpty())) {
var docIds = inputQueue.poll(Math.clamp(budget.timeLeft(), 1, 5), TimeUnit.MILLISECONDS);
if (docIds == null) continue;
long st = System.nanoTime();
var preparedData = rankingService.prepareRankingData(rankingContext, docIds);
long et = System.nanoTime();
metric_index_prep_time_s
.labels(nodeName)
.inc((et - st)/1_000_000_000.);
if (!outputQueue.offer(preparedData, Math.max(1, budget.timeLeft()), TimeUnit.MILLISECONDS))
preparedData.close();
}
} catch (TimeoutException ex) {
// This is normal
} catch (Exception ex) {
if (!(ex.getCause() instanceof InterruptedException)) {
log.error("Exception in lookup thread", ex);
} // suppress logging for interrupted ex
} finally {
preparationCountdown.countDown();
}
}
private void evaluate(ArrayBlockingQueue<IndexResultRankingService.RankingData> queue) {
try {
while (budget.hasTimeLeft() && (preparationCountdown.getCount() > 0 || !queue.isEmpty())) {
var rankingData = queue.poll(Math.clamp(budget.timeLeft(), 1, 5), TimeUnit.MILLISECONDS);
if (rankingData == null) continue;
try (rankingData) {
long st = System.nanoTime();
resultHeap.addAll(rankingService.rankResults(rankingContext, rankingData, false));
long et = System.nanoTime();
metric_index_rank_time_s
.labels(nodeName)
.inc((et - st)/1_000_000_000.);
}
}
} catch (Exception ex) {
if (!(ex.getCause() instanceof InterruptedException)) {
log.error("Exception in lookup thread", ex);
} // suppress logging for interrupted ex
} finally {
rankingCountdown.countDown();
}
}
public int itemsProcessed() {
return resultHeap.getItemsProcessed();
}
}
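// A minimal, self-contained sketch of the coordination pattern used above: a lookup stage
// feeding bounded ArrayBlockingQueues, worker stages that drain them until the upstream
// CountDownLatch reaches zero and the queue is empty, and a final await on all latches.
// Everything here (PipelineSketch, the queue capacity, the batch values) is illustrative
// and not part of the MarginaliaSearch codebase.
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

class PipelineSketch {
    public static void main(String[] args) throws InterruptedException {
        ArrayBlockingQueue<long[]> prepared = new ArrayBlockingQueue<>(8);
        CountDownLatch lookupDone = new CountDownLatch(1);
        CountDownLatch rankDone = new CountDownLatch(2);
        ExecutorService pool = Executors.newFixedThreadPool(3);

        // "lookup" stage: produces batches of candidate ids
        pool.submit(() -> {
            try {
                for (int batch = 0; batch < 4; batch++) {
                    prepared.offer(new long[]{batch, batch + 1}, 100, TimeUnit.MILLISECONDS);
                }
            } catch (InterruptedException ignored) {
            } finally {
                lookupDone.countDown();
            }
        });

        // "ranking" stage: two workers drain the queue; the loop condition mirrors
        // (lookupCountdown.getCount() > 0 || !inputQueue.isEmpty()) in the class above
        for (int i = 0; i < 2; i++) {
            pool.submit(() -> {
                try {
                    while (lookupDone.getCount() > 0 || !prepared.isEmpty()) {
                        long[] batch = prepared.poll(5, TimeUnit.MILLISECONDS);
                        if (batch == null) continue;
                        System.out.println("ranked a batch of " + batch.length + " ids");
                    }
                } catch (InterruptedException ignored) {
                } finally {
                    rankDone.countDown();
                }
            });
        }

        lookupDone.await();
        rankDone.await();
        pool.shutdown();
    }
}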


@@ -1,5 +1,6 @@
package nu.marginalia.index;
import com.google.common.collect.MinMaxPriorityQueue;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import org.jetbrains.annotations.NotNull;
@@ -9,108 +10,52 @@ import java.util.*;
/** A priority queue for search results. This class is not thread-safe,
* in general, except for concurrent use of the addAll method.
* <p></p>
* The class implements a subset of the Collection interface, and
* is intended to be used as a priority queue for search results,
* with a maximum size.
* <p></p>
* Since the expected use case is to add a large number of items
* and then iterate over the items, the class is optimized for
* this scenario, and does not implement other mutating methods
* than addAll().
*/
public class ResultPriorityQueue implements Iterable<SearchResultItem>,
Collection<SearchResultItem> {
private final int limit;
private final ArrayList<SearchResultItem> backingList = new ArrayList<>();
public class ResultPriorityQueue implements Iterable<SearchResultItem> {
private final LongOpenHashSet idsInSet = new LongOpenHashSet();
private final MinMaxPriorityQueue<SearchResultItem> queue;
private int itemsProcessed = 0;
public ResultPriorityQueue(int limit) {
this.limit = limit;
this.queue = MinMaxPriorityQueue.<SearchResultItem>orderedBy(Comparator.naturalOrder()).maximumSize(limit).create();
}
public Iterator<SearchResultItem> iterator() {
return backingList.iterator();
}
@NotNull
@Override
public Object[] toArray() {
return backingList.toArray();
}
@NotNull
@Override
public <T> T[] toArray(@NotNull T[] a) {
return backingList.toArray(a);
}
@Override
public boolean add(SearchResultItem searchResultItem) {
throw new UnsupportedOperationException("Use addAll instead");
}
@Override
public boolean remove(Object o) {
throw new UnsupportedOperationException();
}
@Override
public boolean containsAll(@NotNull Collection<?> c) {
return idsInSet.containsAll(c);
public @NotNull Iterator<SearchResultItem> iterator() {
return queue.iterator();
}
/** Adds all items to the queue, and returns true if any items were added.
* This is a thread-safe operation.
*/
@Override
public synchronized boolean addAll(@NotNull Collection<? extends SearchResultItem> items) {
boolean itemsAdded = false;
for (var item: items) {
if (idsInSet.add(item.getDocumentId())) {
backingList.add(item);
itemsAdded = true;
}
}
if (!itemsAdded) {
return false;
}
itemsProcessed+=items.size();
backingList.sort(Comparator.naturalOrder());
if (backingList.size() > limit) {
backingList.subList(limit, backingList.size()).clear();
for (var item : items) {
if (idsInSet.add(item.getDocumentId())) {
queue.add(item);
}
}
return true;
}
@Override
public boolean removeAll(@NotNull Collection<?> c) {
throw new UnsupportedOperationException();
}
@Override
public boolean retainAll(@NotNull Collection<?> c) {
throw new UnsupportedOperationException();
}
@Override
public void clear() {
backingList.clear();
idsInSet.clear();
public synchronized List<SearchResultItem> toList() {
return new ArrayList<>(queue);
}
public int size() {
return backingList.size();
return queue.size();
}
public int getItemsProcessed() {
return itemsProcessed;
}
@Override
public boolean isEmpty() {
return backingList.isEmpty();
}
@Override
public boolean contains(Object o) {
return backingList.contains(o);
return queue.isEmpty();
}
}
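// A minimal sketch of the mechanism the class above combines: a Guava MinMaxPriorityQueue
// capped at a maximum size plus a LongOpenHashSet that drops duplicate document ids.
// The element type (plain Long) and the id values are illustrative only.
import com.google.common.collect.MinMaxPriorityQueue;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import java.util.Comparator;

class BoundedDedupQueueSketch {
    public static void main(String[] args) {
        int limit = 3;
        LongOpenHashSet seen = new LongOpenHashSet();
        MinMaxPriorityQueue<Long> queue = MinMaxPriorityQueue
                .<Long>orderedBy(Comparator.naturalOrder())
                .maximumSize(limit)
                .create();

        long[] incoming = {42, 17, 42, 99, 5, 17};
        for (long id : incoming) {
            if (seen.add(id)) {   // duplicates are rejected here
                queue.add(id);    // the worst element is evicted once the cap is exceeded
            }
        }
        System.out.println(queue); // at most `limit` elements remain
    }
}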


@@ -1,8 +1,7 @@
package nu.marginalia.index.index;
package nu.marginalia.index;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.index.IndexFactory;
import nu.marginalia.service.control.ServiceEventLog;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
@@ -35,6 +34,13 @@ public class StatefulIndex {
this.eventLog = eventLog;
}
/** For use in testing only */
public StatefulIndex(CombinedIndexReader combinedIndexReader) {
this.combinedIndexReader = combinedIndexReader;
this.servicesFactory = null;
this.eventLog = null;
}
public void init() {
Lock lock = indexReplacementLock.writeLock();


@@ -1,4 +1,4 @@
package nu.marginalia.index.forward;
package nu.marginalia.index.config;
public class ForwardIndexParameters {
public static final int ENTRY_SIZE = 3;


@@ -0,0 +1,97 @@
package nu.marginalia.index.config;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.LanguageDefinition;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
public sealed interface IndexFileName {
enum Version {
CURRENT, NEXT
}
record FullWords(String languageIsoCode) implements IndexFileName {}
record FullDocs() implements IndexFileName {}
record FullPositions() implements IndexFileName {}
record PrioWords(String languageIsoCode) implements IndexFileName {}
record PrioDocs() implements IndexFileName {}
record ForwardDocIds() implements IndexFileName { }
record ForwardDocData() implements IndexFileName { }
record ForwardSpansData() implements IndexFileName { }
static List<IndexFileName> revFullIndexFiles(LanguageConfiguration languageConfiguration) {
List<IndexFileName> ret = new ArrayList<>();
ret.add(new FullDocs());
ret.add(new FullPositions());
for (LanguageDefinition ld : languageConfiguration.languages()) {
ret.add(new FullWords(ld.isoCode()));
}
return ret;
}
static List<IndexFileName> revPrioIndexFiles(LanguageConfiguration languageConfiguration) {
List<IndexFileName> ret = new ArrayList<>();
ret.add(new PrioDocs());
for (LanguageDefinition ld : languageConfiguration.languages()) {
ret.add(new PrioWords(ld.isoCode()));
}
return ret;
}
static List<IndexFileName> forwardIndexFiles() {
return List.of(
new ForwardDocData(),
new ForwardDocIds(),
new ForwardSpansData()
);
}
static Path resolve(Path basePath, IndexFileName fileName, Version version) {
return switch (fileName) {
case FullWords(String isoCode) -> switch (version) {
case CURRENT -> basePath.resolve("rev-words-%s.dat".formatted(isoCode));
case NEXT -> basePath.resolve("rev-words-%s.dat.next".formatted(isoCode));
};
case FullDocs() -> switch (version) {
case CURRENT -> basePath.resolve("rev-docs.dat");
case NEXT -> basePath.resolve("rev-docs.dat.next");
};
case FullPositions() -> switch (version) {
case CURRENT -> basePath.resolve("rev-positions.dat");
case NEXT -> basePath.resolve("rev-positions.dat.next");
};
case PrioWords(String languageIsoCode) -> switch (version) {
case CURRENT -> basePath.resolve("rev-prio-words-%s.dat".formatted(languageIsoCode));
case NEXT -> basePath.resolve("rev-prio-words-%s.dat.next".formatted(languageIsoCode));
};
case PrioDocs() -> switch (version) {
case CURRENT -> basePath.resolve("rev-prio-docs.dat");
case NEXT -> basePath.resolve("rev-prio-docs.dat.next");
};
case ForwardDocIds() -> switch (version) {
case CURRENT -> basePath.resolve("fwd-doc-ids.dat");
case NEXT -> basePath.resolve("fwd-doc-ids.dat.next");
};
case ForwardDocData() -> switch (version) {
case CURRENT -> basePath.resolve("fwd-doc-data.dat");
case NEXT -> basePath.resolve("fwd-doc-data.dat.next");
};
case ForwardSpansData() -> switch (version) {
case CURRENT -> basePath.resolve("fwd-spans.dat");
case NEXT -> basePath.resolve("fwd-spans.dat.next");
};
};
}
}
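// A small usage sketch of the naming scheme above: resolve() maps each logical index file
// (per-language for the keyword lexicons) to a concrete path, with Version.NEXT giving the
// ".next" variant (presumably the file staged during index construction). The base directory
// and the "en" iso code are illustrative values only.
import nu.marginalia.index.config.IndexFileName;

import java.nio.file.Path;

class IndexFileNameSketch {
    public static void main(String[] args) {
        Path base = Path.of("/var/lib/index");

        Path current = IndexFileName.resolve(base, new IndexFileName.FullWords("en"), IndexFileName.Version.CURRENT);
        Path next = IndexFileName.resolve(base, new IndexFileName.FullWords("en"), IndexFileName.Version.NEXT);

        System.out.println(current); // .../rev-words-en.dat
        System.out.println(next);    // .../rev-words-en.dat.next
    }
}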


@@ -0,0 +1,9 @@
package nu.marginalia.index.config;
import nu.marginalia.btree.model.BTreeBlockSize;
import nu.marginalia.btree.model.BTreeContext;
public class ReverseIndexParameters
{
public static final BTreeContext wordsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_512);
}


@@ -1,9 +1,13 @@
package nu.marginalia.index.forward;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.ffi.LinuxSystemCalls;
import nu.marginalia.index.forward.spans.DocumentSpans;
import nu.marginalia.index.forward.spans.ForwardIndexSpansReader;
import nu.marginalia.index.forward.spans.IndexSpansReader;
import nu.marginalia.index.model.CombinedDocIdList;
import nu.marginalia.index.reverse.query.IndexSearchBudget;
import nu.marginalia.model.id.UrlIdCodec;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -12,8 +16,9 @@ import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.TimeoutException;
import static nu.marginalia.index.forward.ForwardIndexParameters.*;
import static nu.marginalia.index.config.ForwardIndexParameters.*;
/** Reads the forward index.
* <p/>
@@ -22,16 +27,15 @@ import static nu.marginalia.index.forward.ForwardIndexParameters.*;
* and a mapping between document identifiers to the index into the
* data array.
* <p/>
* Since the total data is relatively small, this is kept in memory to
* reduce the amount of disk thrashing.
* <p/>
* The metadata is a binary encoding of {@see nu.marginalia.idx.DocumentMetadata}
*/
public class ForwardIndexReader {
private final LongArray ids;
private final LongArray data;
private final ForwardIndexSpansReader spansReader;
private volatile Long2IntOpenHashMap idsMap;
private final IndexSpansReader spansReader;
private final Logger logger = LoggerFactory.getLogger(getClass());
@@ -64,7 +68,22 @@ public class ForwardIndexReader {
ids = loadIds(idsFile);
data = loadData(dataFile);
spansReader = new ForwardIndexSpansReader(spansFile);
LinuxSystemCalls.madviseRandom(data.getMemorySegment());
LinuxSystemCalls.madviseRandom(ids.getMemorySegment());
spansReader = IndexSpansReader.open(spansFile);
Thread.ofPlatform().start(this::createIdsMap);
}
private void createIdsMap() {
Long2IntOpenHashMap idsMap = new Long2IntOpenHashMap((int) ids.size());
for (int i = 0; i < ids.size(); i++) {
idsMap.put(ids.get(i), i);
}
this.idsMap = idsMap;
logger.info("Forward index loaded into RAM");
}
private static LongArray loadIds(Path idsFile) throws IOException {
@@ -106,7 +125,11 @@ public class ForwardIndexReader {
private int idxForDoc(long docId) {
assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
long offset = ids.binarySearch(docId, 0, ids.size());
if (idsMap != null) {
return idsMap.getOrDefault(docId, -1);
}
long offset = ids.binarySearch2(docId, 0, ids.size());
if (offset >= ids.size() || offset < 0 || ids.get(offset) != docId) {
if (getClass().desiredAssertionStatus()) {
@@ -118,22 +141,29 @@ public class ForwardIndexReader {
return (int) offset;
}
public DocumentSpans getDocumentSpans(Arena arena, long docId) {
long offset = idxForDoc(docId);
if (offset < 0) return new DocumentSpans();
public DocumentSpans[] getDocumentSpans(Arena arena, IndexSearchBudget budget, CombinedDocIdList combinedIds) throws TimeoutException {
long[] offsets = new long[combinedIds.size()];
long encodedOffset = data.get(ENTRY_SIZE * offset + SPANS_OFFSET);
for (int i = 0; i < offsets.length; i++) {
long docId = UrlIdCodec.removeRank(combinedIds.at(i));
long offset = idxForDoc(docId);
if (offset >= 0) {
offsets[i] = data.get(ENTRY_SIZE * offset + SPANS_OFFSET);
}
else {
offsets[i] = -1;
}
}
try {
return spansReader.readSpans(arena, encodedOffset);
return spansReader.readSpans(arena, budget, offsets);
}
catch (IOException ex) {
logger.error("Failed to read spans for doc " + docId, ex);
return new DocumentSpans();
logger.error("Failed to read spans for docIds", ex);
return new DocumentSpans[offsets.length];
}
}
public int totalDocCount() {
return (int) ids.size();
}
@@ -141,6 +171,8 @@ public class ForwardIndexReader {
public void close() {
if (data != null)
data.close();
if (ids != null)
ids.close();
}
public boolean isLoaded() {


@@ -3,10 +3,10 @@ package nu.marginalia.index.forward.construction;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexParameters;
import nu.marginalia.index.forward.spans.ForwardIndexSpansWriter;
import nu.marginalia.index.config.ForwardIndexParameters;
import nu.marginalia.index.forward.spans.IndexSpansWriter;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.searchset.DomainRankings;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.process.control.ProcessHeartbeat;
@@ -65,7 +65,7 @@ public class ForwardIndexConverter {
logger.info("Domain Rankings size = {}", domainRankings.size());
try (var progress = heartbeat.createProcessTaskHeartbeat(TaskSteps.class, "forwardIndexConverter");
var spansWriter = new ForwardIndexSpansWriter(outputFileSpansData)
var spansWriter = new IndexSpansWriter(outputFileSpansData)
) {
progress.progress(TaskSteps.GET_DOC_IDS);


@@ -11,6 +11,9 @@ public class DocumentSpan {
/** A list of the interlaced start and end positions of each span in the document of this type */
private final IntList startsEnds;
public DocumentSpan(IntList startsEnds) {
this.startsEnds = startsEnds;
}
public DocumentSpan(CodedSequence startsEnds) {
this.startsEnds = startsEnds.values();
}


@@ -1,5 +1,6 @@
package nu.marginalia.index.forward.spans;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.sequence.CodedSequence;
@@ -39,6 +40,23 @@ public class DocumentSpans {
return EMPTY_SPAN;
}
void accept(byte code, IntList positions) {
if (code == HtmlTag.HEADING.code)
this.heading = new DocumentSpan(positions);
else if (code == HtmlTag.TITLE.code)
this.title = new DocumentSpan(positions);
else if (code == HtmlTag.NAV.code)
this.nav = new DocumentSpan(positions);
else if (code == HtmlTag.CODE.code)
this.code = new DocumentSpan(positions);
else if (code == HtmlTag.ANCHOR.code)
this.anchor = new DocumentSpan(positions);
else if (code == HtmlTag.EXTERNAL_LINKTEXT.code)
this.externalLinkText = new DocumentSpan(positions);
else if (code == HtmlTag.BODY.code)
this.body = new DocumentSpan(positions);
}
void accept(byte code, CodedSequence positions) {
if (code == HtmlTag.HEADING.code)
this.heading = new DocumentSpan(positions);


@@ -0,0 +1,24 @@
package nu.marginalia.index.forward.spans;
import nu.marginalia.index.reverse.query.IndexSearchBudget;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.file.Path;
import java.util.concurrent.TimeoutException;
public interface IndexSpansReader extends AutoCloseable {
DocumentSpans[] readSpans(Arena arena, IndexSearchBudget budget, long[] encodedOffsets) throws TimeoutException, IOException;
static IndexSpansReader open(Path fileName) throws IOException {
int version = SpansCodec.parseSpanFilesFooter(fileName);
if (version == SpansCodec.SpansCodecVersion.PLAIN.ordinal()) {
return new IndexSpansReaderPlain(fileName);
}
else {
throw new IllegalArgumentException("Unsupported spans file version: " + version);
}
}
void close() throws IOException;
}


@@ -0,0 +1,100 @@
package nu.marginalia.index.forward.spans;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import nu.marginalia.index.reverse.query.IndexSearchBudget;
import nu.marginalia.uring.UringFileReader;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.concurrent.TimeoutException;
public class IndexSpansReaderPlain implements IndexSpansReader {
private final UringFileReader uringReader;
public IndexSpansReaderPlain(Path spansFile) throws IOException {
if (Boolean.getBoolean("index.directModePositionsSpans")) {
if ((Files.size(spansFile) & 4095) != 0) {
throw new IllegalArgumentException("Spans file is not block aligned in size: " + Files.size(spansFile));
}
uringReader = new UringFileReader(spansFile, true);
}
else {
uringReader = new UringFileReader(spansFile, false);
uringReader.fadviseWillneed();
}
}
@Override
public DocumentSpans[] readSpans(Arena arena, IndexSearchBudget budget, long[] encodedOffsets) throws TimeoutException {
int readCnt = 0;
for (long offset : encodedOffsets) {
if (offset < 0) continue;
readCnt ++;
}
if (readCnt == 0) {
return new DocumentSpans[encodedOffsets.length];
}
long[] offsets = new long[readCnt];
int[] sizes = new int[readCnt];
for (int idx = 0, j = 0; idx < encodedOffsets.length; idx++) {
if (encodedOffsets[idx] < 0)
continue;
long offset = encodedOffsets[idx];
offsets[j] = SpansCodec.decodeStartOffset(offset);
sizes[j] = SpansCodec.decodeSize(offset);
j++;
}
List<MemorySegment> buffers = uringReader.readUnaligned(arena, budget.timeLeft(), offsets, sizes, 4096);
DocumentSpans[] ret = new DocumentSpans[encodedOffsets.length];
for (int idx = 0, j = 0; idx < encodedOffsets.length; idx++) {
if (encodedOffsets[idx] < 0)
continue;
ret[idx] = decode(buffers.get(j++));
}
return ret;
}
public DocumentSpans decode(MemorySegment ms) {
int count = ms.get(ValueLayout.JAVA_INT, 0);
int pos = 4;
DocumentSpans ret = new DocumentSpans();
// Decode each span
for (int spanIdx = 0; spanIdx < count; spanIdx++) {
byte code = ms.get(ValueLayout.JAVA_BYTE, pos);
short len = ms.get(ValueLayout.JAVA_SHORT, pos+2);
IntArrayList values = new IntArrayList(len);
pos += 4;
for (int i = 0; i < len; i++) {
values.add(ms.get(ValueLayout.JAVA_INT, pos + 4*i));
}
ret.accept(code, values);
pos += 4*len;
}
return ret;
}
@Override
public void close() throws IOException {
uringReader.close();
}
}
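// A self-contained sketch of the span record layout that decode() above walks and that
// IndexSpansWriter (further down) emits: an int span count, then per span a byte tag code,
// one padding byte, a short position count, and that many ints. The tag value and positions
// here are made up for the example; the real files are read via UringFileReader, not ByteBuffer.
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Arrays;

class SpanRecordLayoutSketch {
    public static void main(String[] args) {
        ByteBuffer buf = ByteBuffer.allocate(64).order(ByteOrder.nativeOrder());

        // write one record holding a single span with three positions
        buf.putInt(1);             // span count
        buf.put((byte) 1);         // span tag code (illustrative value)
        buf.put((byte) 0);         // padding byte, keeps the following short aligned
        buf.putShort((short) 3);   // number of positions
        buf.putInt(10).putInt(20).putInt(30);
        buf.flip();

        // read it back the same way decode() walks its MemorySegment
        int count = buf.getInt();
        for (int i = 0; i < count; i++) {
            byte code = buf.get();
            buf.get();             // skip padding
            short len = buf.getShort();
            int[] positions = new int[len];
            for (int j = 0; j < len; j++) positions[j] = buf.getInt();
            System.out.println("code=" + code + " positions=" + Arrays.toString(positions));
        }
    }
}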


@@ -0,0 +1,76 @@
package nu.marginalia.index.forward.spans;
import nu.marginalia.sequence.VarintCodedSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
public class IndexSpansWriter implements AutoCloseable {
private final FileChannel outputChannel;
private final ByteBuffer work = ByteBuffer.allocate(4*1024*1024).order(ByteOrder.nativeOrder());
private static Logger logger = LoggerFactory.getLogger(IndexSpansWriter.class);
private long stateStartOffset = -1;
private int stateLength = -1;
public IndexSpansWriter(Path outputFileSpansData) throws IOException {
this.outputChannel = (FileChannel) Files.newByteChannel(outputFileSpansData, StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE);
}
public void beginRecord(int count) throws IOException {
stateStartOffset = outputChannel.position();
stateLength = 0;
work.clear();
work.putInt(count);
work.flip();
while (work.hasRemaining())
stateLength += outputChannel.write(work);
}
public void writeSpan(byte spanCode, ByteBuffer sequenceData) throws IOException {
work.clear();
work.put(spanCode);
work.put((byte) 0); // padding byte, so the following short is 2-byte aligned
var sequence = new VarintCodedSequence(sequenceData);
int spanLength = sequence.valueCount();
if (spanLength > 8192) {
logger.warn("Excessive span length with code {}: {}", spanCode, spanLength);
spanLength = 8192;
}
work.putShort((short) spanLength);
var iter = sequence.iterator();
for (int spanIdx = 0; iter.hasNext() && spanIdx < spanLength; spanIdx++) {
work.putInt(iter.nextInt());
}
work.flip();
stateLength += outputChannel.write(work);
}
public long endRecord() {
return SpansCodec.encode(stateStartOffset, stateLength);
}
@Override
public void close() throws IOException {
ByteBuffer footer = SpansCodec.createSpanFilesFooter(SpansCodec.SpansCodecVersion.PLAIN, (int) (4096 - (outputChannel.position() & 4095)));
outputChannel.position(outputChannel.size());
while (footer.hasRemaining()) {
outputChannel.write(footer, outputChannel.size());
}
outputChannel.close();
}
}


@@ -0,0 +1,64 @@
package nu.marginalia.index.forward.spans;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
public class SpansCodec {
public static int MAGIC_INT = 0xF000F000;
public static int FOOTER_SIZE = 8;
public enum SpansCodecVersion {
@Deprecated
DEPRECATED_1, // This must not be removed, the ordinal is used to encode the version
PLAIN
}
public static long encode(long startOffset, long size) {
assert size < 0x1000_0000L : "Size must be less than 2^28";
return startOffset << 28 | (size & 0xFFF_FFFFL);
}
public static long decodeStartOffset(long encoded) {
return encoded >>> 28;
}
public static int decodeSize(long encoded) {
return (int) (encoded & 0x0FFF_FFFFL);
}
public static ByteBuffer createSpanFilesFooter(SpansCodecVersion version, int padSize) {
if (padSize < FOOTER_SIZE) {
padSize += 4096;
}
ByteBuffer footer = ByteBuffer.allocate(padSize);
footer.position(padSize - FOOTER_SIZE);
footer.putInt(SpansCodec.MAGIC_INT);
footer.put((byte) version.ordinal());
footer.put((byte) 0);
footer.put((byte) 0);
footer.put((byte) 0);
footer.flip();
return footer;
}
public static int parseSpanFilesFooter(Path spansFile) throws IOException {
ByteBuffer buffer = ByteBuffer.allocate(FOOTER_SIZE);
try (var fc = FileChannel.open(spansFile, StandardOpenOption.READ)) {
if (fc.size() < FOOTER_SIZE) return 0;
fc.read(buffer, fc.size() - buffer.capacity());
buffer.flip();
int magic = buffer.getInt();
if (magic != MAGIC_INT) {
return 0;
}
return buffer.get();
}
}
}
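// A minimal round-trip sketch of the packing scheme above: encode() stores the record's start
// offset in the upper bits of a long and its (at most 28-bit) size in the lower bits, which
// decodeStartOffset() and decodeSize() recover. The concrete numbers are illustrative only.
import nu.marginalia.index.forward.spans.SpansCodec;

class SpansCodecSketch {
    public static void main(String[] args) {
        long encoded = SpansCodec.encode(123_456L, 789);

        long start = SpansCodec.decodeStartOffset(encoded); // 123456
        int size = SpansCodec.decodeSize(encoded);          // 789

        System.out.println(start + " " + size);
    }
}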


@@ -1,14 +1,14 @@
package nu.marginalia.index.results.model.ids;
package nu.marginalia.index.model;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import org.roaringbitmap.longlong.Roaring64Bitmap;
import nu.marginalia.array.page.LongQueryBuffer;
import java.util.Arrays;
import java.util.stream.LongStream;
/** A list of document ids, with their ranking bits still remaining.
*
* @see nu.marginalia.index.results.model.ids.DocIdList
* @see DocIdList
* @see nu.marginalia.model.id.UrlIdCodec
* */
public final class CombinedDocIdList {
@@ -17,17 +17,23 @@ public final class CombinedDocIdList {
public CombinedDocIdList(long... data) {
this.data = Arrays.copyOf(data, data.length);
}
public CombinedDocIdList(LongQueryBuffer buffer) {
this.data = buffer.copyData();
}
public CombinedDocIdList(LongArrayList data) {
this.data = data.toLongArray();
}
public CombinedDocIdList(Roaring64Bitmap data) {
this.data = data.toArray();
}
public CombinedDocIdList() {
this.data = new long[0];
}
public static CombinedDocIdList combineLists(CombinedDocIdList one, CombinedDocIdList other) {
long[] data = new long[one.size() + other.size()];
System.arraycopy(one.data, 0, data, 0, one.data.length);
System.arraycopy(other.data, 0, data, one.data.length, other.data.length);
return new CombinedDocIdList(data);
}
public int size() {
return data.length;
}
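// A small usage sketch of the constructors and the combineLists() helper above; the id values
// are illustrative, and the class is assumed to be on the classpath under its new package,
// nu.marginalia.index.model.
import nu.marginalia.index.model.CombinedDocIdList;

class CombinedDocIdListSketch {
    public static void main(String[] args) {
        CombinedDocIdList first = new CombinedDocIdList(1L, 2L, 3L);
        CombinedDocIdList second = new CombinedDocIdList(4L, 5L);

        CombinedDocIdList both = CombinedDocIdList.combineLists(first, second);
        System.out.println(both.size()); // 5
    }
}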


@@ -1,14 +1,13 @@
package nu.marginalia.index.results.model.ids;
package nu.marginalia.index.model;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import java.util.Arrays;
import java.util.Objects;
import java.util.stream.LongStream;
/** A list of document ids, with their ranking bits removed.
*
* @see nu.marginalia.index.results.model.ids.CombinedDocIdList
* @see CombinedDocIdList
* @see nu.marginalia.model.id.UrlIdCodec
* */
public final class DocIdList {


@@ -1,10 +1,9 @@
package nu.marginalia.index.results.model;
package nu.marginalia.index.model;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.index.model.SearchTermsUtil;
import nu.marginalia.index.results.model.ids.TermIdList;
import nu.marginalia.language.keywords.KeywordHasher;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.SequenceOperations;
@@ -58,9 +57,10 @@ public class PhraseConstraintGroupList {
private final int[] offsets;
private final BitSet present;
private final BitSet termIdsMask;
private final int presentCardinality;
public final int size;
public PhraseConstraintGroup(List<String> terms, TermIdList termIdsAll) {
public PhraseConstraintGroup(KeywordHasher hasher, List<String> terms, TermIdList termIdsAll) {
offsets = new int[terms.size()];
present = new BitSet(terms.size());
size = terms.size();
@@ -74,7 +74,7 @@ public class PhraseConstraintGroupList {
}
present.set(i);
long termId = SearchTermsUtil.getWordId(term);
long termId = hasher.hashKeyword(term);
int idx = termIdsAll.indexOf(termId);
if (idx < 0) {
@@ -85,6 +85,8 @@ public class PhraseConstraintGroupList {
termIdsMask.set(idx);
}
}
presentCardinality = present.cardinality();
}
/** Returns true if the term with index termIdx in the query is in the group */
@@ -93,7 +95,7 @@ public class PhraseConstraintGroupList {
}
public boolean test(CodedSequence[] positions) {
IntIterator[] sequences = new IntIterator[present.cardinality()];
IntIterator[] sequences = new IntIterator[presentCardinality];
for (int oi = 0, si = 0; oi < offsets.length; oi++) {
if (!present.get(oi)) {
@@ -120,7 +122,7 @@ public class PhraseConstraintGroupList {
public IntList findIntersections(IntList[] positions) {
IntList[] sequences = new IntList[present.cardinality()];
IntList[] sequences = new IntList[presentCardinality];
int[] iterOffsets = new int[sequences.length];
for (int oi = 0, si = 0; oi < offsets.length; oi++) {
@@ -144,12 +146,41 @@ public class PhraseConstraintGroupList {
iterOffsets[si - 1] = -oi;
}
return SequenceOperations.findIntersections(sequences, iterOffsets);
return SequenceOperations.findIntersections(sequences, iterOffsets, Integer.MAX_VALUE);
}
public IntList findIntersections(IntList[] positions, int n) {
IntList[] sequences = new IntList[presentCardinality];
int[] iterOffsets = new int[sequences.length];
for (int oi = 0, si = 0; oi < offsets.length; oi++) {
if (!present.get(oi)) {
continue;
}
int offset = offsets[oi];
if (offset < 0)
return IntList.of();
// Create iterators that are offset by their relative position in the
// sequence. This is done by subtracting the index from the offset,
// so that when we intersect them, an overlap means that the terms are
// in the correct order. Note the offset is negative!
var posForTerm = positions[offset];
if (posForTerm == null) {
return IntList.of();
}
sequences[si++] = posForTerm;
iterOffsets[si - 1] = -oi;
}
return SequenceOperations.findIntersections(sequences, iterOffsets, n);
}
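// A toy illustration of the offset trick the comments above describe: each term's position
// list is shifted left by the term's index within the phrase, so any value present in every
// shifted list marks a position where the terms appear consecutively and in order. Standalone
// sketch, outside the class; the position values are made up for the example.
class PhraseOffsetSketch {
    public static void main(String[] args) {
        int[][] positions = {
                {5, 17, 40},   // positions of term 0, e.g. "foo"
                {6, 30, 41},   // positions of term 1, e.g. "bar"
        };

        java.util.Set<Integer> candidates = new java.util.TreeSet<>();
        for (int p : positions[0]) candidates.add(p);

        for (int i = 1; i < positions.length; i++) {
            java.util.Set<Integer> shifted = new java.util.HashSet<>();
            for (int p : positions[i]) shifted.add(p - i); // apply the negative offset
            candidates.retainAll(shifted);
        }

        System.out.println(candidates); // [5, 40] -> the phrase "foo bar" starts at 5 and 40
    }
}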
public int minDistance(IntList[] positions) {
List<IntList> sequences = new ArrayList<>(present.cardinality());
IntList iterOffsets = new IntArrayList(present.cardinality());
List<IntList> sequences = new ArrayList<>(presentCardinality);
IntList iterOffsets = new IntArrayList(presentCardinality);
for (int oi = 0; oi < offsets.length; oi++) {
if (!present.get(oi)) {


@@ -1,8 +1,9 @@
package nu.marginalia.index.model;
import nu.marginalia.api.searchquery.model.query.QueryStrategy;
import nu.marginalia.api.searchquery.model.query.SpecificationLimit;
import nu.marginalia.api.searchquery.model.query.SpecificationLimitType;
import nu.marginalia.index.searchset.SearchSet;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import java.util.Objects;
@@ -41,6 +42,13 @@ public final class QueryParams {
this.queryStrategy = queryStrategy;
}
public boolean imposesDomainMetadataConstraint() {
return qualityLimit.type() != SpecificationLimitType.NONE
|| year.type() != SpecificationLimitType.NONE
|| size.type() != SpecificationLimitType.NONE
|| rank.type() != SpecificationLimitType.NONE;
}
public SpecificationLimit qualityLimit() {
return qualityLimit;
}


@@ -0,0 +1,243 @@
package nu.marginalia.index.model;
import gnu.trove.map.hash.TObjectLongHashMap;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.longs.LongComparator;
import it.unimi.dsi.fastutil.longs.LongList;
import nu.marginalia.api.searchquery.IndexProtobufCodec;
import nu.marginalia.api.searchquery.RpcIndexQuery;
import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryParser;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import nu.marginalia.api.searchquery.model.query.QueryStrategy;
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
import nu.marginalia.index.CombinedIndexReader;
import nu.marginalia.index.reverse.IndexLanguageContext;
import nu.marginalia.index.reverse.query.IndexSearchBudget;
import nu.marginalia.index.searchset.SearchSet;
import nu.marginalia.language.keywords.KeywordHasher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;
import static nu.marginalia.api.searchquery.IndexProtobufCodec.convertSpecLimit;
public class SearchContext {
private static final Logger logger = LoggerFactory.getLogger(SearchContext.class);
public final IndexSearchBudget budget;
public final int fetchSize;
public final int limitByDomain;
public final int limitTotal;
private final int docCount;
public final RpcResultRankingParameters params;
public final SearchQuery searchQuery;
public final QueryParams queryParams;
public final CompiledQuery<String> compiledQuery;
public final CompiledQueryLong compiledQueryIds;
/** Bitmask whose position correspond to the positions in the compiled query data
* which are regular words.
*/
public final BitSet regularMask;
/** Bitmask whose position correspond to the positions in the compiled query data
* which are ngrams.
*/
public final BitSet ngramsMask;
/** CqDataInt associated with frequency information of the terms in the query
* in the full index. The dataset is indexed by the compiled query. */
public final CqDataInt fullCounts;
/** CqDataInt associated with frequency information of the terms in the query
* in the priority index. The dataset is indexed by the compiled query. */
public final CqDataInt priorityCounts;
public final TermIdList termIdsAll;
public final PhraseConstraintGroupList phraseConstraints;
public final LongList termIdsAdvice;
public final LongList termIdsExcludes;
public final LongList termIdsPriority;
public final IndexLanguageContext languageContext;
public static SearchContext create(CombinedIndexReader currentIndex,
KeywordHasher keywordHasher,
SearchSpecification specsSet,
SearchSet searchSet) {
var queryParams = new QueryParams(specsSet.quality, specsSet.year, specsSet.size, specsSet.rank, searchSet, specsSet.queryStrategy);
var rankingParams = specsSet.rankingParams;
var limits = specsSet.queryLimits;
return new SearchContext(
keywordHasher,
"en", // FIXME: This path currently only supports english
currentIndex,
specsSet.query.compiledQuery,
queryParams,
specsSet.query,
rankingParams,
limits);
}
public static SearchContext create(CombinedIndexReader currentIndex,
KeywordHasher keywordHasher,
RpcIndexQuery request, SearchSet searchSet) {
var limits = request.getQueryLimits();
var query = IndexProtobufCodec.convertRpcQuery(request.getQuery());
var queryParams = new QueryParams(
convertSpecLimit(request.getQuality()),
convertSpecLimit(request.getYear()),
convertSpecLimit(request.getSize()),
convertSpecLimit(request.getRank()),
searchSet,
QueryStrategy.valueOf(request.getQueryStrategy()));
var rankingParams = request.hasParameters() ? request.getParameters() : PrototypeRankingParameters.sensibleDefaults();
return new SearchContext(
keywordHasher,
request.getLangIsoCode(),
currentIndex,
query.compiledQuery,
queryParams,
query,
rankingParams,
limits);
}
public SearchContext(
KeywordHasher keywordHasher,
String langIsoCode,
CombinedIndexReader currentIndex,
String queryExpression,
QueryParams queryParams,
SearchQuery query,
RpcResultRankingParameters rankingParams,
RpcQueryLimits limits)
{
this.docCount = currentIndex.totalDocCount();
this.languageContext = currentIndex.createLanguageContext(langIsoCode);
this.budget = new IndexSearchBudget(Math.max(limits.getTimeoutMs()/2, limits.getTimeoutMs()-50));
this.searchQuery = query;
this.params = rankingParams;
this.queryParams = queryParams;
this.fetchSize = limits.getFetchSize();
this.limitByDomain = limits.getResultsByDomain();
this.limitTotal = limits.getResultsTotal();
this.compiledQuery = CompiledQueryParser.parse(queryExpression);
this.compiledQueryIds = compiledQuery.mapToLong(keywordHasher::hashKeyword);
int[] full = new int[compiledQueryIds.size()];
int[] prio = new int[compiledQueryIds.size()];
this.ngramsMask = new BitSet(compiledQuery.size());
this.regularMask = new BitSet(compiledQuery.size());
for (int idx = 0; idx < compiledQueryIds.size(); idx++) {
long id = compiledQueryIds.at(idx);
full[idx] = currentIndex.numHits(this.languageContext, id);
prio[idx] = currentIndex.numHitsPrio(this.languageContext, id);
if (compiledQuery.at(idx).contains("_")) {
ngramsMask.set(idx);
}
else {
regularMask.set(idx);
}
}
this.fullCounts = new CqDataInt(full);
this.priorityCounts = new CqDataInt(prio);
this.termIdsExcludes = new LongArrayList();
this.termIdsPriority = new LongArrayList();
this.termIdsAdvice = new LongArrayList();
for (var word : searchQuery.searchTermsAdvice) {
termIdsAdvice.add(keywordHasher.hashKeyword(word));
}
for (var word : searchQuery.searchTermsExclude) {
termIdsExcludes.add(keywordHasher.hashKeyword(word));
}
for (var word : searchQuery.searchTermsPriority) {
termIdsPriority.add(keywordHasher.hashKeyword(word));
}
LongArrayList termIdsList = new LongArrayList();
TObjectLongHashMap<Object> termToId = new TObjectLongHashMap<>();
for (String word : compiledQuery) {
long id = keywordHasher.hashKeyword(word);
termIdsList.add(id);
termToId.put(word, id);
}
for (var term : searchQuery.searchTermsPriority) {
if (termToId.containsKey(term)) {
continue;
}
long id = keywordHasher.hashKeyword(term);
termIdsList.add(id);
termToId.put(term, id);
}
termIdsAll = new TermIdList(termIdsList);
var constraintsMandatory = new ArrayList<PhraseConstraintGroupList.PhraseConstraintGroup>();
var constraintsFull = new ArrayList<PhraseConstraintGroupList.PhraseConstraintGroup>();
var constraintsOptional = new ArrayList<PhraseConstraintGroupList.PhraseConstraintGroup>();
for (var constraint : searchQuery.phraseConstraints) {
switch (constraint) {
case SearchPhraseConstraint.Mandatory(List<String> terms) ->
constraintsMandatory.add(new PhraseConstraintGroupList.PhraseConstraintGroup(keywordHasher, terms, termIdsAll));
case SearchPhraseConstraint.Optional(List<String> terms) ->
constraintsOptional.add(new PhraseConstraintGroupList.PhraseConstraintGroup(keywordHasher, terms, termIdsAll));
case SearchPhraseConstraint.Full(List<String> terms) ->
constraintsFull.add(new PhraseConstraintGroupList.PhraseConstraintGroup(keywordHasher, terms, termIdsAll));
}
}
if (constraintsFull.isEmpty()) {
logger.warn("No full constraints in query, adding empty group");
constraintsFull.add(new PhraseConstraintGroupList.PhraseConstraintGroup(keywordHasher, List.of(), termIdsAll));
}
this.phraseConstraints = new PhraseConstraintGroupList(constraintsFull.getFirst(), constraintsMandatory, constraintsOptional);
}
public int termFreqDocCount() {
return docCount;
}
public long[] sortedDistinctIncludes(LongComparator comparator) {
LongList list = new LongArrayList(compiledQueryIds.copyData());
list.sort(comparator);
return list.toLongArray();
}
}
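// The term ids used throughout the context above are simply 64-bit hashes of the keyword
// strings, produced by the injected KeywordHasher (the now-removed SearchTermsUtil further
// down hard-wired MurmurHash3_128 for this). A minimal sketch of that hashing step; the
// keyword is an illustrative value.
import nu.marginalia.hash.MurmurHash3_128;

class TermIdSketch {
    public static void main(String[] args) {
        MurmurHash3_128 hasher = new MurmurHash3_128();

        long termId = hasher.hashKeyword("marginalia");
        System.out.println(Long.toHexString(termId));
    }
}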


@@ -1,97 +0,0 @@
package nu.marginalia.index.model;
import nu.marginalia.api.searchquery.IndexProtobufCodec;
import nu.marginalia.api.searchquery.RpcIndexQuery;
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryParser;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.searchset.SearchSet;
import static nu.marginalia.api.searchquery.IndexProtobufCodec.convertSpecLimit;
public class SearchParameters {
/**
* This is how many results matching the keywords we'll try to get
* before evaluating them for the best result.
*/
public final int fetchSize;
public final IndexSearchBudget budget;
public final SearchQuery query;
public final QueryParams queryParams;
public final RpcResultRankingParameters rankingParams;
public final int limitByDomain;
public final int limitTotal;
public final CompiledQuery<String> compiledQuery;
public final CompiledQueryLong compiledQueryIds;
// mutable:
/**
* An estimate of how much data has been read
*/
public long dataCost = 0;
public SearchParameters(SearchSpecification specsSet, SearchSet searchSet) {
var limits = specsSet.queryLimits;
this.fetchSize = limits.getFetchSize();
this.budget = new IndexSearchBudget(limits.getTimeoutMs());
this.query = specsSet.query;
this.limitByDomain = limits.getResultsByDomain();
this.limitTotal = limits.getResultsTotal();
queryParams = new QueryParams(
specsSet.quality,
specsSet.year,
specsSet.size,
specsSet.rank,
searchSet,
specsSet.queryStrategy);
compiledQuery = CompiledQueryParser.parse(this.query.compiledQuery);
compiledQueryIds = compiledQuery.mapToLong(SearchTermsUtil::getWordId);
rankingParams = specsSet.rankingParams;
}
public SearchParameters(RpcIndexQuery request, SearchSet searchSet) {
var limits = request.getQueryLimits();
this.fetchSize = limits.getFetchSize();
// The time budget is halved because this is the point when we start to
// wrap up the search and return the results.
this.budget = new IndexSearchBudget(limits.getTimeoutMs() / 2);
this.query = IndexProtobufCodec.convertRpcQuery(request.getQuery());
this.limitByDomain = limits.getResultsByDomain();
this.limitTotal = limits.getResultsTotal();
queryParams = new QueryParams(
convertSpecLimit(request.getQuality()),
convertSpecLimit(request.getYear()),
convertSpecLimit(request.getSize()),
convertSpecLimit(request.getRank()),
searchSet,
QueryStrategy.valueOf(request.getQueryStrategy()));
compiledQuery = CompiledQueryParser.parse(this.query.compiledQuery);
compiledQueryIds = compiledQuery.mapToLong(SearchTermsUtil::getWordId);
rankingParams = request.hasParameters() ? request.getParameters() : PrototypeRankingParameters.sensibleDefaults();
}
public long getDataCost() {
return dataCost;
}
}


@@ -1,72 +0,0 @@
package nu.marginalia.index.model;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.longs.LongArraySet;
import it.unimi.dsi.fastutil.longs.LongComparator;
import it.unimi.dsi.fastutil.longs.LongList;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import static nu.marginalia.index.model.SearchTermsUtil.getWordId;
public final class SearchTerms {
private final LongList advice;
private final LongList excludes;
private final LongList priority;
public static final LongArraySet stopWords = new LongArraySet(
new long[] {
getWordId("a"),
getWordId("an"),
getWordId("the"),
}
);
private final CompiledQueryLong compiledQueryIds;
public SearchTerms(SearchQuery query,
CompiledQueryLong compiledQueryIds)
{
this.excludes = new LongArrayList();
this.priority = new LongArrayList();
this.advice = new LongArrayList();
this.compiledQueryIds = compiledQueryIds;
for (var word : query.searchTermsAdvice) {
advice.add(getWordId(word));
}
for (var word : query.searchTermsExclude) {
excludes.add(getWordId(word));
}
for (var word : query.searchTermsPriority) {
priority.add(getWordId(word));
}
}
public boolean isEmpty() {
return compiledQueryIds.isEmpty();
}
public long[] sortedDistinctIncludes(LongComparator comparator) {
LongList list = new LongArrayList(compiledQueryIds.copyData());
list.sort(comparator);
return list.toLongArray();
}
public LongList excludes() {
return excludes;
}
public LongList advice() {
return advice;
}
public LongList priority() {
return priority;
}
public CompiledQueryLong compiledQuery() { return compiledQueryIds; }
}


@@ -1,13 +0,0 @@
package nu.marginalia.index.model;
import nu.marginalia.hash.MurmurHash3_128;
public class SearchTermsUtil {
private static final MurmurHash3_128 hasher = new MurmurHash3_128();
/** Translate the word to a unique id. */
public static long getWordId(String s) {
return hasher.hashKeyword(s);
}
}


@@ -1,4 +1,4 @@
package nu.marginalia.index.results.model.ids;
package nu.marginalia.index.model;
import it.unimi.dsi.fastutil.longs.LongArrayList;
@@ -6,7 +6,7 @@ import java.util.Arrays;
import java.util.stream.LongStream;
public final class TermIdList {
private final long[] array;
public final long[] array;
public TermIdList(long[] array) {
this.array = array;
