1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

Compare commits

...

347 Commits

Author SHA1 Message Date
Viktor Lofgren
d457bb5d44 (index) Fix index actor initialization 2025-09-18 16:06:40 +02:00
Viktor Lofgren
c661ebb619 (refac) Move language-processing into functions
It's long surpassed the single-responsibility library it once was, and is as such out of place in its original location, and fits better among the function-type modules.
2025-09-18 10:30:40 +02:00
Viktor Lofgren
53e744398a Update gitignore to exclude eclipse-generated stuff 2025-09-17 17:14:02 +02:00
Viktor Lofgren
1d71baf3e5 (search) Display search query first in title 2025-09-16 13:16:18 +02:00
Viktor Lofgren
bb5fc0f348 (language) Fix sketchy unicode handling in UnicodeNormalization 2025-09-16 12:15:09 +02:00
Viktor Lofgren
c8f112d040 (lang+search) Clean up LanguageConfiguration initialization and LangCommandD 2025-09-16 11:49:46 +02:00
Viktor Lofgren
ae31bc8498 (lang+search) Clean up LanguageConfiguration initialization and LangCommand 2025-09-16 11:47:15 +02:00
Viktor Lofgren
da5046c3bf (lang) Remove language redirects for languages that are not configured
Passing an invalid &lang= to the query service leads to a harmless but ugly stacktrace.  This change prevents such a request from being formed.
2025-09-16 11:05:31 +02:00
Viktor Lofgren
f67257baf2 (lang) Remove lang:... keyword during LangCommand 2025-09-16 11:01:11 +02:00
Viktor Lofgren
924fb05661 (config) Fix language config pickup 2025-09-16 10:43:27 +02:00
Viktor Lofgren
c231a82062 (search) Lang redirection works better if it's hooked in 2025-09-16 10:40:24 +02:00
Viktor Lofgren
2c1082d7f0 (search) Add notice about the current language selection to the UI 2025-09-16 10:32:13 +02:00
Viktor Lofgren
06947bd026 (search) Add redirect based on lang:-keyword in search query
The change also suppresses the term in the query parser so that it isn't delegated to the index as a keyword.
2025-09-16 10:00:20 +02:00
Viktor Lofgren
519aebd7c6 (process) Make the use of zookeeper based domain coordination optional
The zookeeper based domain coordinator has been a bit unstable and led to rare deadlocks.  As running multiple instances of the crawler is an unusual configuration, the default behavior that makes the most sense is to disable cross-process coordination and use only local coordination.
2025-09-15 19:13:57 +02:00
Viktor Lofgren
42cc27586e (process) Reduce connection pool stats log spam 2025-09-15 18:51:43 +02:00
Viktor Lofgren
360881fafd (setup) Pull POS tags from control svc on first boot
This commit also removes the old retrieval from setup.sh
2025-09-15 10:05:17 +02:00
Viktor Lofgren
4c6fdf6ebe (language) Make language configuration configurable 2025-09-15 09:54:57 +02:00
Viktor Lofgren
554de21f68 (converter) Disable language keyword 2025-09-15 09:49:04 +02:00
Viktor Lofgren
00194acbfe (search) Add language chooser to UI, clean up search service code 2025-09-13 12:40:42 +02:00
Viktor Lofgren
97dabcefaa (search) Add language chooser to UI, clean up search service code 2025-09-13 12:34:34 +02:00
Viktor Lofgren
cc790644d4 (search) Persist language choice in the search form 2025-09-12 11:14:54 +02:00
Viktor Lofgren
8f893ee6c0 (search) Add basic support for configuring query language to the search service
This is not visible in the UI at this stage, only a query param.
2025-09-11 15:55:09 +02:00
Viktor Lofgren
938721b793 (index) Backwards compatible loading of old words file in index loading 2025-09-11 15:42:31 +02:00
Viktor Lofgren
f68bcefc75 (index) Correct index construction to use the correct files for Fwd index 2025-09-09 11:21:48 +02:00
Viktor Lofgren
0cfd759f85 (deps) Upgrade slop to 0.17 for better skip performance and faster index construction times 2025-09-08 18:02:34 +02:00
Viktor Lofgren
b53002200c (index) SkipListWriter should not be in APPEND mode 2025-09-08 17:55:14 +02:00
Viktor Lofgren
78246b9a63 (index) Fix journal language enumeration 2025-09-08 15:38:26 +02:00
Viktor Lofgren
b552e79927 (language) Make LanguageConfiguration a Singleton to avoid duplicate initializations 2025-09-08 13:24:18 +02:00
Viktor Lofgren
bffc159486 (language) Make unicode normalization configurable 2025-09-08 13:18:58 +02:00
Viktor Lofgren
1432fc87d7 (index) Test languages via integration test 2025-09-06 20:11:41 +02:00
Viktor Lofgren
edd453531e (index) Partition keyword lexicons by language 2025-09-04 17:24:48 +02:00
Viktor Lofgren
096496ada1 (refac) Fold ft-anchor-keywords into converting-process 2025-09-03 13:04:30 +02:00
Viktor Lofgren
8ca6209260 (refac) Fold ft-anchor-keywords into converting-process 2025-09-03 13:03:38 +02:00
Viktor Lofgren
673c65d3c9 (refac) Fold term-frequency-dict into language-processing 2025-09-03 12:59:10 +02:00
Viktor Lofgren
acb9ec7b15 (refac) Consistently use 'languageIsoCode' for the language field 2025-09-03 12:54:18 +02:00
Viktor Lofgren
47079e05db (index) Store language information in the index journal 2025-09-03 12:33:24 +02:00
Viktor Lofgren
c93056e77f (refac) Clean up index code 2025-09-03 09:51:57 +02:00
Viktor Lofgren
6f7530e807 (refac) Clean up index code 2025-09-02 18:53:58 +02:00
Viktor Lofgren
87ce4a1b52 (refac) Clean up index code 2025-09-02 17:52:38 +02:00
Viktor Lofgren
52194cbe7a (refac) Clean up index code 2025-09-02 17:44:42 +02:00
Viktor Lofgren
fd1ac03c78 (refac) Clean up index code 2025-09-02 17:30:19 +02:00
Viktor Lofgren
5e5b86efb4 (refac) Clean up index code 2025-09-02 17:24:30 +02:00
Viktor Lofgren
f332ec6191 (refac) Clean up index code 2025-09-02 13:13:10 +02:00
Viktor Lofgren
c25c1af437 (refac) Clean up index code 2025-09-02 13:04:05 +02:00
Viktor Lofgren
eb0c911b45 (refac) Clean up index code 2025-09-02 12:50:07 +02:00
Viktor Lofgren
1979870ce4 (refac) Merge index-forward, index-reverse, index/query into index
The project has too many submodules, and it's a bit of a headache to navigate.
2025-09-02 12:30:42 +02:00
Viktor Lofgren
0ba2ea38e1 (index) Move reverse index into a distinct package 2025-09-02 11:59:56 +02:00
Viktor Lofgren
d6cfbceeea (index) Use a configurable hasher in the index 2025-09-01 13:44:28 +02:00
Viktor Lofgren
e369d200cc (refac) Simplify index data model by merging SearchParameters, SearchTerms and ResultRankingContext into a new object called SearchContext
The previous design was difficult to reason about as similar data was stored in several places, and different functions wanted different nearly identical (but not fully identical) context objects.

This is in preparation for making the keyword hash function configurable, as we want to focus all the code that hashes keywords into one place.
2025-09-01 13:17:11 +02:00
Viktor Lofgren
946d64c8da (index) Make hash algorithm selection configurable, writer-side 2025-09-01 12:03:01 +02:00
Viktor Lofgren
42f043a60f (API) Add language parameter to the APIs 2025-09-01 09:33:39 +02:00
Viktor Lofgren
2f3950e0d5 (language) Roll KeywordExtractor into LanguageDefinition 2025-08-29 10:55:48 +02:00
Viktor Lofgren
61d803869e (language) Add support for languages with no POS-tagging
Clean up previous commit a bit.
2025-08-29 10:55:48 +02:00
Viktor Lofgren
df6434d177 (language) Add support for languages with no POS-tagging
This disables a lot of the smart keyword extraction,
which is mostly a crutch for helping English and similar
large languages to find relevant search results.

Smaller languages where a POS-tag model may not be available,
are probably fine with this disabled, as the search engine can
likely just rawdog the entire results list.
2025-08-29 10:55:48 +02:00
Viktor Lofgren
59519ed7c4 (language) Adjust languages.xml 2025-08-29 10:55:47 +02:00
Viktor Lofgren
874fc2d250 (language) Remove debug logging junk 2025-08-29 10:55:47 +02:00
Viktor Lofgren
69e8ec0eef (language) Fix subject keywords matcher with better rules and correct logic 2025-08-29 10:55:47 +02:00
Viktor Lofgren
a7eb5f54e6 (language) Clean up PosPattern, add tests 2025-08-29 10:55:47 +02:00
Viktor Lofgren
b29ba3e228 (language) Integrate new configurable POS patterns with keyword matchers 2025-08-29 10:55:47 +02:00
Viktor Lofgren
5fa5029c60 (language) Clean up UI 2025-08-29 10:55:47 +02:00
Viktor Lofgren
4257f60f00 (keywords) Fix logic error causing misidentification of some keywords 2025-08-29 10:55:47 +02:00
Viktor Lofgren
ce221d3a0e (language) Integrate old keyword extraction logic with new test tool 2025-08-29 10:55:47 +02:00
Viktor Lofgren
f0741142a3 (refac) Move keyword extraction into language processing 2025-08-29 10:55:47 +02:00
Viktor Lofgren
0899e4d895 (language) First version of the language processing debug tool 2025-08-29 10:55:47 +02:00
Viktor Lofgren
bbf7c5a1cb (language) Fix RDRPosTagger back to working order and integrate with SentenceExtractor 2025-08-29 10:55:47 +02:00
Viktor Lofgren
686a40e69b (language) Update modelling 2025-08-29 10:55:47 +02:00
Viktor Lofgren
8af254f44f (language) Parse PosPattern tags 2025-08-29 10:55:47 +02:00
Viktor Lofgren
2c21bd9287 (language) Add logging for unknown POS tags in PosPattern 2025-08-29 10:55:47 +02:00
Viktor Lofgren
f9645e2f00 (language) Enhance PosPattern to support wildcard variants in pattern matching 2025-08-29 10:55:47 +02:00
Viktor Lofgren
81e311b558 (language) POS-patterns WIP 2025-08-29 10:55:47 +02:00
Viktor Lofgren
507c09146a (language) Add support for downloadable resources, parsing POS tag configuration tags 2025-08-29 10:55:47 +02:00
Viktor Lofgren
f682425594 (language) Basic test for LanguageConfiguration 2025-08-29 10:55:47 +02:00
Viktor Lofgren
de67006c4f (language) Initial integration of new language configuration utility 2025-08-29 10:55:47 +02:00
Viktor Lofgren
eea32bb7b4 (language) Very basic language.xml loading off classpath 2025-08-29 10:55:47 +02:00
Viktor Lofgren
e976940a4e (config) Move slf4j config files to common:config 2025-08-29 10:55:47 +02:00
Viktor Lofgren
b564b33028 (language) Initial embryo for language configuration 2025-08-29 10:55:47 +02:00
Viktor Lofgren
1cca16a58e (language) Simplify language filters 2025-08-29 10:55:47 +02:00
Viktor Lofgren
70b4ed6d81 (ldb) Pipe language information into LDB database 2025-08-29 10:55:47 +02:00
Viktor Lofgren
45dc6412c1 (converter) Add language column to slop tables 2025-08-29 10:55:47 +02:00
Viktor Lofgren
b3b95edcb5 (converter) Bypass some of the grammar processing in the keyword extraction depending on language selection 2025-08-29 10:55:47 +02:00
Viktor Lofgren
338d300e1a (converter) Clean up spans-handling
This code was unnecessarily difficult to follow with repeated packing and re-packing of the same data.
2025-08-29 10:55:47 +02:00
Viktor Lofgren
fa685bf1f4 (converter) Add Language field to ProcessedDocumentDetails 2025-08-29 10:55:47 +02:00
Viktor Lofgren
d79a3e2b2a (converter) Tag documents by language in the index as a keyword 2025-08-29 10:55:47 +02:00
Viktor Lofgren
854382b2be (language-filter) Experimentally permit Swedish results to pass through the language filter 2025-08-29 10:55:47 +02:00
Viktor Lofgren
8710adbc2a (build) Reduce log noise during tests 2025-08-29 10:55:32 +02:00
Viktor Lofgren
acdf7b4785 (build) Add test-logger plugin to get better feedback during test execution 2025-08-29 10:41:35 +02:00
Viktor Lofgren
b5d27c1406 (search) Improve unicode support in displayTitle and displaySummary 2025-08-23 13:59:41 +02:00
Viktor Lofgren
55eb7dc116 (search) Improve unicode support in displayTitle and displaySummary 2025-08-23 13:57:51 +02:00
Viktor Lofgren
f0e8bc8baf (search) Improve unicode support in displayTitle and displaySummary 2025-08-23 13:56:19 +02:00
Viktor Lofgren
91a6ad2337 (search) Improve unicode support in displayTitle and displaySummary 2025-08-23 13:54:48 +02:00
Viktor Lofgren
9a182b9ddb (search) Use ADVERTISEMENT flag instead of TRACKING_ADVERTISEMENT when choosing to flag a result as having ads 2025-08-21 13:08:25 +02:00
Viktor Lofgren
fefbcf15ce (site) Make discord link point to chat.marginalia.nu and let nginx deal with figuring out which discord link to redirect to 2025-08-21 12:46:37 +02:00
Viktor Lofgren
9a789bf62d (array) Fix broken test 2025-08-18 09:10:58 +02:00
Viktor Lofgren
0525303b68 (index) Add upper limit to span lengths
Apparently outliers exist that are larger than SHORT_MAX.  This is probably not interesting, so we'll truncate at 8192 for now.

Adding logging statement to get more information about which spans these are so we can address the root cause down the line.
2025-08-17 08:44:57 +02:00
Viktor Lofgren
6953d65de5 (native) Register fixed fd:s for a nice io_uring speed boost 2025-08-16 13:48:11 +02:00
Viktor Lofgren
a7a18ced2e (native) Register fixed fd:s for a nice io_uring speed boost 2025-08-16 13:46:39 +02:00
Viktor Lofgren
7c94c941b2 (build) Correct rare scenario where root blocks could be generated with a negative size 2025-08-16 11:27:36 +02:00
Viktor Lofgren
ea99b62356 (build) Fix missing junit engine version 2025-08-16 11:01:32 +02:00
Viktor Lofgren
3dc21d34d8 (skiplist) Fix stability of getData fuzz test 2025-08-15 09:17:48 +02:00
Viktor Lofgren
51912e0176 (index) Tweak default values for IndexQueryExecution 2025-08-15 08:07:00 +02:00
Viktor Lofgren
de1b4d5372 (index) Make metrics make more sense by normalizing them by query budget 2025-08-15 03:16:22 +02:00
Viktor Lofgren
50ac926060 (index) Make metrics make more sense by normalizing them by query budget 2025-08-15 03:11:57 +02:00
Viktor Lofgren
d711ee75b5 (index) Add performance metrics 2025-08-15 00:48:52 +02:00
Viktor Lofgren
291ff0c4de (deps) Upgrade crawler commons to fix robots.txt-parser bug 2025-08-15 00:13:15 +02:00
Viktor
2fd2710355 Merge pull request #218 from MarginaliaSearch/o_direct_index
Replace document index btrees with a block based skiplist, get rid of mmap use O_DIRECT pread instead, use io_uring for positions reads
2025-08-14 23:57:09 +02:00
Viktor Lofgren
e3b957063d (native) Add fallbacks and configuration options for building on systems lacking liburing 2025-08-14 23:36:13 +02:00
Viktor Lofgren
aee262e5f6 (index) Safeguard against arena-leaks during exceptions
The GC would catch these eventually, but it's nice to clean up ourselves in a timely manner.
2025-08-14 19:28:31 +02:00
Viktor Lofgren
4a98a3c711 (skiplist) Move to a separate directory instead of in the btree module 2025-08-14 01:09:46 +02:00
Viktor Lofgren
68f52ca350 (test) Fix tests that works on my machine (TM) 2025-08-14 00:59:58 +02:00
Viktor Lofgren
2a2d951c2f (index) Fix unhinged default values for index.preparationThreads 2025-08-14 00:54:35 +02:00
Viktor Lofgren
379a1be074 (index) Add better timeout handling in UringQueue, fix slow memory leak on timeout exception 2025-08-14 00:52:50 +02:00
Viktor Lofgren
827aadafcd (uring) Reintroduce auto-slicing of excessively long read batches 2025-08-13 14:33:35 +02:00
Viktor Lofgren
aa7679d6ce (pool) Fix bug in exceptionally rare edge case leading to incorrect reads 2025-08-13 14:28:50 +02:00
Viktor Lofgren
6fe6de766d (pool) Fix SegmentMemoryPage storage 2025-08-13 13:17:14 +02:00
Viktor Lofgren
4245ac4c07 (doc) Update docs to reflect that we now need io_uring 2025-08-12 15:12:54 +02:00
Viktor Lofgren
1c49a0f5ad (index) Add system properties for toggling O_DIRECT mode for positions and spans 2025-08-12 15:11:13 +02:00
Viktor Lofgren
9a6e5f646d (docker) Add security_opt: seccomp:unconfined to docker-compose files
This is needed to access io_uring via docker.
2025-08-12 15:10:26 +02:00
Viktor Lofgren
fa92994a31 (uring) Fall back to simple I/O planning behavior when buffered mode is selected in UringFileReader 2025-08-11 23:44:38 +02:00
Viktor Lofgren
bc49406881 (build) Compatibility hack debian server 2025-08-11 23:26:53 +02:00
Viktor Lofgren
90325be447 (minor) Fix comments 2025-08-11 23:19:53 +02:00
Viktor Lofgren
dc89587af3 (index) Improve disk locality of the positions data 2025-08-11 21:17:12 +02:00
Viktor Lofgren
7b552afd6b (index) Improve disk locality of the positions data 2025-08-11 20:59:11 +02:00
Viktor Lofgren
73557edc67 (index) Improve disk locality of the positions data 2025-08-11 20:57:32 +02:00
Viktor Lofgren
83919e448a (index) Use O_DIRECT buffered reads for spans 2025-08-11 18:04:25 +02:00
Viktor Lofgren
6f5b75b84d (cleanup) Remove accidentally committed print stmt 2025-08-11 18:04:25 +02:00
Viktor Lofgren
db315e2813 (index) Use O_DIRECT position reads 2025-08-11 18:04:25 +02:00
Viktor Lofgren
e9977e08b7 (index) Block-align positions data
This will make reads more efficient, and possibly pave way for O_DIRECT reads of this data
2025-08-11 14:36:45 +02:00
Viktor Lofgren
1df3757e5f (native) Clean up io_uring code and check in execution queue, currently unused but nifty 2025-08-11 13:54:05 +02:00
Viktor Lofgren
ca283f9684 (native) Clean up native helpers and break them into their own library 2025-08-10 20:55:34 +02:00
Viktor Lofgren
85360e61b2 (index) Grow span writer buffer size
Apparently outlier spans can grow considerably large.
2025-08-10 17:20:38 +02:00
Viktor Lofgren
e2ccff21bc (index) Wait until ranking is finished in query execution 2025-08-09 23:40:30 +02:00
Viktor Lofgren
c5b5b0c699 (index) Permit fast termination of rejection filter execution 2025-08-09 23:36:59 +02:00
Viktor Lofgren
9a65946e22 (uring) Reduce queue size to 2048 to avoid ENOMEM on systems with default ulimits 2025-08-09 20:41:24 +02:00
Viktor Lofgren
1d2ab21e27 (index) Aggregate termdata reads into a single io_uring operation instead of one for each term 2025-08-09 17:43:18 +02:00
Viktor Lofgren
0610cc19ad (index) Fix double close errors 2025-08-09 17:05:38 +02:00
Viktor Lofgren
a676306a7f (skiplist) Fix bugs in seek operations 2025-08-09 17:00:27 +02:00
Viktor Lofgren
8d68cd14fb (skiplist) Even more aggressive forward pointers 2025-08-09 16:11:41 +02:00
Viktor Lofgren
4773c5a52b (index) Backport some changes made during performance evaluations 2025-08-09 15:19:41 +02:00
Viktor Lofgren
74bd562ae4 (index) Move I/O to separate threads to hopefully reduce contention a bit 2025-08-09 15:19:41 +02:00
Viktor Lofgren
c9751287b0 (index) Boost the buffer size used in PrioIndexEntrySource 2025-08-09 01:46:12 +02:00
Viktor Lofgren
5da24e3fc4 (index) Segregate full and priority query ranking 2025-08-09 00:39:31 +02:00
Viktor Lofgren
20a4e86eec (index) Use a confined arena in IndexResultRankingService 2025-08-08 22:08:35 +02:00
Viktor Lofgren
477a184948 (experiment) Allow early termination of include conditions in lookups 2025-08-08 19:12:54 +02:00
Viktor Lofgren
8940ce99db (perf) More statistics in perf testi 2025-08-08 18:57:25 +02:00
Viktor Lofgren
0ac0fa4dca (perf) More statistics in perf testi 2025-08-08 18:56:17 +02:00
Viktor Lofgren
942f15ef14 (skiplist) Use a linear-quadratic forward pointer scheme instead of an exponential 2025-08-08 16:57:15 +02:00
Viktor Lofgren
f668f33d5b (index) Tweaks and optimizations 2025-08-08 15:32:23 +02:00
Viktor Lofgren
6789975cd2 (index) Tweaks and optimizations 2025-08-08 15:30:48 +02:00
Viktor Lofgren
c3ba608776 (index) Split up evaluation tasks 2025-08-08 15:20:33 +02:00
Viktor Lofgren
733d2687fe (skiplist) Roll back the design change that segregated the values associated with documents into a separate file 2025-08-08 14:45:11 +02:00
Viktor Lofgren
f6daac8ed0 (index) MADVISE_RANDOM the index btrees 2025-08-07 21:14:28 +02:00
Viktor Lofgren
c2eeee4a06 (uring) Disable result set combination 2025-08-07 21:13:30 +02:00
Viktor Lofgren
3b0c701df4 (uring) Update uring timeout threshold 2025-08-07 20:13:25 +02:00
Viktor Lofgren
c6fb2db43b (index) Use a more SLA-aware execution scheduler 2025-08-07 20:13:15 +02:00
Viktor Lofgren
9bc8fe05ae (skiplist) Clean up search logic 2025-08-07 19:35:25 +02:00
Viktor Lofgren
440ffcf6f8 (skiplist) Fix bug in intersection-like algorithms 2025-08-07 02:18:14 +02:00
Viktor Lofgren
b07709cc72 (native) Disable expensive debug checks from uring code 2025-08-06 21:05:28 +02:00
Viktor Lofgren
9a6acdcbe0 (skiplist) Tag slow fuzz test as "slow" 2025-08-06 20:59:52 +02:00
Viktor Lofgren
23b9b0bf1b (index) Parametrize skip list block size and buffer pool sizes 2025-08-06 20:59:33 +02:00
Viktor Lofgren
749c8ed954 (pool) Correct buffer pool alignment 2025-08-06 20:56:34 +02:00
Viktor Lofgren
9f4b6939ca (skiplist) Fix condition for truncated block writing 2025-08-06 16:25:53 +02:00
Viktor Lofgren
1d08e44e8d (uring) Fadvise random access for uring buffered reads 2025-08-06 15:54:24 +02:00
Viktor Lofgren
fc2e156e78 (skiplist) Ensure docs file is a multiple BLOCK_SIZE bytes 2025-08-06 15:13:32 +02:00
Viktor Lofgren
5e68a89e9f (index) Improve error handling 2025-08-06 15:05:16 +02:00
Viktor Lofgren
d380661307 (index) Improve error handling 2025-08-06 14:31:06 +02:00
Viktor Lofgren
cccdf5c329 (pool) Check interrupt status in PoolLru's reclamation thread 2025-08-06 13:26:00 +02:00
Viktor Lofgren
f085b4ea12 (skiplist) Fix tests 2025-08-06 13:24:14 +02:00
Viktor Lofgren
e208f7d3ba (skiplist) Code clean up and added validation 2025-08-06 12:55:04 +02:00
Viktor Lofgren
b577085cb2 (pool) Use one contiguous memory allocation to encourage a HugePage allocation and reduce TLB thrashing 2025-08-06 12:49:46 +02:00
Viktor Lofgren
b9240476f6 (pool) Use one contiguous memory allocation to encourage a HugePage allocation and reduce TLB thrashing 2025-08-06 12:48:14 +02:00
Viktor Lofgren
8f50f86d0b (index) Fix error handling 2025-08-05 22:19:23 +02:00
Viktor Lofgren
e3b7ead7a9 (skiplist) Fix aggressive forward pointering 2025-08-05 20:47:38 +02:00
Viktor Lofgren
9a845ba604 (skiplist) EXPERIMENTAL - Store data in a separate file from document ids 2025-08-05 19:10:58 +02:00
Viktor Lofgren
b9381f1603 (skiplist) EXPERIMENTAL - Store data in a separate file from document ids 2025-08-05 17:35:13 +02:00
Viktor Lofgren
6a60127267 (skiplist) EXPERIMENTAL - Store data in a separate file from document ids 2025-08-05 16:54:39 +02:00
Viktor Lofgren
e8ffcfbb19 (skiplist) Correct binary search implementation, fix intersection logic 2025-08-04 14:49:09 +02:00
Viktor Lofgren
caf0850f81 (index) Clean up code 2025-08-04 00:12:35 +02:00
Viktor Lofgren
62e3bb675e (btree) Remove O_DIRECT btree implementation 2025-08-03 23:43:31 +02:00
Viktor Lofgren
4dc3e7da7a (perf) Remove warmup from perf test, it's not doing much 2025-08-03 21:19:54 +02:00
Viktor Lofgren
92b09883ec (index) Switch from AIO to io_uring
Turns out AIO is just bad, especially with buffered I/O; io_uring performs strictly better in this scenario.
2025-08-03 21:19:54 +02:00
Viktor Lofgren
87082b4ef8 (index) Use AIO for reading spans and positions
This performs slightly worse in benchmarks, but that's likely caused by hitting the page cache.

AIO will tend to perform better when we see cache misses, which is the expected case in production on real-world data.
2025-08-03 21:19:54 +02:00
Viktor Lofgren
84d3f6087f (skiplist) Parametrize skip list block size, increase to 4K pages 2025-08-03 21:19:54 +02:00
Viktor Lofgren
f93ba371a5 (pool) Fix the LRU to not deadlock and be shit 2025-08-03 21:19:54 +02:00
Viktor Lofgren
5eec27c68d (pool) Fix for 32 bit rollover in clockHand for LRU 2025-08-03 21:19:54 +02:00
Viktor Lofgren
ab01576f91 (pool) Use one global buffer pool instead of many small ones, improved LRU with gclock reclamation, skip list optimization 2025-08-03 21:19:54 +02:00
Viktor Lofgren
054e5ccf44 (pool) Testing synchronized to see if I can find the deadlock 2025-08-03 21:19:54 +02:00
Viktor Lofgren
4351ea5128 (pool) Fix buffer leak 2025-08-03 21:19:54 +02:00
Viktor Lofgren
49cfa3a5e9 (pool) Decrease LQB size 2025-08-03 21:19:54 +02:00
Viktor Lofgren
683854b23f (pool) Fix logging 2025-08-03 21:19:54 +02:00
Viktor Lofgren
e880fa8945 (pool) Simplify locking in PoolLru 2025-08-03 21:19:54 +02:00
Viktor Lofgren
2482dc572e (pool) Grow free queue size 2025-08-03 21:19:54 +02:00
Viktor Lofgren
4589f11898 (pool) More stats 2025-08-03 21:19:54 +02:00
Viktor Lofgren
e43b6e610b (pool) Adjust pool reclamation strategy 2025-08-03 21:19:53 +02:00
Viktor Lofgren
4772117a1f (skiplist) First stab at a skiplist replacement for btrees in the documents lists 2025-08-03 21:19:53 +02:00
Viktor Lofgren
3fc7ea521c (pool) Remove readahead and simplify the code 2025-08-03 21:19:53 +02:00
Viktor Lofgren
4372f5af03 (pool) More performant LRU pool + better instructions queue 2025-08-03 21:19:53 +02:00
Viktor Lofgren
4ad89b6c75 (pool) More performant LRU pool 2025-08-03 21:19:53 +02:00
Viktor Lofgren
ad0519e031 (index) Optimizations 2025-08-03 21:19:53 +02:00
Viktor Lofgren
596ece1230 (pool) Fix deadlock during pool starvation 2025-08-03 21:19:53 +02:00
Viktor Lofgren
07b6e1585b (pool) Bump pool sizes 2025-08-03 21:19:53 +02:00
Viktor Lofgren
cb5e2778eb (pool) Align the buffers with 512b 2025-08-03 21:19:53 +02:00
Viktor Lofgren
8f5ea7896c (btree) More debug information on numEntries = 0 scenario 2025-08-03 21:19:53 +02:00
Viktor Lofgren
76c398e0b1 (index) Fix lingering issues with previous optimizations 2025-08-03 21:19:53 +02:00
Viktor Lofgren
4a94f04a8d (btree) Debug logging 2025-08-03 21:19:53 +02:00
Viktor Lofgren
df72f670d4 (btree) Fix queryData 2025-08-03 21:19:53 +02:00
Viktor Lofgren
eaa22c2f5a (*) Logging 2025-08-03 21:19:53 +02:00
Viktor Lofgren
7be173aeca (pool) Only dump statistics if they say anything 2025-08-03 21:19:53 +02:00
Viktor Lofgren
36685bdca7 (btree) Fix retain implementation 2025-08-03 21:19:53 +02:00
Viktor Lofgren
ad04057609 (btree) Add short circuits when retain/rejecting on an empty tree 2025-08-03 21:19:53 +02:00
Viktor Lofgren
eb76ae22e2 (perf) Use lqb size 512 in perf test 2025-08-03 21:19:53 +02:00
Viktor Lofgren
4b858ab341 (btree) Cache retain/reject reads 2025-08-03 21:19:53 +02:00
Viktor Lofgren
c6e3c8aa3b (index) Focus pools to try to increase reuse 2025-08-03 21:19:53 +02:00
Viktor Lofgren
9128d3907c (index) Periodically dump buffer metrics 2025-08-03 21:19:53 +02:00
Viktor Lofgren
4ef16d13d4 (index) O_DIRECT based buffer pool for index reads 2025-07-30 15:04:23 +02:00
Viktor Lofgren
838a5626ec (index) Reduce query buffer size 2025-07-27 21:42:04 +02:00
Viktor Lofgren
6b426209c7 (index) Restore threshold for work stealing in query execution 2025-07-27 21:41:46 +02:00
Viktor Lofgren
452b5731d9 (index) Lower threshold for work stealing in query execution 2025-07-27 21:35:11 +02:00
Viktor Lofgren
c91cf49630 (search) Disable scribe.rip substitution
It does not appear to work well
2025-07-27 19:40:58 +02:00
Viktor Lofgren
8503030f18 (search) Fix rare exception in scribe.rip substitution 2025-07-27 19:38:52 +02:00
Viktor Lofgren
744f7d3ef7 (search) Fix rare exception in scribe.rip substitution 2025-07-27 19:34:03 +02:00
Viktor Lofgren
215e12afe9 (index) Shrink query buffer size 2025-07-27 17:33:46 +02:00
Viktor Lofgren
2716bce918 (index) Adjust timeout logic for evaluation 2025-07-27 17:28:34 +02:00
Viktor Lofgren
caf2e6fbb7 (index) Adjust timeout logic for evaluation 2025-07-27 17:27:07 +02:00
Viktor Lofgren
233f0acfb1 (index) Further reduce query buffer size 2025-07-27 17:13:08 +02:00
Viktor Lofgren
e3a4ff02e9 (index) Abandon ongoing evaluation tasks if time is up 2025-07-27 17:04:01 +02:00
Viktor Lofgren
c786283ae1 (index) Reduce query buffer size 2025-07-27 16:57:55 +02:00
Viktor Lofgren
a3f65ac0e0 (deploy) Trigger index deployment 2025-07-27 16:50:23 +02:00
Viktor
aba1a32af0 Merge pull request #217 from MarginaliaSearch/uncompressed-spans-file
Index optimizations
2025-07-27 16:49:27 +02:00
Viktor Lofgren
c9c442345b (perf) Change execution test to use processing rate instead of count 2025-07-27 16:39:51 +02:00
Viktor Lofgren
2e126ba30e (perf) Change execution test to use processing rate instead of count 2025-07-27 16:37:20 +02:00
Viktor Lofgren
2087985f49 (index) Implement work stealing in IndexQueryExecution as a better approach to backpressure 2025-07-27 16:29:57 +02:00
Viktor Lofgren
2b13ebd18b (index) Tweak evaluation backlog handling 2025-07-27 16:08:16 +02:00
Viktor Lofgren
6d92c125fe (perf) Fix perf test 2025-07-27 15:50:28 +02:00
Viktor Lofgren
f638cfa39a (index) Avoid possibility of negative timeout 2025-07-27 15:39:12 +02:00
Viktor Lofgren
89447c12af (index) Avoid possibility of negative timeout 2025-07-27 15:24:47 +02:00
Viktor Lofgren
c71fc46f04 (perf) Update perf test with execution scenario 2025-07-27 15:22:07 +02:00
Viktor Lofgren
f96874d828 (sequence) Implement a largestValue abort condition for minDistance()
This is something like 3500% faster in certain common scenarios
2025-07-27 15:05:50 +02:00
Viktor Lofgren
583a84d5a0 (index) Clean up of the index query execution logic 2025-07-27 15:05:50 +02:00
Viktor Lofgren
f65b946448 (index) Clean up code 2025-07-27 15:05:50 +02:00
Viktor Lofgren
3682815855 (index) Optimize sequence intersection for the n=1 case 2025-07-26 19:14:32 +02:00
Viktor Lofgren
3a94357660 (index) Perf test tool (WIP!) 2025-07-26 11:49:33 +02:00
Viktor Lofgren
673b0d3de1 (index) Perf test tool (WIP!) 2025-07-26 11:49:31 +02:00
Viktor Lofgren
ea942bc664 (spans) Add signature to the footer of the spans file, including a version byte so we can detect whether to use the old or new decoding logic 2025-07-25 12:07:18 +02:00
Viktor Lofgren
7ed5083c54 (index) Don't split results into chunks 2025-07-25 11:45:07 +02:00
Viktor Lofgren
08bb2c097b (refac) Clean up the data model used in the index service 2025-07-25 10:54:07 +02:00
Viktor Lofgren
495fb325be (sequence) Correct sequence intersection bug introduced in optimizations 2025-07-25 10:48:33 +02:00
Viktor Lofgren
05c25bbaec (chore) Clean up 2025-07-24 23:43:27 +02:00
Viktor Lofgren
2a028b84f3 (chore) Clean up 2025-07-24 20:12:56 +02:00
Viktor Lofgren
a091a23623 (ranking) Remove unnecessary metadata retrievals 2025-07-24 20:08:09 +02:00
Viktor Lofgren
e8897acb45 (ranking) Remove unnecessary metadata retrievals 2025-07-24 20:05:39 +02:00
Viktor Lofgren
b89ffcf2be (index) Evaluate hash based idx mapping in ForwardIndexReader 2025-07-24 19:47:27 +02:00
Viktor Lofgren
dbcc9055b0 (index) Evaluate using MinMaxPriorityQueue as guts of ResultPriorityQueue 2025-07-24 19:31:51 +02:00
Viktor Lofgren
d9740557f4 (sequence) Optimize intersection logic with a fast abort condition 2025-07-24 19:04:10 +02:00
Viktor Lofgren
0d6cd015fd (index) Evaluate reading all spans at once 2025-07-24 18:34:11 +02:00
Viktor Lofgren
c6034efcc8 (index) Cache value of bitset cardinality for speed 2025-07-24 17:24:55 +02:00
Viktor Lofgren
76068014ad (index) More spans optimizations 2025-07-24 15:03:43 +02:00
Viktor Lofgren
1c3ed67127 (index) Byte align document spans 2025-07-24 14:06:14 +02:00
Viktor Lofgren
fc0cb6bd9a (index) Reserve a larger size for IntArrayList in SeqenceOperations.findIntersections 2025-07-24 14:03:44 +02:00
Viktor Lofgren
c2601bac78 (converter) Remove unnecessary allocation of a 16 KB byte buffer 2025-07-24 13:25:37 +02:00
Viktor Lofgren
f5641b72e9 (index) Fix broken test 2025-07-24 13:21:05 +02:00
Viktor Lofgren
36efe2e219 (index) Optimize PositionsFileReader for concurrent reads
In benchmarks this is roughly twice as fast as the previous approach.  Main caveat being we need multiple file descriptors to avoid read instruction serialization by the kernel.  This is undesirable since the reads are complete scattershot and can't be reordered by the kernel in a way that optimizes anything.
2025-07-24 13:20:54 +02:00
Viktor Lofgren
983fe3829e (spans) Evaluate uncompressed spans files
Span decompression appears to be somewhat of a performance bottleneck.  This change removes compression of the spans file.  The spans are still compressed in transit between the converter and index constructor at this stage.  The change is intentionally kept small to just evaluate the performance implications, change in file sizes, etc.
2025-07-23 18:10:41 +02:00
Viktor Lofgren
668c87aa86 (ssr) Drop Executor from SSR as it no longer exists 2025-07-23 13:55:41 +02:00
Viktor Lofgren
9d3f9adb05 Force redeploy of everything 2025-07-23 13:36:02 +02:00
Viktor
a43a1773f1 Merge pull request #216 from MarginaliaSearch/deprecate-executor
Architecture: Remove the separate executor service and roll it into the index service.
2025-07-23 13:32:42 +02:00
Viktor Lofgren
1e7a3a3c4f (docs) Update docs to reflect the change 2025-07-23 13:18:23 +02:00
Viktor Lofgren
62b696b1c3 (architecture) Remove the separate executor service and merge it into the index service
The primary motivation for this is that in production, the large number of partitioned services has led to an intermittent exhaustion of available database connections, as each service has a connection pool.

The decision to have a separate executor service dates back to when the index service was very slow to start, and the executor didn't always spin off its memory-hungry tasks into separate processes, which meant the executor would sometimes OOM and crash, and it was undesirable to bring the index down with it.
2025-07-23 12:57:13 +02:00
Viktor Lofgren
f1a900f383 (search) Clean up front page mobile design a bit 2025-07-23 12:20:40 +02:00
Viktor Lofgren
700364b86d (sample) Remove debug logging
The problem sat in the desk chair all along
2025-07-21 15:08:20 +02:00
Viktor Lofgren
7e725ddaed (sample) Remove debug logging
The problem sat in the desk chair all along
2025-07-21 14:41:59 +02:00
Viktor Lofgren
120209e138 (sample) Diagnosing compression errors 2025-07-21 14:34:08 +02:00
Viktor Lofgren
a771a5b6ce (sample) Test different approach to decoding 2025-07-21 14:19:01 +02:00
Viktor Lofgren
dac5b54128 (sample) Better logging for sample errors 2025-07-21 14:03:58 +02:00
Viktor Lofgren
6cfb143c15 (sample) Compress sample HTML data and introduce new API for only getting requests 2025-07-21 13:55:25 +02:00
Viktor Lofgren
23c818281b (converter) Reduce DomSample logging for NOT_FOUND 2025-07-21 13:37:55 +02:00
Viktor Lofgren
8aad253cf6 (converter) Add more logging around dom sample data retrieval errors 2025-07-21 13:26:38 +02:00
Viktor Lofgren
556d7af9dc Reapply "(grpc) Use grpc-netty instead of grpc-netty-shaded"
This reverts commit b7a5219ed3.
2025-07-21 13:23:32 +02:00
Viktor Lofgren
b7a5219ed3 Revert "(grpc) Use grpc-netty instead of grpc-netty-shaded"
Reverting this change to see if it's the cause of some instability issues observed.
2025-07-21 13:10:41 +02:00
Viktor Lofgren
a23ec521fe (converter) Ensure features is mutable on DetailsWithWords as this is assumed later 2025-07-21 12:50:04 +02:00
Viktor Lofgren
fff3babc6d (classifier) Add rule for */pixel.gif as likely tracking pixels 2025-07-21 12:35:57 +02:00
Viktor Lofgren
b2bfb8217c (special) Trigger CD run 2025-07-21 12:28:24 +02:00
Viktor
3b2ac414dc Merge pull request #210 from MarginaliaSearch/ads-fingerprinting
Implement advertisement and popover identification based on DOM sample data
2025-07-21 12:25:31 +02:00
Viktor Lofgren
0ba6515a01 (converter) Ensure converter works well even when dom sample data is unavailable 2025-07-21 12:11:17 +02:00
Viktor Lofgren
16c6b0f151 (search) Add link to new discord community 2025-07-20 20:54:42 +02:00
Viktor Lofgren
e998692900 (converter) Ensure converter works well even when dom sample data is unavailable 2025-07-20 19:24:40 +02:00
Viktor Lofgren
eeb1695a87 (search) Clean up dead code 2025-07-20 19:15:01 +02:00
Viktor Lofgren
a0ab910940 (search) Clean up code 2025-07-20 19:14:13 +02:00
Viktor Lofgren
b9f31048d7 (search) Clean up overlong class names 2025-07-20 19:13:04 +02:00
Viktor Lofgren
12c304289a (grpc) Use grpc-netty instead of grpc-netty-shaded
This will help reduce runaway thread pool sizes
2025-07-20 17:36:25 +02:00
Viktor Lofgren
6ee01dabea (search) Drastically reduce worker thread count in search-service 2025-07-20 17:16:58 +02:00
Viktor Lofgren
1b80e282a7 (search) Drastically reduce worker thread count in search-service 2025-07-20 16:58:33 +02:00
Viktor Lofgren
a65d18f1d1 (client) Use virtual threads in a few more clients 2025-07-20 14:10:02 +02:00
Viktor Lofgren
90a1ff220b (ui) Clean up UI 2025-07-19 18:41:36 +02:00
Viktor Lofgren
d6c7092335 (classifier) More rules 2025-07-19 18:41:36 +02:00
Viktor Lofgren
b716333856 (classifier) Match regexes against the path + query only, as well as the full URL 2025-07-19 18:41:36 +02:00
Viktor Lofgren
b504b8482c (classifier) Add new tracker 2025-07-19 18:41:36 +02:00
Viktor Lofgren
80da1e9ad1 (ui) UI cleanup 2025-07-19 18:41:36 +02:00
Viktor Lofgren
d3f744a441 (ui) Add traffic report to overview menu 2025-07-19 18:41:36 +02:00
Viktor Lofgren
60fb539875 (ui) Add explanatory blurb 2025-07-19 18:41:35 +02:00
Viktor Lofgren
7f5094fedf (ui) Clean up UI 2025-07-19 18:41:35 +02:00
Viktor Lofgren
45066636a5 (classifier) Add classification for domains that make 3rd party requests 2025-07-19 18:41:35 +02:00
Viktor Lofgren
e2d6898c51 (search) Change tag colors to more pleasant ones 2025-07-19 18:41:35 +02:00
Viktor Lofgren
58ef767b94 (search) Improve traffic report UI 2025-07-19 18:41:35 +02:00
Viktor Lofgren
f9f268c67a (grpc) Improve error handling 2025-07-19 18:41:35 +02:00
Viktor Lofgren
f44c2bdee9 (chore) Cleanup 2025-07-19 18:41:35 +02:00
Viktor Lofgren
6fdf477c18 (refac) Move DomSampleClassification to top level 2025-07-19 18:41:35 +02:00
Viktor Lofgren
6b6e455e3f (classifier) Clean up xml 2025-07-19 18:41:35 +02:00
Viktor Lofgren
a3a126540c (classifier) Add README.md 2025-07-19 18:41:35 +02:00
Viktor Lofgren
842b19da40 (search) Mobile layout + phrasing 2025-07-19 18:41:35 +02:00
Viktor Lofgren
2a30e93bf0 (classifier) 2025-07-19 18:41:34 +02:00
Viktor Lofgren
3d998f12c0 (search) Use display name where possible 2025-07-19 18:41:34 +02:00
Viktor Lofgren
cbccc2ac23 (classification) Add /ccm/collect as an ads-related request 2025-07-19 18:41:34 +02:00
Viktor Lofgren
2cfc23f9b7 (search) Fix layout for mobile 2025-07-18 19:06:23 +02:00
Viktor Lofgren
88fe394cdb (request-classifier) Add rule for /pagead/ 2025-07-18 19:01:33 +02:00
Viktor Lofgren
f30fcebd4f Remove dead code 2025-07-18 18:56:42 +02:00
Viktor Lofgren
5d885927b4 (search) Fix layout and presentation 2025-07-18 17:54:47 +02:00
Viktor Lofgren
7622c8358e (request-classifier) Adjust flagging of a few hosts 2025-07-18 17:54:46 +02:00
Viktor Lofgren
69ed9aef47 (ddgt) Load global tracker data 2025-07-18 17:02:50 +02:00
Viktor Lofgren
4c78c223da (search) Fix endpoint collection 2025-07-18 16:59:05 +02:00
Viktor Lofgren
71b9935dd6 (search) Add warmup to programmatic tailwind classes, fix word break 2025-07-18 16:49:31 +02:00
Viktor Lofgren
ad38f2fd83 (search) Hide classification tag on unclassified requests 2025-07-18 15:45:40 +02:00
Viktor Lofgren
9c47388846 (search) Improve display ordering 2025-07-18 15:44:55 +02:00
Viktor Lofgren
d9ab10e33f (search) Fix tracker data for the correct domain 2025-07-18 15:29:15 +02:00
Viktor Lofgren
e13ea7f42b (search) Sort results by classifications 2025-07-18 14:51:35 +02:00
Viktor Lofgren
f38daeb036 (WIP) First stab at a GUI for viewing network traffic
The change also moves the dom classifier to a separate package so that it can be accessed from both the search service and converter.

The change also adds a parser for DDG's tracker radar data.
2025-07-18 13:58:57 +02:00
Viktor Lofgren
6e214293e5 (ping) Fix backoff value overflow 2025-07-16 19:50:12 +02:00
Viktor Lofgren
52582a6d7d (experiment) Also add clients to loom experiment 2025-07-16 18:08:00 +02:00
Viktor Lofgren
ec0e39ad32 (experiment) Also add clients to loom experiment 2025-07-16 17:28:57 +02:00
Viktor Lofgren
6a15aee4b0 (ping) Fix arithmetic errors in backoff strategy due to long overflow 2025-07-16 17:23:36 +02:00
Viktor Lofgren
bd5111e8a2 (experimental) Add flag for using loom/virtual threads in gRPC executor 2025-07-16 17:12:07 +02:00
Viktor Lofgren
1ecbeb0272 (doc) Update ROADMAP.md 2025-07-14 13:38:34 +02:00
Viktor Lofgren
b91354925d (converter) Index documents even when they are short
... but assign short documents a special flag and penalize them in index lookups
2025-07-14 12:24:25 +02:00
Viktor Lofgren
3f85c9c154 (refac) Clean up code 2025-07-14 11:55:21 +02:00
Viktor Lofgren
390f053406 (api) Add query parameter 'dc' for specifying the max number of results per domain 2025-07-14 10:09:30 +02:00
Viktor Lofgren
89e03d6914 (chore) Idiomatic error handling in gRPC clients
responseObserver.onError(...) should be passed Status.WHATEVER.foo().asRuntimeException() and not random throwables as was done before.
2025-07-13 02:59:22 +02:00
Viktor Lofgren
14e0bc9f26 (index) Add comment about encoding caveat 2025-07-13 02:47:00 +02:00
Viktor Lofgren
7065b46c6f (index) Add penalties for new feature flags from dom sample 2025-07-13 02:37:30 +02:00
Viktor Lofgren
0372190c90 (index, refac) Move domain ranking to a better named package 2025-07-13 02:37:29 +02:00
Viktor Lofgren
ceaf32fb90 (converter) Integrate dom sample features into the converter 2025-07-13 01:38:28 +02:00
Viktor Lofgren
b03c43224c (search) Fix redirects in new search UI 2025-07-11 23:44:45 +02:00
Viktor Lofgren
b57db01415 (converter) Clean out some old and redundant advertisement and tracking detection code 2025-07-11 19:32:25 +02:00
Viktor Lofgren
ce7d522608 (converter) First basic hook-in of the new dom sample classifier into the converter workflow 2025-07-11 16:57:37 +02:00
Viktor Lofgren
18649b6ee9 (converter) Move DomSampleClassifier to converter's code tree 2025-07-11 16:12:48 +02:00
Viktor Lofgren
f6417aef1a (converter) Additional code cleanup 2025-07-11 15:58:48 +02:00
Viktor Lofgren
2aa7e376b0 (converter) Clean up code around document deduplication 2025-07-11 15:54:28 +02:00
Viktor Lofgren
f33bc44860 (dom-sample) Create API for fetching DOM sample data across services 2025-07-11 15:41:10 +02:00
Viktor Lofgren
a2826efd44 (dom-sample) First stab at classifying outgoing requests from DOM sample data 2025-07-11 15:41:10 +02:00
522 changed files with 13280 additions and 8703 deletions

9
.gitignore vendored
View File

@@ -7,4 +7,11 @@ build/
lombok.config
Dockerfile
run
jte-classes
jte-classes
.classpath
.project
.settings
.factorypath
bin/
*.log
*.hprof

View File

@@ -48,10 +48,6 @@ filter for any API consumer.
I've talked to the stract dev and he does not think it's a good idea to mimic their optics language, which is quite ad-hoc, but instead to work together to find some new common description language for this.
## Show favicons next to search results
This is expected from search engines. Basic proof of concept sketch of fetching this data has been done, but the feature is some way from being reality.
## Specialized crawler for github
One of the search engine's biggest limitations right now is that it does not index github at all. A specialized crawler that fetches at least the readme.md would go a long way toward providing search capabilities in this domain.
@@ -66,6 +62,10 @@ The documents database probably should have some sort of flag indicating it's a
PDF parsing is known to be a bit of a security liability so some thought needs to be put in
that direction as well.
## Show favicons next to search results (COMPLETED 2025-03)
This is expected from search engines. Basic proof of concept sketch of fetching this data has been done, but the feature is some way from being reality.
## Web Design Overhaul (COMPLETED 2025-01)
The design is kinda clunky and hard to maintain, and needlessly outdated-looking.

View File

@@ -6,6 +6,7 @@ plugins {
// This is a workaround for a bug in the Jib plugin that causes it to stall randomly
// https://github.com/GoogleContainerTools/jib/issues/3347
id 'com.google.cloud.tools.jib' version '3.4.5' apply(false)
id 'com.adarshr.test-logger' version '4.0.0'
}
group 'marginalia'
@@ -31,7 +32,10 @@ subprojects.forEach {it ->
jvmArgs += ['--enable-preview']
}
it.tasks.withType(Test).configureEach {
jvmArgs += ['--enable-preview']
jvmArgs += ['--enable-preview',
'--enable-native-access=ALL-UNNAMED',
'--sun-misc-unsafe-memory-access=allow',
'-Dsystem.uringQueueCount=1']
}
// Enable reproducible builds for the entire project

View File

@@ -114,4 +114,7 @@ public class WmsaHome {
}
public static Path getLangugeConfig() {
return getHomePath().resolve("conf/languages.xml");
}
}

View File

@@ -6,7 +6,6 @@ import com.google.inject.name.Named;
import gnu.trove.list.TLongList;
import nu.marginalia.linkdb.model.DocdbUrlDetail;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.UrlIdCodec;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -14,7 +13,6 @@ import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.sql.Connection;
import java.sql.DriverManager;
@@ -104,7 +102,7 @@ public class DocumentDbReader {
}
try (var stmt = connection.prepareStatement("""
SELECT ID, URL, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR
SELECT ID, URL, TITLE, DESCRIPTION, LANGUAGE, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR
FROM DOCUMENT WHERE ID = ?
""")) {
for (int i = 0; i < ids.size(); i++) {
@@ -118,6 +116,7 @@ public class DocumentDbReader {
url,
rs.getString("TITLE"),
rs.getString("DESCRIPTION"),
rs.getString("LANGUAGE"),
rs.getDouble("QUALITY"),
rs.getString("FORMAT"),
rs.getInt("FEATURES"),

View File

@@ -41,8 +41,8 @@ public class DocumentDbWriter {
public void add(List<DocdbUrlDetail> docdbUrlDetail) throws SQLException {
try (var stmt = connection.prepareStatement("""
INSERT OR IGNORE INTO DOCUMENT(ID, URL, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
INSERT OR IGNORE INTO DOCUMENT(ID, URL, TITLE, DESCRIPTION, LANGUAGE, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""")) {
int i = 0;
@@ -54,15 +54,16 @@ public class DocumentDbWriter {
stmt.setString(3, document.title());
stmt.setString(4, document.description());
stmt.setInt(5, document.wordsTotal());
stmt.setString(6, document.format());
stmt.setInt(7, document.features());
stmt.setLong(8, document.dataHash());
stmt.setDouble(9, document.urlQuality());
stmt.setString(5, document.language());
stmt.setInt(6, document.wordsTotal());
stmt.setString(7, document.format());
stmt.setInt(8, document.features());
stmt.setLong(9, document.dataHash());
stmt.setDouble(10, document.urlQuality());
if (document.pubYear() == null) {
stmt.setInt(10, 0);
stmt.setInt(11, 0);
} else {
stmt.setInt(10, document.pubYear());
stmt.setInt(11, document.pubYear());
}
stmt.addBatch();

View File

@@ -6,6 +6,7 @@ public record DocdbUrlDetail(long urlId,
EdgeUrl url,
String title,
String description,
String language,
double urlQuality,
String format,
int features,

View File

@@ -6,6 +6,7 @@ CREATE TABLE DOCUMENT (
STATE INT,
TITLE TEXT NOT NULL,
DESCRIPTION TEXT NOT NULL,
LANGUAGE TEXT NOT NULL,
WORDS_TOTAL INTEGER NOT NULL,
FORMAT TEXT NOT NULL,

View File

@@ -23,6 +23,7 @@ public class DocumentDbWriterTest {
new nu.marginalia.model.EdgeUrl("http", new EdgeDomain("example.com"), null, "/", null),
"Test",
"This is a test",
"en",
-4.,
"XHTML",
5,

View File

@@ -5,13 +5,15 @@ import java.util.Collection;
public enum HtmlFeature {
// Note, the first 32 of these features are bit encoded in the database
// so be sure to keep anything that's potentially important toward the top
// of the list
// of the list; but adding new values will shift the encoded values and break
// binary compatibility! Scroll down for a marker where you should add new values
// if they need to be accessible from IndexResultScoreCalculator!
MEDIA( "special:media"),
JS("special:scripts"),
AFFILIATE_LINK( "special:affiliate"),
TRACKING("special:tracking"),
TRACKING_ADTECH("special:ads"), // We'll call this ads for now
TRACKING_ADTECH("special:adtech"),
KEBAB_CASE_URL("special:kcurl"), // https://www.example.com/urls-that-look-like-this/
LONG_URL("special:longurl"),
@@ -30,6 +32,15 @@ public enum HtmlFeature {
PDF("format:pdf"),
POPOVER("special:popover"),
CONSENT("special:consent"),
SHORT_DOCUMENT("special:shorty"),
THIRD_PARTY_REQUESTS("special:3pr"),
// Here! It is generally safe to add additional values here without
// disrupting the encoded values used by the DocumentValuator
// class in the index!
/** For fingerprinting and ranking */
OPENGRAPH("special:opengraph"),
OPENGRAPH_IMAGE("special:opengraph:image"),
@@ -67,6 +78,7 @@ public enum HtmlFeature {
S3_FEATURE("special:s3"),
MISSING_DOM_SAMPLE("special:nosample"),
UNKNOWN("special:uncategorized");

View File

@@ -7,7 +7,6 @@ public enum ServiceId {
Search("search-service"),
Index("index-service"),
Query("query-service"),
Executor("executor-service"),
Control("control-service"),

View File

@@ -13,6 +13,7 @@ import nu.marginalia.service.discovery.property.ServicePartition;
import nu.marginalia.util.NamedExecutorFactory;
import java.util.concurrent.Executor;
import java.util.concurrent.Executors;
import java.util.function.Function;
@Singleton
@@ -20,10 +21,15 @@ public class GrpcChannelPoolFactory {
private final NodeConfigurationWatcher nodeConfigurationWatcher;
private final ServiceRegistryIf serviceRegistryIf;
private static final Executor executor = NamedExecutorFactory.createFixed("gRPC-Channel-Pool",
Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
private static final Executor offloadExecutor = NamedExecutorFactory.createFixed("gRPC-Offload-Pool",
Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
private static final Executor executor = useLoom
? Executors.newVirtualThreadPerTaskExecutor()
: NamedExecutorFactory.createFixed("gRPC-Channel-Pool", Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
private static final Executor offloadExecutor = useLoom
? Executors.newVirtualThreadPerTaskExecutor()
: NamedExecutorFactory.createFixed("gRPC-Offload-Pool", Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
@Inject
public GrpcChannelPoolFactory(NodeConfigurationWatcher nodeConfigurationWatcher,

View File

@@ -2,6 +2,7 @@ package nu.marginalia.service.client;
import com.google.common.collect.Sets;
import io.grpc.ManagedChannel;
import io.grpc.StatusRuntimeException;
import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.discovery.monitor.ServiceChangeMonitor;
import nu.marginalia.service.discovery.property.PartitionTraits;
@@ -206,6 +207,11 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
}
for (var e : exceptions) {
if (e instanceof StatusRuntimeException se) {
throw se; // Re-throw SRE as-is
}
// If there are other exceptions, log them
logger.error(grpcMarker, "Failed to call service {}", serviceKey, e);
}

View File

@@ -1,9 +1,9 @@
package nu.marginalia.service.server;
import io.grpc.Server;
import io.grpc.netty.shaded.io.grpc.netty.NettyServerBuilder;
import io.grpc.netty.shaded.io.netty.channel.nio.NioEventLoopGroup;
import io.grpc.netty.shaded.io.netty.channel.socket.nio.NioServerSocketChannel;
import io.grpc.netty.NettyServerBuilder;
import io.netty.channel.nio.NioEventLoopGroup;
import io.netty.channel.socket.nio.NioServerSocketChannel;
import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.discovery.property.ServiceKey;
import nu.marginalia.service.discovery.property.ServicePartition;
@@ -13,9 +13,14 @@ import nu.marginalia.util.NamedExecutorFactory;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
public class GrpcServer {
private final Server server;
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
public GrpcServer(ServiceConfiguration config,
ServiceRegistryIf serviceRegistry,
ServicePartition partition,
@@ -26,13 +31,19 @@ public class GrpcServer {
int nThreads = Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 16);
// Start the gRPC server
ExecutorService workExecutor = useLoom ?
Executors.newVirtualThreadPerTaskExecutor() :
NamedExecutorFactory.createFixed("nettyExecutor", nThreads);
var grpcServerBuilder = NettyServerBuilder.forAddress(new InetSocketAddress(config.bindAddress(), port))
.executor(NamedExecutorFactory.createFixed("nettyExecutor", nThreads))
.executor(workExecutor)
.workerEventLoopGroup(new NioEventLoopGroup(nThreads, NamedExecutorFactory.createFixed("Worker-ELG", nThreads)))
.bossEventLoopGroup(new NioEventLoopGroup(nThreads, NamedExecutorFactory.createFixed("Boss-ELG", nThreads)))
.channelType(NioServerSocketChannel.class);
for (var grpcService : grpcServices) {
if (!grpcService.shouldRegisterService()) {
continue;
}

View File

@@ -125,8 +125,7 @@ public class JoobyService {
// Set a cap on the number of worker threads, as Jooby's default value does not seem to consider
// multi-tenant servers with high thread counts, and spins up an exorbitant number of threads in that
// scenario
options.setWorkerThreads(Math.min(128, options.getWorkerThreads()));
options.setWorkerThreads(Math.min(16, options.getWorkerThreads()));
jooby.setServerOptions(options);

View File

@@ -66,7 +66,7 @@ public class NodeStatusWatcher {
fileStorageService.createStorageBase("Crawl Data", Path.of("/storage"), nodeId, FileStorageBaseType.STORAGE);
fileStorageService.createStorageBase("Work Area", Path.of("/work"), nodeId, FileStorageBaseType.WORK);
persistence.sendNewMessage("executor-service:"+nodeId,
persistence.sendNewMessage("index-service:"+nodeId,
null,
null,
"FIRST-BOOT",

View File

@@ -189,7 +189,7 @@ public class ExecutorClient {
String uriPath = "/transfer/file/" + fileStorage.id();
String uriQuery = "path=" + URLEncoder.encode(path, StandardCharsets.UTF_8);
var endpoints = registry.getEndpoints(ServiceKey.forRest(ServiceId.Executor, fileStorage.node()));
var endpoints = registry.getEndpoints(ServiceKey.forRest(ServiceId.Index, fileStorage.node()));
if (endpoints.isEmpty()) {
throw new RuntimeException("No endpoints for node " + fileStorage.node());
}

View File

@@ -22,7 +22,6 @@ dependencies {
implementation project(':code:processes:ping-process')
implementation project(':code:processes:new-domain-process')
implementation project(':code:processes:converting-process')
implementation project(':code:processes:index-constructor-process')
implementation project(':code:common:config')
implementation project(':code:common:model')
@@ -34,7 +33,7 @@ dependencies {
implementation project(':third-party:commons-codec')
implementation project(':code:libraries:message-queue')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:functions:language-processing')
implementation project(':code:functions:link-graph:api')
implementation project(':code:functions:live-capture:api')

View File

@@ -1,6 +1,7 @@
package nu.marginalia.execution;
import com.google.inject.Inject;
import io.grpc.Status;
import io.grpc.stub.StreamObserver;
import nu.marginalia.actor.ExecutorActor;
import nu.marginalia.actor.ExecutorActorControlService;
@@ -36,7 +37,7 @@ public class ExecutorCrawlGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -52,7 +53,7 @@ public class ExecutorCrawlGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -66,7 +67,7 @@ public class ExecutorCrawlGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -80,7 +81,7 @@ public class ExecutorCrawlGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -98,7 +99,7 @@ public class ExecutorCrawlGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

View File

@@ -2,6 +2,7 @@ package nu.marginalia.execution;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import io.grpc.Status;
import io.grpc.stub.StreamObserver;
import nu.marginalia.actor.ExecutorActor;
import nu.marginalia.actor.ExecutorActorControlService;
@@ -38,7 +39,7 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -57,7 +58,7 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -73,7 +74,7 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -87,7 +88,7 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -99,7 +100,7 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -114,14 +115,14 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@Override
public void exportAllAtags(Empty request, StreamObserver<Empty> responseObserver) {
if (serviceConfiguration.node() != 1) {
responseObserver.onError(new IllegalArgumentException("Export all atags is only available on node 1"));
responseObserver.onError(Status.UNAVAILABLE.withDescription("Export all atags is only available on node 1").asRuntimeException());
}
try {
actorControlService.startFrom(ExecutorActor.PREC_EXPORT_ALL,
@@ -131,7 +132,7 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -145,7 +146,7 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -159,7 +160,7 @@ public class ExecutorExportGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
}

View File

@@ -1,6 +1,7 @@
package nu.marginalia.execution;
import com.google.inject.Inject;
import io.grpc.Status;
import io.grpc.stub.StreamObserver;
import nu.marginalia.WmsaHome;
import nu.marginalia.actor.ActorApi;
@@ -58,7 +59,7 @@ public class ExecutorGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -70,7 +71,7 @@ public class ExecutorGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -82,7 +83,7 @@ public class ExecutorGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -96,7 +97,7 @@ public class ExecutorGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -112,7 +113,7 @@ public class ExecutorGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -128,7 +129,7 @@ public class ExecutorGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -203,7 +204,7 @@ public class ExecutorGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -229,7 +230,7 @@ public class ExecutorGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -276,7 +277,7 @@ public class ExecutorGrpcService
}
catch (Exception e) {
logger.error("Failed to update nsfw filters", e);
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
}

View File

@@ -1,6 +1,7 @@
package nu.marginalia.execution;
import com.google.inject.Inject;
import io.grpc.Status;
import io.grpc.stub.StreamObserver;
import nu.marginalia.actor.ExecutorActor;
import nu.marginalia.actor.ExecutorActorControlService;
@@ -33,7 +34,7 @@ public class ExecutorSideloadGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -48,7 +49,7 @@ public class ExecutorSideloadGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -63,7 +64,7 @@ public class ExecutorSideloadGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -78,7 +79,7 @@ public class ExecutorSideloadGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@@ -93,7 +94,7 @@ public class ExecutorSideloadGrpcService
responseObserver.onCompleted();
}
catch (Exception e) {
responseObserver.onError(e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}

View File

@@ -5,7 +5,6 @@ import com.google.inject.Singleton;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.ConverterMain;
import nu.marginalia.crawl.CrawlerMain;
import nu.marginalia.index.IndexConstructorMain;
import nu.marginalia.livecrawler.LiveCrawlerMain;
import nu.marginalia.loading.LoaderMain;
import nu.marginalia.ndp.NdpMain;
@@ -57,7 +56,7 @@ public class ProcessSpawnerService {
LIVE_CRAWLER(LiveCrawlerMain.class),
CONVERTER(ConverterMain.class),
LOADER(LoaderMain.class),
INDEX_CONSTRUCTOR(IndexConstructorMain.class),
INDEX_CONSTRUCTOR("nu.marginalia.index.IndexConstructorMain"),
NDP(NdpMain.class),
EXPORT_TASKS(ExportTasksMain.class),
;
@@ -66,6 +65,9 @@ public class ProcessSpawnerService {
ProcessId(Class<? extends ProcessMainClass> mainClass) {
this.mainClass = mainClass.getName();
}
ProcessId(String mainClassFullName) {
this.mainClass = mainClassFullName;
}
List<String> envOpts() {
String variable = switch (this) {

View File

@@ -1,4 +1,4 @@
package nu.marginalia.executor;
package nu.marginalia.svc;
import com.google.inject.Inject;
import nu.marginalia.storage.FileStorageService;

View File

@@ -1,5 +1,5 @@
The execution subsystem is responsible for the execution of long-running tasks on each
index node. It lives in the [executor-service](../services-core/executor-service) module.
index node. It lives in the [index-service](../services-core/index-service) module.
It accomplishes this using the [message queue and actor library](../libraries/message-queue/),
which permits program state to survive crashes and reboots.

View File

@@ -1,4 +1,4 @@
package nu.marginalia.executor;
package nu.marginalia.svc;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;

View File

@@ -2,6 +2,8 @@ package nu.marginalia.api.domains;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.api.domains.model.DomainInformation;
import nu.marginalia.api.domains.model.SimilarDomain;
import nu.marginalia.service.client.GrpcChannelPoolFactory;
import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
import nu.marginalia.service.discovery.property.ServiceKey;
@@ -10,16 +12,19 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
import java.util.concurrent.*;
import nu.marginalia.api.domains.model.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
@Singleton
public class DomainInfoClient {
private static final Logger logger = LoggerFactory.getLogger(DomainInfoClient.class);
private final GrpcSingleNodeChannelPool<DomainInfoAPIGrpc.DomainInfoAPIBlockingStub> channelPool;
private final ExecutorService executor = Executors.newWorkStealingPool(8);
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
private static final ExecutorService executor = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newWorkStealingPool(8);
@Inject
public DomainInfoClient(GrpcChannelPoolFactory factory) {

View File

@@ -1,8 +1,7 @@
plugins {
id 'java'
id 'jvm-test-suite'
id 'gg.jte.gradle' version '3.1.15'
}
java {
@@ -14,18 +13,18 @@ java {
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:common:config')
implementation libs.bundles.slf4j
implementation project(':third-party:rdrpostagger')
implementation project(':third-party:porterstemmer')
implementation project(':third-party:commons-codec')
implementation project(':third-party:openzim')
implementation project(':code:common:model')
implementation project(':code:common:config')
implementation project(':code:common:service')
implementation project(':code:libraries:easy-lsh')
implementation project(':code:libraries:array')
implementation project(':code:libraries:blocking-thread-pool')
implementation libs.bundles.slf4j
implementation project(':code:libraries:coded-sequence')
implementation libs.notnull
implementation libs.bundles.jooby
implementation libs.guava
implementation dependencies.create(libs.guice.get()) {
@@ -42,3 +41,9 @@ dependencies {
testImplementation libs.bundles.junit
testImplementation libs.mockito
}
jte {
sourceDirectory = file('resources/ltt/jte').toPath()
targetDirectory = file('build/classes/jte-precompiled').toPath()
generate()
}

View File

@@ -1,4 +1,4 @@
package nu.marginalia.converting.processor.logic.dom;
package nu.marginalia.dom;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.converting.processor.logic.dom;
package nu.marginalia.dom;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;

View File

@@ -16,8 +16,6 @@ public class DocumentKeywordExtractor {
private final TermFrequencyDict dict;
private final KeywordExtractor keywordExtractor = new KeywordExtractor();
private final DocumentPositionMapper positionMapper = new DocumentPositionMapper();
@Inject
public DocumentKeywordExtractor(TermFrequencyDict dict) {
@@ -37,35 +35,54 @@ public class DocumentKeywordExtractor {
public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData dld, LinkTexts linkTexts, EdgeUrl url) {
var tfIdfCounts = new WordsTfIdfCounts(dict, keywordExtractor, dld);
if (dld.language().hasPosParsing()) {
DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
var titleKeywords = new TitleKeywords(keywordExtractor, dld);
var nameLikeKeywords = new NameLikeKeywords(keywordExtractor, dld, 2);
var subjectLikeKeywords = new SubjectLikeKeywords(keywordExtractor, tfIdfCounts, dld);
var artifactKeywords = new ArtifactKeywords(dld);
var urlKeywords = new UrlKeywords(url);
var artifactKeywords = new ArtifactKeywords(dld);
var urlKeywords = new UrlKeywords(url);
var positionMapper = new DocumentPositionMapper();
var keywordMetadata = KeywordMetadata.builder()
.titleKeywords(titleKeywords)
.nameLikeKeywords(nameLikeKeywords)
.subjectLikeKeywords(subjectLikeKeywords)
.urlKeywords(urlKeywords)
.build();
var tfIdfCounts = new WordsTfIdfCounts(dict, dld);
DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
var titleKeywords = new TitleKeywords(dld);
var nameLikeKeywords = new NameLikeKeywords(dld, 2);
var subjectLikeKeywords = new SubjectLikeKeywords(tfIdfCounts, dld);
var keywordMetadata = KeywordMetadata.builder()
.titleKeywords(titleKeywords)
.nameLikeKeywords(nameLikeKeywords)
.subjectLikeKeywords(subjectLikeKeywords)
.urlKeywords(urlKeywords)
.build();
positionMapper.mapPositionsAndExtractSimpleKeywords(wordsBuilder, keywordMetadata, dld, linkTexts);
createNGramTermsFromSet(wordsBuilder, keywordMetadata, titleKeywords);
createNGramTermsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords);
createNGramTermsFromSet(wordsBuilder, keywordMetadata, nameLikeKeywords);
positionMapper.mapPositionsAndExtractSimpleKeywords(wordsBuilder, keywordMetadata, dld, linkTexts);
var importantWords = getImportantWords(tfIdfCounts, nameLikeKeywords, subjectLikeKeywords, wordsBuilder);
createNGramTermsFromSet(wordsBuilder, keywordMetadata, titleKeywords);
createNGramTermsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords);
createNGramTermsFromSet(wordsBuilder, keywordMetadata, nameLikeKeywords);
wordsBuilder.addImportantWords(importantWords);
wordsBuilder.addAllSyntheticTerms(artifactKeywords.getWords());
var importantWords = getImportantWords(tfIdfCounts, nameLikeKeywords, subjectLikeKeywords, wordsBuilder);
return wordsBuilder;
wordsBuilder.addImportantWords(importantWords);
wordsBuilder.addAllSyntheticTerms(artifactKeywords.getWords());
return wordsBuilder;
}
else {
DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
var artifactKeywords = new ArtifactKeywords(dld);
var urlKeywords = new UrlKeywords(url);
var positionMapper = new DocumentPositionMapper();
var keywordMetadata = KeywordMetadata.builder()
.urlKeywords(urlKeywords)
.build();
positionMapper.mapPositionsAndExtractSimpleKeywords(wordsBuilder, keywordMetadata, dld, linkTexts);
wordsBuilder.addAllSyntheticTerms(artifactKeywords.getWords());
return wordsBuilder;
}
}
private static Collection<String> getImportantWords(WordsTfIdfCounts tfIdfCounts, NameLikeKeywords nameLikeKeywords, SubjectLikeKeywords subjectLikeKeywords, DocumentKeywordsBuilder wordsBuilder) {

View File

@@ -3,7 +3,9 @@ package nu.marginalia.keyword;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.pos.PosPatternCategory;
import nu.marginalia.language.sentence.tag.HtmlTag;
import java.util.ArrayList;
@@ -17,8 +19,6 @@ import static java.lang.Math.sqrt;
*/
public class DocumentPositionMapper {
private final KeywordExtractor keywordExtractor = new KeywordExtractor();
public void mapPositionsAndExtractSimpleKeywords(DocumentKeywordsBuilder wordsBuilder,
KeywordMetadata metadata,
DocumentLanguageData dld,
@@ -38,12 +38,14 @@ public class DocumentPositionMapper {
}
int mapDocumentPositions(DocumentKeywordsBuilder wordsBuilder,
public int mapDocumentPositions(DocumentKeywordsBuilder wordsBuilder,
KeywordMetadata metadata,
DocumentLanguageData dld)
{
LanguageDefinition languageDefinition = dld.language();
List<SpanRecorder> spanRecorders = new ArrayList<>();
for (var htmlTag : HtmlTag.includedTags) {
if (!htmlTag.exclude) {
@@ -80,7 +82,7 @@ public class DocumentPositionMapper {
}
}
for (var names : keywordExtractor.getProperNames(sent)) {
for (var names : languageDefinition.matchGrammarPattern(sent, PosPatternCategory.NAME)) {
WordRep rep = new WordRep(sent, names);
byte meta = metadata.getMetadataForWord(rep.stemmed);
@@ -193,48 +195,4 @@ public class DocumentPositionMapper {
return false;
}
/** Helper class to record spans of words */
private static class SpanRecorder {
private final List<DocumentKeywordsBuilder.DocumentWordSpan> spans = new ArrayList<>();
private final HtmlTag htmlTag;
private int start = 0;
public SpanRecorder(HtmlTag htmlTag) {
this.htmlTag = htmlTag;
}
public void update(DocumentSentence sentence, int pos) {
assert pos > 0;
if (sentence.htmlTags.contains(htmlTag)) {
if (start <= 0) start = pos;
}
else if (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY)
{
// special case for body tag, we match against no tag on the sentence
if (start <= 0) start = pos;
}
else {
if (start > 0) {
spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
start = 0;
}
}
}
public void endCurrentSpan(int pos) {
if (start > 0) {
spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
start = 0;
}
}
public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) {
if (start > 0) {
spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length));
start = 0;
}
return spans;
}
}
}

View File

@@ -6,18 +6,24 @@ import nu.marginalia.keyword.extractors.TitleKeywords;
import nu.marginalia.keyword.extractors.UrlKeywords;
import nu.marginalia.model.idx.WordFlags;
import javax.annotation.Nullable;
public class KeywordMetadata {
@Nullable
private final TitleKeywords titleKeywords;
@Nullable
private final NameLikeKeywords nameLikeKeywords;
@Nullable
private final SubjectLikeKeywords subjectLikeKeywords;
@Nullable
private final UrlKeywords urlKeywords;
public KeywordMetadata(
TitleKeywords titleKeywords,
NameLikeKeywords nameLikeKeywords,
SubjectLikeKeywords subjectLikeKeywords,
UrlKeywords urlKeywords) {
@Nullable TitleKeywords titleKeywords,
@Nullable NameLikeKeywords nameLikeKeywords,
@Nullable SubjectLikeKeywords subjectLikeKeywords,
@Nullable UrlKeywords urlKeywords) {
this.titleKeywords = titleKeywords;
this.nameLikeKeywords = nameLikeKeywords;
this.subjectLikeKeywords = subjectLikeKeywords;
@@ -32,23 +38,23 @@ public class KeywordMetadata {
byte flags = 0;
if (subjectLikeKeywords.contains(stemmed)) {
if (subjectLikeKeywords != null && subjectLikeKeywords.contains(stemmed)) {
flags |= WordFlags.Subjects.asBit();
}
if (nameLikeKeywords.contains(stemmed)) {
if (nameLikeKeywords != null && nameLikeKeywords.contains(stemmed)) {
flags |= WordFlags.NamesWords.asBit();
}
if (titleKeywords.contains(stemmed)) {
if (titleKeywords != null && titleKeywords.contains(stemmed)) {
flags |= WordFlags.Title.asBit();
}
if (urlKeywords.containsUrl(stemmed)) {
if (urlKeywords != null && urlKeywords.containsUrl(stemmed)) {
flags |= WordFlags.UrlPath.asBit();
}
if (urlKeywords.containsDomain(stemmed)) {
if (urlKeywords != null && urlKeywords.containsDomain(stemmed)) {
flags |= WordFlags.UrlDomain.asBit();
}

View File

@@ -0,0 +1,52 @@
package nu.marginalia.keyword;
import nu.marginalia.keyword.model.DocumentWordSpan;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.sentence.tag.HtmlTag;
import java.util.ArrayList;
import java.util.List;
/**
 * Collects the word-position spans covered by one particular {@link HtmlTag}.
 * <p>
 * Sentences are fed in one at a time through {@link #update}; a span is opened at the
 * first position where the tag applies and emitted as a {@link DocumentWordSpan} once a
 * sentence without the tag is seen, or when the span is closed explicitly.
 * A {@code start} value of 0 means "no span is currently open".
 */
class SpanRecorder {
    private final List<DocumentWordSpan> spans = new ArrayList<>();
    private final HtmlTag htmlTag;
    private int start = 0;

    public SpanRecorder(HtmlTag htmlTag) {
        this.htmlTag = htmlTag;
    }

    /**
     * Registers a sentence beginning at position {@code pos}.
     *
     * @param sentence the sentence whose tags are inspected
     * @param pos      position of the sentence; must be positive since 0 is the "no span" marker
     */
    public void update(DocumentSentence sentence, int pos) {
        assert pos > 0;

        // An untagged sentence is treated as belonging to BODY
        boolean tagApplies = sentence.htmlTags.contains(htmlTag)
                || (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY);

        if (!tagApplies) {
            endCurrentSpan(pos);
            return;
        }

        if (start <= 0) {
            start = pos;
        }
    }

    /** Closes the open span, if there is one, using {@code pos} as its end. */
    public void endCurrentSpan(int pos) {
        if (start <= 0) {
            return;
        }

        spans.add(new DocumentWordSpan(htmlTag, start, pos));
        start = 0;
    }

    /**
     * Closes any span that is still open, using {@code length} as its end.
     *
     * @return all spans recorded so far (the internal list, not a copy)
     */
    public List<DocumentWordSpan> finish(int length) {
        endCurrentSpan(length);
        return spans;
    }
}

View File

@@ -2,11 +2,11 @@ package nu.marginalia.keyword.extractors;
import it.unimi.dsi.fastutil.objects.Object2IntMap;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.keyword.WordReps;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.pos.PosPatternCategory;
import java.util.*;
import java.util.stream.Collectors;
@@ -16,12 +16,14 @@ public class NameLikeKeywords implements WordReps {
private final List<WordRep> nameWords;
private final Set<String> stemmed;
public NameLikeKeywords(KeywordExtractor keywordExtractor, DocumentLanguageData dld, int minCount) {
var counts = new Object2IntOpenHashMap<String>(100);
var instances = new HashMap<String, HashSet<WordRep>>(100);
public NameLikeKeywords(DocumentLanguageData dld, int minCount) {
LanguageDefinition languageDefinition = dld.language();
Object2IntOpenHashMap<String> counts = new Object2IntOpenHashMap<String>(100);
HashMap<String, HashSet<WordRep>> instances = new HashMap<String, HashSet<WordRep>>(100);
for (DocumentSentence sent : dld) {
var keywords = keywordExtractor.getProperNames(sent);
var keywords = languageDefinition.matchGrammarPattern(sent, PosPatternCategory.NAME);
for (var span : keywords) {
if (span.size() <= 1 && sent.isAllCaps(span.start))
continue;

View File

@@ -1,11 +1,11 @@
package nu.marginalia.keyword.extractors;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.keyword.WordReps;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.model.WordSpan;
import nu.marginalia.language.pos.PosPatternCategory;
import org.apache.commons.lang3.StringUtils;
import java.util.*;
@@ -23,25 +23,18 @@ public class SubjectLikeKeywords implements WordReps {
// Greeks bearing gifts -> Greeks
// Steve McQueen drove fast | cars -> Steve McQueen
public SubjectLikeKeywords(KeywordExtractor keywordExtractor,
WordsTfIdfCounts tfIdfCounts,
public SubjectLikeKeywords(WordsTfIdfCounts tfIdfCounts,
DocumentLanguageData dld) {
LanguageDefinition languageDefinition = dld.language();
Map<String, Set<WordRep>> instances = new HashMap<>();
for (var sentence : dld) {
for (WordSpan kw : keywordExtractor.getNouns(sentence)) {
if (kw.end + 2 >= sentence.length()) {
continue;
}
if (sentence.isSeparatorComma(kw.end) || sentence.isSeparatorComma(kw.end + 1))
for (WordSpan kw : languageDefinition.matchGrammarPattern(sentence, PosPatternCategory.NOUN)) {
if (sentence.nextCommaPos(kw.end - 1) <= kw.end)
continue;
String nextTag = sentence.posTags[kw.end];
String nextNextTag = sentence.posTags[kw.end+1];
if (isVerb(nextTag) && isDetOrAdverbOrVerbOrNoun(nextNextTag)) {
if (languageDefinition.matchGrammarPattern(sentence, PosPatternCategory.SUBJECT_SUFFIX, kw.end)) {
var span = new WordSpan(kw.start, kw.end);
var rep = new WordRep(sentence, span);
@@ -94,17 +87,4 @@ public class SubjectLikeKeywords implements WordReps {
return tfIdfCounts.getTfIdf(stemmed);
}
private boolean isDetOrAdverbOrVerbOrNoun(String posTag) {
return "DT".equals(posTag) // determinant
|| posTag.startsWith("RB") // adverb
|| posTag.startsWith("VB") // verb
|| posTag.startsWith("JJ") // adjective
|| posTag.startsWith("P")
|| posTag.startsWith("NN");
}
boolean isVerb(String posTag) {
return posTag.startsWith("VB")
&& !posTag.equals("VB"); // not interested in the infinitive
}
}

View File

@@ -1,8 +1,7 @@
package nu.marginalia.keyword.extractors;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.keyword.WordReps;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.sentence.tag.HtmlTag;
@@ -15,10 +14,12 @@ public class TitleKeywords implements WordReps {
private final Set<WordRep> titleKeywords;
private final Set<String> stemmed;
public TitleKeywords(KeywordExtractor keywordExtractor, DocumentLanguageData documentLanguageData) {
titleKeywords = documentLanguageData.findSentencesForTag(HtmlTag.TITLE).stream()
public TitleKeywords(DocumentLanguageData dld) {
LanguageDefinition languageDefinition = dld.language();
titleKeywords = dld.findSentencesForTag(HtmlTag.TITLE).stream()
.flatMap(sent ->
keywordExtractor.getWordsFromSentence(sent).stream().sorted().distinct().map(w -> new WordRep(sent, w)))
languageDefinition.getWordsFromSentence(sent).stream().sorted().distinct().map(w -> new WordRep(sent, w)))
.limit(100)
.collect(Collectors.toSet());

View File

@@ -1,4 +1,4 @@
package nu.marginalia.keyword;
package nu.marginalia.keyword.extractors;
import nu.marginalia.language.model.WordRep;

View File

@@ -1,10 +1,10 @@
package nu.marginalia.keyword.extractors;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.keyword.WordReps;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.pos.PosPatternCategory;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.apache.commons.lang3.StringUtils;
@@ -26,14 +26,13 @@ public class WordsTfIdfCounts implements WordReps, Comparator<WordRep> {
private final Set<WordRep> tfIdfHigh;
public WordsTfIdfCounts(TermFrequencyDict dict,
KeywordExtractor keywordExtractor,
DocumentLanguageData dld) {
this.dict = dict;
this.docCount = dict.docCount();
this.tfIdf = new Object2IntOpenHashMap<>(10_000);
this.tfIdfHigh = new HashSet<>(100);
var counts = getCounts(keywordExtractor, dld);
var counts = getCounts(dld);
int maxVal = maxValue(counts);
Set<String> highTfIdfInstances = new HashSet<>();
@@ -48,9 +47,10 @@ public class WordsTfIdfCounts implements WordReps, Comparator<WordRep> {
// Collect words with a high TF-IDF so that they can be marked with a bit flag
tfIdfHigh = new HashSet<>(100);
LanguageDefinition languageDefinition = dld.language();
for (var sent : dld) {
var keywords = keywordExtractor.getKeywordsFromSentence(sent);
var keywords = languageDefinition.matchGrammarPattern(sent, PosPatternCategory.KEYWORD);
for (var span : keywords) {
if (highTfIdfInstances.contains(sent.constructStemmedWordFromSpan(span))) {
tfIdfHigh.add(new WordRep(sent, span));
@@ -60,12 +60,14 @@ public class WordsTfIdfCounts implements WordReps, Comparator<WordRep> {
}
private Object2IntOpenHashMap<String> getCounts(KeywordExtractor keywordExtractor, DocumentLanguageData dld) {
private Object2IntOpenHashMap<String> getCounts(DocumentLanguageData dld) {
LanguageDefinition languageDefinition = dld.language();
Object2IntOpenHashMap<String> counts = new Object2IntOpenHashMap<>(10_000, 0.7f);
counts.defaultReturnValue(0);
for (var sent : dld) {
var keywords = keywordExtractor.getKeywordsFromSentence(sent);
var keywords = languageDefinition.matchGrammarPattern(sent, PosPatternCategory.KEYWORD);
for (var span : keywords) {
counts.addTo(sent.constructStemmedWordFromSpan(span), 1);
}

View File

@@ -0,0 +1,23 @@
package nu.marginalia.keyword.model;
import nu.marginalia.sequence.VarintCodedSequence;
import java.util.List;
/**
 * Encoded keyword data for a single document.
 *
 * @param keywords      the keyword strings
 * @param metadata      metadata bytes for the keywords (presumably index-aligned with
 *                      {@code keywords} -- confirm against the builder)
 * @param positions     coded position sequences for the keywords (presumably index-aligned
 *                      with {@code keywords} -- confirm against the builder)
 * @param spanCodes     tag codes of the recorded spans
 * @param spanSequences coded span position sequences
 */
public record DocumentKeywords(
        List<String> keywords,
        byte[] metadata,
        List<VarintCodedSequence> positions,
        byte[] spanCodes,
        List<VarintCodedSequence> spanSequences)
{
    /** @return the number of keywords */
    public int size() {
        return keywords().size();
    }

    /** @return true if there are no keywords */
    public boolean isEmpty() {
        return keywords().isEmpty();
    }
}

View File

@@ -5,13 +5,11 @@ import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.objects.Object2ByteOpenHashMap;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.model.idx.CodedWordSpan;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.sequence.VarintCodedSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.ByteBuffer;
import java.util.*;
public class DocumentKeywordsBuilder {
@@ -29,6 +27,7 @@ public class DocumentKeywordsBuilder {
// be plenty. The lexicon writer has another limit that's higher.
private final int MAX_WORD_LENGTH = 64;
private final int MAX_POSITIONS_PER_WORD = 512;
private final int MAX_SPANS_PER_TYPE = 8192;
private static final Logger logger = LoggerFactory.getLogger(DocumentKeywordsBuilder.class);
@@ -36,13 +35,22 @@ public class DocumentKeywordsBuilder {
this(1600);
}
public DocumentKeywords build(ByteBuffer workArea) {
public DocumentKeywordsBuilder(int capacity) {
wordToMeta = new Object2ByteOpenHashMap<>(capacity);
wordToPos = new HashMap<>(capacity);
}
public DocumentKeywords build() {
final List<String> wordArray = new ArrayList<>(wordToMeta.size());
final TByteArrayList meta = new TByteArrayList(wordToMeta.size());
final List<VarintCodedSequence> positions = new ArrayList<>(wordToMeta.size());
final List<VarintCodedSequence> spanSequences = new ArrayList<>(wordSpans.size());
final byte[] spanCodes = new byte[wordSpans.size()];
var iter = wordToMeta.object2ByteEntrySet().fastIterator();
// Encode positions
while (iter.hasNext()) {
var entry = iter.next();
@@ -59,27 +67,26 @@ public class DocumentKeywordsBuilder {
}
// Encode spans
List<CodedWordSpan> spans = new ArrayList<>(wordSpans.size());
wordSpans.forEach((tag, spansForTag) -> {
spansForTag.sort(Comparator.comparingInt(DocumentWordSpan::start));
var positionsForTag = new IntArrayList(spansForTag.size() * 2);
for (var span : spansForTag) {
positionsForTag.add(span.start());
positionsForTag.add(span.end());
if (positionsForTag.size() >= MAX_SPANS_PER_TYPE)
break;
}
spans.add(new CodedWordSpan(tag.code, VarintCodedSequence.generate(positionsForTag)));
spanCodes[spanSequences.size()] = tag.code;
spanSequences.add(VarintCodedSequence.generate(positionsForTag));
});
return new DocumentKeywords(wordArray, meta.toArray(), positions, spans);
return new DocumentKeywords(wordArray, meta.toArray(), positions, spanCodes, spanSequences);
}
public DocumentKeywordsBuilder(int capacity) {
wordToMeta = new Object2ByteOpenHashMap<>(capacity);
wordToPos = new HashMap<>(capacity);
}
public void addMeta(String word, byte meta) {
if (word.length() > MAX_WORD_LENGTH)
@@ -113,6 +120,13 @@ public class DocumentKeywordsBuilder {
newWords.forEach(word -> wordToMeta.putIfAbsent(word, meta));
}
public void addSyntheticTerm(String newWord) {
byte meta = WordFlags.Synthetic.asBit();
wordToMeta.putIfAbsent(newWord, meta);
}
public List<String> getWordsWithAnyFlag(long flags) {
List<String> ret = new ArrayList<>();
@@ -167,6 +181,4 @@ public class DocumentKeywordsBuilder {
return this.importantWords;
}
public record DocumentWordSpan(HtmlTag tag, int start, int end) {
}
}

View File

@@ -0,0 +1,6 @@
package nu.marginalia.keyword.model;
import nu.marginalia.language.sentence.tag.HtmlTag;
/**
 * A span of word positions in a document associated with an HTML tag.
 *
 * @param tag   the tag the span belongs to
 * @param start position where the span begins
 * @param end   position where the span ends (NOTE(review): whether this bound is
 *              inclusive or exclusive is not visible here -- confirm against the consumers)
 */
public record DocumentWordSpan(HtmlTag tag, int start, int end) {
}

View File

@@ -0,0 +1,157 @@
package nu.marginalia.language;
import io.jooby.Context;
import io.jooby.Jooby;
import io.jooby.MapModelAndView;
import io.jooby.ModelAndView;
import nu.marginalia.LanguageModels;
import nu.marginalia.WmsaHome;
import nu.marginalia.keyword.extractors.*;
import nu.marginalia.language.config.LanguageConfigLocation;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
/**
 * Small Jooby web application for inspecting the output of the language processing code.
 * <p>
 * It serves a form at {@code /}; when a text sample is posted, the sample is run through
 * sentence extraction and the keyword extractors, and the results are rendered with the
 * {@code keywords.jte} template.
 * <p>
 * Templates and static assets are looked up under {@code code/functions/language-processing/}
 * relative to the working directory, so the tool is expected to be started from the
 * repository root.
 */
public class LanguageProcessingTool extends Jooby {
    private static final Logger logger = LoggerFactory.getLogger(LanguageProcessingTool.class);

    /** Colour part of the style class, selected by the low three bits of a tag's rank. */
    private static final String[] STYLE_HUES =
            { "red", "green", "blue", "yellow", "purple", "cyan", "pink", "gray" };

    /**
     * Shade part of the style class, selected by {@code (rank / 8) & 3}.
     * NOTE(review): "750" is not part of Tailwind's default palette; this assumes the
     * stylesheet in use defines it -- confirm.
     */
    private static final String[] STYLE_SHADES = { "900", "750", "400", "500" };

    private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
    private final TermFrequencyDict termFrequencyDict;

    static void main(String[] args) {
        Jooby.runApp(args, LanguageProcessingTool::new);
    }

    /**
     * Loads the language models and the experimental language configuration, then
     * registers the template engine, the static assets and the routes.
     *
     * @throws RuntimeException wrapping any failure during initialization
     */
    public LanguageProcessingTool() {
        try {
            LanguageModels languageModels = getLanguageModels();

            termFrequencyDict = new TermFrequencyDict(languageModels);
            sentenceExtractorProvider = new ThreadLocalSentenceExtractorProvider(
                    new LanguageConfiguration(languageModels, new LanguageConfigLocation.Experimental()),
                    languageModels
            );

            Path basePath = Path.of("code/functions/language-processing/").toAbsolutePath();
            logger.info("Base path: {}", basePath);

            // Templates and assets are optional; they are only wired up when present on disk
            if (Files.exists(basePath.resolve("resources/ltt/jte")))
                install(new nu.marginalia.service.server.jte.JteModule(basePath.resolve("resources/ltt/jte")));
            if (Files.exists(basePath.resolve("resources/ltt/static")))
                assets("/*", basePath.resolve("resources/ltt/static"));

            get("/", this::handleKeywords);
            post("/", this::handleKeywords);
        }
        catch (Exception ex) {
            logger.error("Failed to initialize LanguageProcessingTool", ex);
            throw new RuntimeException(ex);
        }
    }

    /**
     * Handles both GET and POST requests to {@code /}.
     * <p>
     * GET renders the empty form. POST reads the {@code textSample} form field, extracts
     * sentences and keywords from it, and renders the results.
     *
     * @throws IllegalArgumentException if the request method is neither GET nor POST
     */
    @NotNull
    private ModelAndView<?> handleKeywords(Context context) {
        if ("GET".equals(context.getMethod())) {
            return new MapModelAndView("keywords.jte")
                    .put("textSample", "");
        }
        else if (!"POST".equals(context.getMethod())) {
            throw new IllegalArgumentException("Invalid method");
        }

        String textSample = context.form("textSample").value();
        DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(textSample);
        Map<Long, String> posStyles = posTagStyles(dld);

        // URL keywords are not extracted here, as a pasted text sample has no URL
        var tfIdfCounts = new WordsTfIdfCounts(termFrequencyDict, dld);
        var titleKeywords = new TitleKeywords(dld);
        var nameLikeKeywords = new NameLikeKeywords(dld, 2);
        var subjectLikeKeywords = new SubjectLikeKeywords(tfIdfCounts, dld);
        var artifactKeywords = new ArtifactKeywords(dld);

        return new MapModelAndView("keywords.jte")
                .put("textSample", textSample)
                .put("language", dld.language())
                .put("tagColors", posStyles)
                .put("sentences", dld.sentences())
                .put("tfIdfReps", tfIdfCounts.getReps())
                .put("titleReps", titleKeywords.getReps())
                .put("nameLikeReps", nameLikeKeywords.getReps())
                .put("subjectLikeReps", subjectLikeKeywords.getReps())
                .put("artifacts", artifactKeywords.getWords());
    }

    /**
     * Assigns a text style class of the form {@code text-<colour>-<shade>} to each
     * POS tag that occurs in the document.
     *
     * @param dld the document whose sentences' POS tags are to be styled
     * @return a map from POS tag to style class name
     */
    public static Map<Long, String> posTagStyles(DocumentLanguageData dld) {
        Map<Long, String> styles = new HashMap<>();

        // we sort them first to ensure the most common tags are guaranteed to have
        // the largest difference between colors
        Map<Long, Integer> counts = new HashMap<>();
        for (var sentence : dld.sentences()) {
            for (var tag : sentence.posTags) {
                counts.merge(tag, 1, Integer::sum);
            }
        }

        List<Long> posTagsByCount = counts
                .entrySet().stream()
                .sorted(Map.Entry.comparingByValue(Comparator.reverseOrder()))
                .map(Map.Entry::getKey)
                .toList();

        for (int i = 0; i < posTagsByCount.size(); i++) {
            // The colour cycles every 8 tags, the shade steps every 8 tags and cycles every 32
            String style = "text-" + STYLE_HUES[i & 0x7] + "-" + STYLE_SHADES[(i / 8) & 3];
            styles.put(posTagsByCount.get(i), style);
        }

        return styles;
    }

    private static final Path LANGUAGE_MODELS_DEFAULT = WmsaHome.getHomePath().resolve("model");

    /**
     * Resolves the directory holding the language models: the {@code LANGUAGE_MODELS_HOME}
     * environment variable if set, otherwise the {@code model} directory under the WMSA home.
     *
     * @throws IllegalStateException if the resolved path is not a directory
     */
    private static Path getLanguageModelsPath() {
        final Path languageModelsHome = Optional.ofNullable(System.getenv("LANGUAGE_MODELS_HOME"))
                .map(Path::of)
                .orElse(LANGUAGE_MODELS_DEFAULT);

        if (!Files.isDirectory(languageModelsHome)) {
            throw new IllegalStateException("Could not find $LANGUAGE_MODELS_HOME, see doc/language-models.md");
        }

        return languageModelsHome;
    }

    /** Builds the {@link LanguageModels} descriptor from files in the language models directory. */
    private static LanguageModels getLanguageModels() {
        var languageModelsHome = getLanguageModelsPath();

        return new LanguageModels(
                languageModelsHome.resolve("tfreq-new-algo3.bin"),
                languageModelsHome.resolve("opennlp-sentence.bin"),
                languageModelsHome.resolve("English.RDR"),
                languageModelsHome.resolve("English.DICT"),
                languageModelsHome.resolve("lid.176.ftz"),
                languageModelsHome.resolve("segments.bin")
        );
    }
}

View File

@@ -0,0 +1,43 @@
package nu.marginalia.language.config;
import nu.marginalia.WmsaHome;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
/**
 * Strategy for locating the language configuration XML.
 */
public sealed interface LanguageConfigLocation {

    /**
     * Opens the language configuration.
     *
     * @return a stream over the configuration, or {@code null} when a classpath resource
     *         is looked up and cannot be found
     * @throws IOException if the configuration file cannot be opened
     */
    InputStream findLanguageConfiguration() throws IOException;

    /**
     * Prefers the configuration file on the filesystem if it exists; otherwise falls back
     * to a bundled resource, selected by the {@code language.experimental} system property.
     */
    final class Auto implements LanguageConfigLocation {
        @Override
        public InputStream findLanguageConfiguration() throws IOException {
            final Path filesystemPath = WmsaHome.getLangugeConfig();

            if (Files.exists(filesystemPath)) {
                return Files.newInputStream(filesystemPath, StandardOpenOption.READ);
            }

            final String resourceName = Boolean.getBoolean("language.experimental")
                    ? "languages-experimental.xml"
                    : "languages-default.xml";

            return ClassLoader.getSystemResourceAsStream(resourceName);
        }
    }

    /** Always uses the bundled {@code languages-experimental.xml} resource. */
    final class Experimental implements LanguageConfigLocation {
        @Override
        public InputStream findLanguageConfiguration() throws IOException {
            final String resourceName = "languages-experimental.xml";
            return ClassLoader.getSystemResourceAsStream(resourceName);
        }
    }

    /** Always uses the bundled {@code languages-default.xml} resource. */
    final class Default implements LanguageConfigLocation {
        @Override
        public InputStream findLanguageConfiguration() throws IOException {
            final String resourceName = "languages-default.xml";
            return ClassLoader.getSystemResourceAsStream(resourceName);
        }
    }
}

View File

@@ -0,0 +1,405 @@
package nu.marginalia.language.config;
import com.github.jfasttext.JFastText;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.LanguageModels;
import nu.marginalia.WmsaHome;
import nu.marginalia.language.encoding.UnicodeNormalization;
import nu.marginalia.language.keywords.KeywordHasher;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.pos.PosPattern;
import nu.marginalia.language.pos.PosPatternCategory;
import nu.marginalia.language.pos.PosTagger;
import nu.marginalia.language.stemming.Stemmer;
import org.jsoup.nodes.TextNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import javax.annotation.Nullable;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.nio.file.StandardOpenOption;
import java.security.DigestInputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.*;
@Singleton
public class LanguageConfiguration {
private static final Logger logger = LoggerFactory.getLogger(LanguageConfiguration.class);
private final Map<String, Path> resources = new HashMap<>();
private final Map<String, LanguageDefinition> languages = new LinkedHashMap<>();
private final JFastText fastTextLanguageModel = new JFastText();
/**
 * Identifies the language of an HTML document from a sample of its body text.
 * <p>
 * Text nodes are appended to the sample until it has grown past 4096 characters;
 * the check happens before each append, so the sample may exceed that size.
 *
 * @param jsoupDoc the parsed document
 * @return the configured language matching the prediction, or empty if there is none
 */
public Optional<LanguageDefinition> identifyLanguage(org.jsoup.nodes.Document jsoupDoc) {
    final StringBuilder textSample = new StringBuilder();

    jsoupDoc.body().traverse((node, _) -> {
        if (textSample.length() <= 4096 && node instanceof TextNode textNode) {
            textSample.append(' ').append(textNode.text());
        }
    });

    return identifyLanguage(textSample.toString());
}
/**
 * Identifies the language of a text sample using the fastText language model.
 *
 * <p>Only predictions of the exact shape {@code __label__??} (a two character code) are
 * accepted; the code is then looked up among the configured languages.
 *
 * @param sample the text to classify
 * @return the configured language for the predicted code, or empty when the model gives no
 *         prediction, the label has another shape, or the code is not configured
 */
public Optional<LanguageDefinition> identifyLanguage(String sample) {
    String label = fastTextLanguageModel.predict(sample);

    if (label == null || label.length() != "__label__??".length()) {
        return Optional.empty();
    }

    return Optional.ofNullable(getLanguage(label.substring("__label__".length())));
}
/**
 * Identifies the language of a text sample, falling back to a given language code when
 * identification yields nothing.
 *
 * @param sample the text to classify
 * @param fallbackIsoCode language code looked up when the sample cannot be identified
 * @return the identified language, else the fallback language if it is configured, else empty
 */
public Optional<LanguageDefinition> identifyLanguage(String sample, String fallbackIsoCode) {
    Optional<LanguageDefinition> detected = identifyLanguage(sample);
    if (detected.isPresent()) {
        return detected;
    }
    return Optional.ofNullable(getLanguage(fallbackIsoCode));
}
/** Returns a new list holding the configured language definitions; the list is a copy,
 * so modifying it does not affect this configuration. */
public List<LanguageDefinition> languages() {
return new ArrayList<>(this.languages.values());
}
/** Returns a read-only view of the configured languages, keyed by the code they were
 * registered under. */
public Map<String, LanguageDefinition> languagesMap() {
return Collections.unmodifiableMap(languages);
}
/** Looks up a configured language by its code.
 *
 * @param language the key to look up; the lookup is an exact map lookup with no case folding
 * @return the language definition, or null if no language is registered under that key
 */
@Nullable
public LanguageDefinition getLanguage(String language) {
return languages.get(language);
}
/** Creates a configuration from the language models reported by {@link WmsaHome} and an
 * automatically located language configuration file. */
@Inject
public LanguageConfiguration() throws IOException, ParserConfigurationException, SAXException {
    this(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Auto());
}

/** Creates a configuration from the language models reported by {@link WmsaHome} and the
 * given language configuration location. */
public LanguageConfiguration(LanguageConfigLocation languageFile) throws IOException, ParserConfigurationException, SAXException {
    this(WmsaHome.getLanguageModels(), languageFile);
}

/**
 * Loads the fastText language identification model, then parses the language XML:
 * first its resource declarations, then its language definitions.
 *
 * @param lm language model locations; only the fastText language model path is read here
 * @param languageFile provider of the XML configuration stream
 * @throws IllegalStateException if the configuration stream could not be found
 */
public LanguageConfiguration(LanguageModels lm, LanguageConfigLocation languageFile)
        throws IOException, ParserConfigurationException, SAXException {
    fastTextLanguageModel.loadModel(lm.fasttextLanguageModel.toString());

    try (var configStream = languageFile.findLanguageConfiguration()) {
        if (null == configStream) {
            throw new IllegalStateException("languages-default.xml resource not found in classpath");
        }

        // NOTE(review): the parser runs with default settings (DTDs and external entities
        // are not restricted); fine for a trusted local file, worth confirming.
        Document doc = DocumentBuilderFactory.newInstance()
                .newDocumentBuilder()
                .parse(configStream);

        // Resources must be registered before languages, since language tags refer to them by id
        parseResources(doc);
        parseLanguages(doc);
    }

    logger.info("Loaded language configuration: {}", languages);
}
/**
 * Reads every {@code <language>} element of the configuration and registers a
 * {@link LanguageDefinition} for each one that is not marked {@code disabled="true"}.
 *
 * <p>A language whose setup fails with an {@link IOException} is logged and skipped;
 * configuration errors signalled with unchecked exceptions propagate to the caller.
 *
 * @param doc the parsed language configuration document
 */
private void parseLanguages(Document doc) {
    NodeList languageNodes = doc.getElementsByTagName("language");

    for (int i = 0; i < languageNodes.getLength(); i++) {
        Element languageTag = (Element) languageNodes.item(i);

        boolean disabled = "TRUE".equalsIgnoreCase(languageTag.getAttribute("disabled"));
        if (disabled)
            continue;

        // Locale.ROOT keeps the key stable regardless of the JVM default locale
        // (e.g. a Turkish locale would lower-case 'I' to a dotless 'ı')
        String isoCode = languageTag.getAttribute("isoCode").toLowerCase(Locale.ROOT);
        String name = languageTag.getAttribute("name");

        try {
            PosTagger posTagger = parsePosTag(languageTag, isoCode);
            Stemmer stemmer = parseStemmerTag(languageTag, posTagger, isoCode);
            KeywordHasher keywordHasher = parseHasherTag(languageTag, isoCode);
            Map<PosPatternCategory, List<PosPattern>> posPatterns =
                    parsePosPatterns(posTagger, languageTag, isoCode);
            UnicodeNormalization unicodeNormalization = parseUnicodeNormalization(languageTag, isoCode);

            languages.put(isoCode,
                    new LanguageDefinition(isoCode, name, stemmer, unicodeNormalization, keywordHasher, posTagger, posPatterns));
        }
        catch (IOException ex) {
            logger.error("Failed to set up language {}", isoCode, ex);
        }
    }
}
/**
 * Selects the unicode normalization strategy for a language from its first
 * {@code <unicodeNormalization algorithm="...">} element.
 *
 * @param languageTag the {@code <language>} element being parsed
 * @param isoCode language code, used in error messages only
 * @return the configured strategy; quote-only normalization when the element is absent
 * @throws IllegalArgumentException if the algorithm attribute is not a known algorithm name
 */
private UnicodeNormalization parseUnicodeNormalization(Element languageTag, String isoCode) {
    NodeList normalizationTags = languageTag.getElementsByTagName("unicodeNormalization");
    if (normalizationTags.getLength() == 0)
        return new UnicodeNormalization.JustNormalizeQuotes();

    Element normalizationTag = (Element) normalizationTags.item(0);
    String algorithm = normalizationTag.getAttribute("algorithm");

    return switch(algorithm) {
        case "minimal" -> new UnicodeNormalization.JustNormalizeQuotes();
        case "e-accents" -> new UnicodeNormalization.FlattenEAccents();
        case "german" -> new UnicodeNormalization.Flattenß();
        case "maximal-latin" -> new UnicodeNormalization.FlattenAllLatin();
        default -> throw new IllegalArgumentException("Invalid algorithm " + algorithm + " on language configuration for " + isoCode);
    };
}
/**
 * Collects the part-of-speech patterns declared in a language's {@code <ngrams>} elements,
 * grouped by the category named in each element's {@code type} attribute.
 *
 * @param posTagger the language's tagger; without one no patterns can be built
 * @param languageTag the {@code <language>} element being parsed
 * @param isoCode language code, used in error messages only
 * @return patterns per category; an empty immutable map when there is no tagger
 * @throws IllegalArgumentException if an ngrams type is not recognized
 */
private Map<PosPatternCategory, List<PosPattern>> parsePosPatterns(@Nullable PosTagger posTagger,
                                                                   Element languageTag, String isoCode) {
    if (posTagger == null) {
        return Map.of();
    }

    Map<PosPatternCategory, List<PosPattern>> patternsByCategory = new HashMap<>();
    NodeList ngramsNodes = languageTag.getElementsByTagName("ngrams");

    for (int n = 0; n < ngramsNodes.getLength(); n++) {
        Element ngramsTag = (Element) ngramsNodes.item(n);
        String type = ngramsTag.getAttribute("type");

        PosPatternCategory category = switch (type) {
            case "name" -> PosPatternCategory.NAME;
            case "noun" -> PosPatternCategory.NOUN;
            case "keyword" -> PosPatternCategory.KEYWORD;
            case "title" -> PosPatternCategory.TITLE;
            case "subject-suffix" -> PosPatternCategory.SUBJECT_SUFFIX;
            default -> throw new IllegalArgumentException("Invalid ngrams type in " + isoCode + ", what is '" + type + "'?");
        };

        NodeList patternNodes = ngramsTag.getElementsByTagName("pospattern");
        if (patternNodes.getLength() == 0) {
            // A category only gets an entry once it has at least one pattern
            continue;
        }

        List<PosPattern> bucket = patternsByCategory.computeIfAbsent(category, k -> new ArrayList<>());
        for (int p = 0; p < patternNodes.getLength(); p++) {
            Element patternTag = (Element) patternNodes.item(p);
            bucket.add(new PosPattern(posTagger, patternTag.getTextContent()));
        }
    }

    return patternsByCategory;
}
/**
 * Builds the part-of-speech tagger for a language from its {@code <rdrTagger>} element.
 *
 * <p>The element's {@code dictId} and {@code rdrId} attributes are resolved against the
 * resources registered earlier from the same document.
 *
 * @param languageTag the {@code <language>} element being parsed
 * @param isoCode language code, passed to the tagger and used in error messages
 * @return the tagger, or null when the language declares no rdrTagger
 * @throws IllegalStateException if more than one rdrTagger is declared
 * @throws IllegalArgumentException if either resource id is not a registered resource
 * @throws IOException if the tagger cannot be loaded
 */
@Nullable
private PosTagger parsePosTag(Element languageTag, String isoCode) throws IOException {
    NodeList rdrElements = languageTag.getElementsByTagName("rdrTagger");
    if (rdrElements.getLength() < 1) {
        return null;
    }
    else if (rdrElements.getLength() > 1) {
        throw new IllegalStateException("Multiple rdr taggers defined in " + isoCode);
    }

    Element rdrElement = (Element) rdrElements.item(0);
    String dictId = rdrElement.getAttribute("dictId");
    String rdrId = rdrElement.getAttribute("rdrId");

    Path dictPath = resources.get(dictId);
    Path rdrPath = resources.get(rdrId);

    if (null == dictPath)
        throw new IllegalArgumentException("language.xml: dictPath id " + dictId
                + " does not map to a resource in " + isoCode);
    if (null == rdrPath)
        // Report the id that actually failed to resolve (previously this printed dictId)
        throw new IllegalArgumentException("language.xml: rdrPath id " + rdrId
                + " does not map to a resource in " + isoCode);

    return new PosTagger(isoCode, dictPath, rdrPath);
}
/**
 * Selects the keyword hashing algorithm for a language from its {@code <keywordHash>} element.
 *
 * @param languageElement the {@code <language>} element being parsed
 * @param isoCode language code, used in error messages only
 * @return the hasher named by the element's {@code algorithm} attribute
 * @throws IllegalArgumentException unless exactly one keywordHash element exists and it
 *         names a known algorithm
 */
private KeywordHasher parseHasherTag(Element languageElement, String isoCode) {
    NodeList hasherNodes = languageElement.getElementsByTagName("keywordHash");

    if (hasherNodes.getLength() != 1) {
        throw new IllegalArgumentException(
                "language.xml: No keywordHasher block for language element " + isoCode);
    }

    String algorithm = ((Element) hasherNodes.item(0)).getAttribute("algorithm");

    if ("asciish".equals(algorithm)) {
        return new KeywordHasher.AsciiIsh();
    }
    if ("utf8".equals(algorithm)) {
        return new KeywordHasher.Utf8();
    }

    throw new IllegalArgumentException(
            "language.xml: Unknown keywordHash name " + algorithm + " in " + isoCode);
}
/**
 * Builds the stemmer for a language from its {@code <stemmer>} element.
 *
 * <p>The element may contain a {@code <pospattern>}; only the first one is used, and it is
 * handed to the stemmer as an inclusion pattern. Such a pattern can only be compiled when
 * the language has a part-of-speech tagger.
 *
 * @param languageElement the {@code <language>} element being parsed
 * @param posTagger the language's tagger, or null if it has none
 * @param isoCode language code, used in error messages only
 * @return the configured stemmer
 * @throws IllegalArgumentException unless exactly one stemmer element exists, if it names
 *         an unknown algorithm, or if it declares a pospattern without a tagger
 */
private Stemmer parseStemmerTag(Element languageElement, @Nullable PosTagger posTagger, String isoCode) {
    NodeList stemmerElements = languageElement.getElementsByTagName("stemmer");
    if (stemmerElements.getLength() != 1) {
        throw new IllegalArgumentException(
                "language.xml: No stemmer block for language element " + isoCode);
    }

    Element stemmerElement = (Element) stemmerElements.item(0);
    String stemmerName = stemmerElement.getAttribute("algorithm");
    String stemmerVariant = stemmerElement.getAttribute("variant");

    PosPattern inclusionPattern = null;
    NodeList posPatternList = stemmerElement.getElementsByTagName("pospattern");
    if (posPatternList.getLength() >= 1) {
        if (posTagger == null) {
            // Fail with a configuration error instead of a bare NullPointerException
            // from inside the pattern compiler
            throw new IllegalArgumentException(
                    "language.xml: stemmer pospattern requires an rdrTagger in " + isoCode);
        }
        Element posElement = (Element) posPatternList.item(0);
        inclusionPattern = new PosPattern(posTagger, posElement.getTextContent());
    }

    // Locale.ROOT: algorithm names are identifiers, not natural-language text
    return switch (stemmerName.toLowerCase(Locale.ROOT)) {
        case "porter" -> new Stemmer.Porter(inclusionPattern);
        case "snowball" -> new Stemmer.Snowball(stemmerVariant, inclusionPattern);
        case "none" -> new Stemmer.NoOpStemmer();
        default -> throw new IllegalArgumentException(
                "language.xml: Unknown stemmer name " + stemmerName + " in " + isoCode);
    };
}
/**
 * Registers every {@code <resource>} element of the configuration, downloading the
 * resource first when the local copy is missing or fails its md5 check.
 *
 * <p>If a download fails, this method deliberately never returns: it logs once per second
 * until the thread is interrupted (see the comment in the body).
 *
 * @param doc the parsed language configuration document
 * @throws IOException if validating a resource fails
 * @throws IllegalStateException if two resources share an id
 * @throws IllegalArgumentException if a resource path escapes the data directory
 */
private void parseResources(Document doc) throws IOException {
    NodeList resourceNodes = doc.getElementsByTagName("resource");

    for (int i = 0; i < resourceNodes.getLength(); i++) {
        Element resourceTag = (Element) resourceNodes.item(i);

        String resourceId = resourceTag.getAttribute("id");
        String resourceMd5 = resourceTag.getAttribute("md5");
        Path resourcePath = WmsaHome.getDataPath().resolve(resourceTag.getAttribute("path"));
        String resourceHref = resourceTag.getAttribute("href");

        if (!validateResource(resourcePath, resourceMd5)) {
            boolean success;
            try {
                success = fetchResource(resourceHref, resourcePath, resourceMd5);
            } catch (URISyntaxException | IOException ex) {
                logger.error(ex.getMessage(), ex);
                success = false;
            }

            // It's likely if we were to just explode here, that a docker-compose restart:always
            // would put us in a loop that repeatedly fails to download the same file.
            // We'd like to avoid that by stalling and awaiting human intervention.
            while (!success) {
                logger.error("Stopping to prevent restart loop");
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException e) {
                    // Restore the interrupt flag so code further up the stack can see it
                    Thread.currentThread().interrupt();
                    throw new RuntimeException(e);
                }
            }
        }

        if (resources.put(resourceId, resourcePath) != null)
            throw new IllegalStateException(
                    "Resource with id " + resourceId + " already exists");
    }
}
/**
 * Downloads a resource to a temporary file, verifies its md5 sum when one is provided,
 * and moves it into place.
 *
 * @param resourceUrl URL to download from
 * @param resourcePath destination path; its parent directory is created if needed
 * @param resourceMd5 expected md5 sum as hex, or blank to skip verification
 * @return true if the file was downloaded, verified, and moved to {@code resourcePath};
 *         false on an I/O error or checksum mismatch (both are logged)
 * @throws IOException if the destination directory or the temporary file cannot be created
 * @throws URISyntaxException if {@code resourceUrl} is not a valid URI
 */
private boolean fetchResource(String resourceUrl, Path resourcePath, String resourceMd5)
        throws IOException, URISyntaxException {
    Path parentPath = resourcePath.getParent();
    if (!Files.isDirectory(parentPath)) {
        logger.info("Setting up directory {}", parentPath);
        Files.createDirectories(parentPath);
    }

    logger.info("Fetching {}", resourceUrl);
    URL url = new URI(resourceUrl).toURL();
    Path tempFile = Files.createTempFile("resource", "dat");

    HttpURLConnection conn = null;
    try {
        // Opened inside the try so that the temp file is cleaned up even if this fails
        conn = (HttpURLConnection) url.openConnection();

        // Close both streams before hashing and moving the file, so that the file is
        // complete on disk and not held open during the move
        try (InputStream is = conn.getInputStream();
             OutputStream os = Files.newOutputStream(tempFile, StandardOpenOption.WRITE,
                     StandardOpenOption.TRUNCATE_EXISTING)) {
            is.transferTo(os);
        }

        String actualMd5 = getFileMD5(tempFile);
        if (!resourceMd5.isBlank() && !Objects.equals(resourceMd5, actualMd5)) {
            logger.error("Freshly downloaded resource {} does not match md5sum {}", resourceUrl,
                    resourceMd5);
            return false;
        }

        logger.info("Downloaded resource {} to {} ** md5sum {}", resourceUrl, resourcePath,
                actualMd5);
        Files.move(tempFile, resourcePath, StandardCopyOption.REPLACE_EXISTING);
        return true;
    } catch (IOException ex) {
        logger.error("IOException", ex);
        return false;
    } finally {
        if (conn != null) {
            conn.disconnect();
        }
        // No-op after a successful move; removes the partial download otherwise
        Files.deleteIfExists(tempFile);
    }
}
/**
 * Checks whether a resource is already present locally with the expected checksum.
 *
 * @param resourcePath location of the resource; must stay inside the data directory
 * @param providedMd5Sum expected md5 sum as hex, or blank to accept any existing file
 * @return true if the file exists and either no md5 sum was provided or it matches
 * @throws IllegalArgumentException if the normalized path is outside the data directory
 */
private boolean validateResource(Path resourcePath, String providedMd5Sum) throws IOException {
    Path normalized = resourcePath.normalize();

    boolean insideDataDir = normalized.startsWith(WmsaHome.getDataPath());
    if (!insideDataDir) {
        throw new IllegalArgumentException(
                "Resource path has escaped $WMSA_HOME/data: " + normalized);
    }

    if (!Files.exists(normalized)) {
        logger.info("Resource path does not exist: " + normalized);
        return false;
    }

    String actualMd5 = getFileMD5(normalized);

    if (providedMd5Sum.isBlank()) {
        // Nothing to compare against; log the computed sum so it can be added to the config
        logger.info("No md5sum provided for resource path: {}, but was calculated to {}",
                normalized, actualMd5);
        return true;
    }

    boolean matches = Objects.equals(actualMd5, providedMd5Sum);
    if (!matches) {
        logger.error("MD5 checksum mismatch for {} -- {}", normalized, providedMd5Sum);
    }
    return matches;
}
/**
 * Computes the MD5 sum of a file.
 *
 * @param filePath the file to hash
 * @return the digest as 32 lower-case hexadecimal characters
 * @throws RuntimeException wrapping the underlying exception if the file cannot be read
 *         or the MD5 algorithm is unavailable
 */
public String getFileMD5(Path filePath) {
    try (InputStream in = Files.newInputStream(filePath)) {
        MessageDigest md5 = MessageDigest.getInstance("MD5");

        // Feed the file through the digest one chunk at a time
        byte[] chunk = new byte[8192];
        for (int n = in.read(chunk); n != -1; n = in.read(chunk)) {
            md5.update(chunk, 0, n);
        }

        return HexFormat.of().formatHex(md5.digest());
    } catch (IOException | NoSuchAlgorithmException e) {
        throw new RuntimeException(e);
    }
}
}

View File

@@ -0,0 +1,223 @@
package nu.marginalia.language.encoding;
/**
 * Strategy for replacing selected non-ASCII characters in a string with ASCII stand-ins.
 *
 * <p>Every implementation returns its input unchanged when the system property
 * {@code system.noFlattenUnicode} is true or when the input is already pure ASCII.
 */
public interface UnicodeNormalization {

    /**
     * Returns {@code s} with this strategy's replacements applied.
     *
     * @param s the text to normalize
     * @return the normalized text; {@code s} itself when nothing needs to be done
     */
    String flattenUnicode(String s);

    /** Global kill switch, read once from the system property {@code system.noFlattenUnicode}. */
    static final boolean NO_FLATTEN_UNICODE =
            Boolean.getBoolean("system.noFlattenUnicode");

    /** Replaces curly double quotes with a plain {@code "} and keeps everything else as it is. */
    class JustNormalizeQuotes implements UnicodeNormalization {
        @Override
        public String flattenUnicode(String s) {
            if (NO_FLATTEN_UNICODE)
                return s;

            if (isPlainAscii(s)) {
                return s;
            }

            StringBuilder sb = new StringBuilder(s.length() + 10);

            for (int i = 0; i < s.length(); ) {
                int c = s.codePointAt(i);
                i += Character.charCount(c);

                if ("\u201C\u201D".indexOf(c) >= 0) {
                    sb.append('"');
                }
                else {
                    // Only copy the original when it was not replaced
                    sb.appendCodePoint(c);
                }
            }

            return sb.toString();
        }
    }

    /** Like {@link JustNormalizeQuotes}, and additionally replaces {@code é} with {@code e}. */
    class FlattenEAccents implements UnicodeNormalization {
        @Override
        public String flattenUnicode(String s) {
            if (NO_FLATTEN_UNICODE)
                return s;

            if (isPlainAscii(s)) {
                return s;
            }

            StringBuilder sb = new StringBuilder(s.length() + 10);

            // i is a char index, so the loop is bounded by the char length; bounding it by
            // the code point count would cut the string short after a surrogate pair
            for (int i = 0; i < s.length(); ) {
                int c = s.codePointAt(i);
                i += Character.charCount(c);

                if ("\u201C\u201D".indexOf(c) >= 0) {
                    sb.append('"');
                }
                else if ("é".indexOf(c) >= 0) {
                    sb.append('e');
                }
                else {
                    sb.appendCodePoint(c);
                }
            }

            return sb.toString();
        }
    }

    /** Like {@link JustNormalizeQuotes}, and additionally replaces {@code ß} with {@code ss}. */
    class Flattenß implements UnicodeNormalization {
        @Override
        public String flattenUnicode(String s) {
            if (NO_FLATTEN_UNICODE)
                return s;

            if (isPlainAscii(s)) {
                return s;
            }

            StringBuilder sb = new StringBuilder(s.length() + 10);

            for (int i = 0; i < s.length(); ) {
                int c = s.codePointAt(i);
                i += Character.charCount(c);

                if ("\u201C\u201D".indexOf(c) >= 0) {
                    sb.append('"');
                } else if ('ß' == c) {
                    sb.append("ss");
                } else {
                    sb.appendCodePoint(c);
                }
            }

            return sb.toString();
        }
    }

    /**
     * Maps accented and otherwise decorated latin letters to their plain ASCII base letter.
     *
     * <p>Unlike the other strategies the output is pure ASCII: a non-ASCII character that
     * is not listed below is dropped.
     */
    class FlattenAllLatin implements UnicodeNormalization {

        @Override
        public String flattenUnicode(String s) {
            if (NO_FLATTEN_UNICODE)
                return s;

            if (isPlainAscii(s)) {
                return s;
            }

            StringBuilder sb = new StringBuilder(s.length() + 10);

            // Falsehoods programmers believe about the latin alphabet ;-)
            // NOTE(review): a few of the literals below look like a base letter followed by a
            // combining mark; if so, that combining mark on its own is mapped as well -- confirm
            for (int i = 0; i < s.length(); ) {
                int c = s.codePointAt(i);
                i += Character.charCount(c);

                if ("\u201C\u201D".indexOf(c) >= 0) {
                    sb.append('"');
                }
                else if ("áâàȁăåäāǟãąą̊ḁẚⱥ".indexOf(c) >= 0) {
                    sb.append('a');
                }
                else if ("ḃḅḇƀɓ".indexOf(c) >= 0) {
                    sb.append('b');
                }
                else if ("ćĉčçḉċƈȼ".indexOf(c) >= 0) {
                    sb.append('c');
                }
                else if ("ɗḓďḋḍḏḑđðɖḏ".indexOf(c) >= 0) {
                    sb.append('d');
                }
                else if ("éêèȅěëēẽĕęėẹȇḕḗḙḛḝɇ".indexOf(c) >= 0) {
                    sb.append('e');
                }
                else if ("ḟƒ".indexOf(c) >= 0) {
                    sb.append('f');
                }
                else if ("ǵĝǧğġģɠḡǥ".indexOf(c) >= 0) {
                    sb.append('g');
                }
                else if ("ĥȟḧḣḥẖḩḫħⱨ".indexOf(c) >= 0) {
                    // These are variants of h (this branch used to emit 'g')
                    sb.append('h');
                }
                else if ("iıíîìȉïḯīĩįịḭ".indexOf(c) >= 0) {
                    sb.append('i');
                }
                else if ("ĵǰɉ".indexOf(c) >= 0) {
                    sb.append('j');
                }
                else if ("ḱǩķḳḵƙⱪ".indexOf(c) >= 0) {
                    sb.append('k');
                }
                else if ("ĺłḽľļḷḹḻƚɫⱡ".indexOf(c) >= 0) {
                    sb.append('l');
                }
                else if ("ḿṁṃ".indexOf(c) >= 0) {
                    sb.append('m');
                }
                else if ("ŋńǹñṋňṅṇṉʼnn̈ņ".indexOf(c) >= 0) {
                    sb.append('n');
                }
                else if ("óőôòȍŏȯȱöȫōṓṑõṍṏȭøǿǫǭọȏơ".indexOf(c) >= 0) {
                    sb.append('o');
                }
                else if ("ṕṗƥᵽ".indexOf(c) >= 0) {
                    sb.append('p');
                }
                else if ("".indexOf(c) >= 0) {
                    // No q variants are listed, so this branch never matches
                    sb.append('q');
                }
                else if ("ŕȑřŗṙṛṝṟɍɽ".indexOf(c) >= 0) {
                    sb.append('r');
                }
                else if ("śṥŝšṧşșṡṣṩ".indexOf(c) >= 0) {
                    sb.append('s');
                }
                else if ("ťṱẗţțŧṫṭṯⱦ".indexOf(c) >= 0) {
                    sb.append('t');
                }
                else if ("úùûŭưűüūṻųůũṹụṳṵṷʉ".indexOf(c) >= 0) {
                    sb.append('u');
                }
                else if ("ṽṿʋỽ".indexOf(c) >= 0) {
                    sb.append('v');
                }
                else if ("ẃŵẁẅẘẇẉⱳ".indexOf(c) >= 0) {
                    sb.append('w');
                }
                else if ("x̂ẍẋ".indexOf(c) >= 0) {
                    sb.append('x');
                }
                else if ("ƴýŷỳÿȳỹẙẏy̨ɏỿ".indexOf(c) >= 0) {
                    sb.append('y');
                }
                else if ("źẑžżẓẕƶȥ".indexOf(c) >= 0) {
                    sb.append('z');
                }
                else if ("Þþ".indexOf(c) >= 0) {
                    sb.append("th");
                }
                else if ('ß' == c) {
                    sb.append("ss");
                }
                else if (isAscii(c)) {
                    sb.append((char) c);
                }
                // anything else is intentionally left out of the result
            }

            return sb.toString();
        }
    }

    /** Returns true if every code point of {@code s} is 7 bit ASCII (true for the empty string). */
    private static boolean isPlainAscii(String s) {
        for (int i = 0; i < s.length(); ) {
            int c = s.codePointAt(i);
            if (!isAscii(c))
                return false;
            i += Character.charCount(c);
        }
        return true;
    }

    /** Returns true if {@code c} is in the range 0..127. */
    private static boolean isAscii(int c) {
        return (c & ~0x7f) == 0;
    }
}

View File

@@ -0,0 +1,27 @@
package nu.marginalia.language.keywords;
import nu.marginalia.hash.MurmurHash3_128;
/** Strategy for turning a keyword into a 64 bit hash; both variants share one Murmur hasher. */
public sealed interface KeywordHasher {

    /** Hasher instance shared by the implementations below. */
    MurmurHash3_128 hasher = new MurmurHash3_128();

    /**
     * Hashes a keyword.
     *
     * @param keyword the keyword to hash
     * @return the 64 bit hash of the keyword
     */
    long hashKeyword(String keyword);

    /**
     * Hash algorithm that seeds a Murmur128 algorithm with Java's string hashCode(), but
     * then only looks at 7 bit ASCII for the Murmur calculations. This works well for
     * English and similar languages, but falls apart completely for languages that are
     * not dominated by the 7 bit ASCII subset.
     */
    final class AsciiIsh implements KeywordHasher {
        @Override
        public long hashKeyword(String keyword) {
            return hasher.hashNearlyASCII(keyword);
        }
    }

    /** Hash algorithm that is based on Murmur128 folded over on itself to make a 64 bit key. */
    final class Utf8 implements KeywordHasher {
        @Override
        public long hashKeyword(String keyword) {
            return hasher.hashUtf8(keyword);
        }
    }
}

View File

@@ -15,11 +15,13 @@ import java.util.stream.Stream;
*
* @see SentenceExtractor
*/
public record DocumentLanguageData(List<DocumentSentence> sentences, String text) implements Iterable<DocumentSentence> {
public record DocumentLanguageData(LanguageDefinition language,
List<DocumentSentence> sentences,
String text) implements Iterable<DocumentSentence> {
public DocumentLanguageData(List<DocumentSentence> sentences,
String text)
public DocumentLanguageData(LanguageDefinition language, List<DocumentSentence> sentences, String text)
{
this.language = language;
this.sentences = Collections.unmodifiableList(sentences);
this.text = text;
}

View File

@@ -19,13 +19,14 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>
/** A span of words in a sentence */
public final String[] wordsLowerCase;
public final String[] stemmedWords;
public final String[] posTags;
public final long[] posTags;
/** A set of HTML tags that surround the sentence */
public final EnumSet<HtmlTag> htmlTags;
/** A bitset indicating whether the word is a stop word */
private final BitSet isStopWord;
private final BitSet includeInStemming;
/** A bitset indicating whether the word is capitalized */
private final BitSet isCapitalized;
@@ -37,16 +38,16 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>
// where false = COMMA, true = SPACE
private final BitSet separators;
public SoftReference<WordSpan[]> keywords;
public DocumentSentence(BitSet separators,
String[] wordsLowerCase,
String[] posTags,
long[] posTags,
String[] stemmedWords,
EnumSet<HtmlTag> htmlTags,
BitSet isCapitalized,
BitSet isAllCaps
BitSet isAllCaps,
BitSet includeInStemming
)
{
this.separators = separators;
@@ -56,6 +57,7 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>
this.htmlTags = htmlTags;
this.isCapitalized = isCapitalized;
this.isAllCaps = isAllCaps;
this.includeInStemming = includeInStemming;
isStopWord = new BitSet(wordsLowerCase.length);
@@ -87,6 +89,16 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>
return !separators.get(i);
}
/** Returns the position of the next comma at or after {@code pos}, i.e. the index of the
 * first clear bit in the separator bit set (false = COMMA, true = SPACE).
 * <p>
 * {@link BitSet#nextClearBit(int)} never returns a negative value: when no comma remains,
 * the result is the first index past the remaining run of set bits. Callers should
 * therefore treat any result at or beyond the end of the sentence as "no comma".
 */
public int nextCommaPos(int pos) {
    return separators.nextClearBit(pos);
}
public String constructWordFromSpan(WordSpan span) {
if (span.size() == 1) {
return trimJunkCharacters(wordsLowerCase[span.start]);
@@ -153,10 +165,7 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>
}
private boolean includeInStemming(int i) {
if (posTags[i].equals("IN") || posTags[i].equals("TO") || posTags[i].equals("CC") || posTags[i].equals("DT")) {
return false;
}
return true;
return includeInStemming.get(i);
}
@Override
@@ -199,7 +208,7 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>
}
public String wordLowerCase() { return wordsLowerCase[pos]; }
public String posTag() { return posTags[pos]; }
public long posTag() { return posTags[pos]; }
public String stemmed() { return stemmedWords[pos]; }
public boolean isStopWord() { return DocumentSentence.this.isStopWord(pos); }

View File

@@ -0,0 +1,145 @@
package nu.marginalia.language.model;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.encoding.UnicodeNormalization;
import nu.marginalia.language.keywords.KeywordHasher;
import nu.marginalia.language.pos.PosPattern;
import nu.marginalia.language.pos.PosPatternCategory;
import nu.marginalia.language.pos.PosTagger;
import nu.marginalia.language.stemming.Stemmer;
import javax.annotation.Nullable;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Immutable bundle of the language-specific components configured for one language:
 * stemmer, unicode normalization, keyword hasher, and optionally a part-of-speech tagger
 * together with the grammar patterns that depend on it.
 */
public final class LanguageDefinition {
    private final String isoCode;
    private final String name;
    private final Stemmer stemmer;
    private final UnicodeNormalization unicodeNormalization;
    private final KeywordHasher keywordHasher;

    /** Null for languages without part-of-speech tagging. */
    @Nullable
    private final PosTagger posTagger;
    private final Map<PosPatternCategory, List<PosPattern>> posPatterns;

    public LanguageDefinition(String isoCode,
                              String name,
                              Stemmer stemmer,
                              UnicodeNormalization unicodeNormalization,
                              KeywordHasher keywordHasher,
                              @Nullable PosTagger posTagger,
                              Map<PosPatternCategory, List<PosPattern>> posPatterns) {
        this.isoCode = isoCode;
        this.name = name;
        this.stemmer = stemmer;
        this.unicodeNormalization = unicodeNormalization;
        this.keywordHasher = keywordHasher;
        this.posTagger = posTagger;
        this.posPatterns = posPatterns;
    }

    public String isoCode() { return isoCode; }

    public String displayName() { return name; }

    public Stemmer stemmer() { return stemmer; }

    @Nullable
    public PosTagger posTagger() { return posTagger; }

    public KeywordHasher keywordHasher() { return keywordHasher; }

    public UnicodeNormalization unicodeNormalization() { return unicodeNormalization; }

    /** Tags a sentence with encoded part-of-speech tags.
     *
     * @return one encoded tag per word, or an EMPTY array (not one sized like the input)
     *         when this language has no tagger
     */
    public long[] posTagSentence(String[] words) {
        return (posTagger == null)
                ? new long[0]
                : posTagger.tagSentence(words);
    }

    /** Returns true if this language has a part-of-speech tagger. */
    public boolean hasPosParsing() {
        return posTagger != null;
    }

    /** Returns the patterns configured for a category, or an empty list if there are none. */
    public List<PosPattern> getPosPatterns(PosPatternCategory category) {
        return posPatterns.getOrDefault(category, List.of());
    }

    /** Decodes an encoded tag to its name; returns the empty string when there is no tagger. */
    public String decodePosTagName(long tagName) {
        if (!hasPosParsing()) {
            return "";
        }
        return posTagger.decodeTagName(tagName);
    }

    /** Returns every span in the sentence matched by any pattern of the given category. */
    public List<WordSpan> matchGrammarPattern(DocumentSentence sentence, PosPatternCategory category) {
        List<WordSpan> matches = new ArrayList<>(2 * sentence.length());

        for (PosPattern candidate : getPosPatterns(category)) {
            candidate.matchSentence(sentence, matches);
        }

        return matches;
    }

    /** Returns true if any pattern of the category matches the sentence starting at {@code pos}. */
    public boolean matchGrammarPattern(DocumentSentence sentence, PosPatternCategory category, int pos) {
        for (PosPattern candidate : getPosPatterns(category)) {
            if (candidate.isMatch(sentence, pos)) {
                return true;
            }
        }
        return false;
    }

    /** Returns true if any pattern of the category has the same size as the span and matches
     * the sentence at the span's start. */
    public boolean matchGrammarPattern(DocumentSentence sentence, PosPatternCategory category, WordSpan span) {
        for (PosPattern candidate : getPosPatterns(category)) {
            if (candidate.size() == span.size() && candidate.isMatch(sentence, span.start)) {
                return true;
            }
        }
        return false;
    }

    /** Enumerates all spans of one to four consecutive words that are viable as a word,
     * shortest spans first. */
    public List<WordSpan> getWordsFromSentence(DocumentSentence sentence) {
        List<WordSpan> viable = new ArrayList<>();

        // extra = number of additional words in the span beyond the first
        for (int extra = 0; extra < 4; extra++) {
            for (int last = extra; last < sentence.length(); last++) {
                WordSpan candidate = new WordSpan(last - extra, last + 1);
                if (isViableSpanForWord(sentence, candidate)) {
                    viable.add(candidate);
                }
            }
        }

        return viable;
    }

    /** A span is viable if no comma occurs before its last word, it matches a TITLE
     * pattern, and the word built from it is neither blank nor junk. */
    private boolean isViableSpanForWord(DocumentSentence sentence, WordSpan w) {
        boolean commaInsideSpan = sentence.nextCommaPos(w.start) < w.end - 1;
        if (commaInsideSpan || !matchGrammarPattern(sentence, PosPatternCategory.TITLE, w)) {
            return false;
        }

        String word = sentence.constructWordFromSpan(w);
        if (word.isBlank()) {
            return false;
        }
        return WordPatterns.isNotJunkWord(word);
    }
}

View File

@@ -0,0 +1,4 @@
package nu.marginalia.language.model;
/** Checked exception for signalling an unsupported language; it declares no constructors
 * of its own, so it carries neither a message nor a cause.
 * NOTE(review): where it is thrown is not visible from this file.
 */
public class UnsupportedLanguageException extends Exception {
}

View File

@@ -7,6 +7,8 @@ public class WordSpan implements Comparable<WordSpan> {
public final int end;
/** Creates a span from {@code start} to {@code end}.
 * NOTE(review): end looks exclusive (single-word spans are created as (i, i+1) elsewhere) -- confirm.
 */
public WordSpan(int start, int end) {
// Sanity check only: it has no effect unless the JVM runs with assertions enabled (-ea)
assert end >= start;
this.start = start;
this.end = end;
}

View File

@@ -0,0 +1,236 @@
package nu.marginalia.language.pos;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordSpan;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Collections;
import java.util.List;
/**
 * A sequence of part-of-speech tag masks compiled from a pattern expression.
 *
 * <p>Each position of the pattern is a bit mask of acceptable tags; a word matches a
 * position when its encoded tag shares at least one bit with the mask.
 */
public class PosPattern {
    public final LongArrayList pattern = new LongArrayList();
    private static final Logger logger = LoggerFactory.getLogger(PosPattern.class);

    /** Returns a copy of the pattern masks as an array. */
    public long[] toArray() {
        return pattern.toLongArray();
    }

    /** Returns the number of positions in the pattern. */
    public int size() {
        return pattern.size();
    }

    /**
     * Compiles a pattern expression against the tag set of the given tagger.
     *
     * @throws IllegalArgumentException if the expression yields no positions
     */
    public PosPattern(PosTagger posTagger, String expression) {
        for (List<String> acceptedTags : PosTagPatternParser.parse(posTagger, expression)) {
            pattern.add(posTagger.encodeTagNames(acceptedTags));
        }

        if (pattern.isEmpty()) {
            throw new IllegalArgumentException("Zero length patterns are not allowed");
        }
    }

    /**
     * Finds every occurrence of this pattern in the sentence and appends the matching
     * spans to {@code ret}. Multi-word matches must not have a comma before their last word.
     *
     * @return the number of spans added
     */
    public int matchSentence(DocumentSentence sentence, List<WordSpan> ret) {
        final int patternLength = pattern.size();
        int matches = 0;

        // Fast case for 1-length patterns
        if (patternLength == 1) {
            final long only = pattern.getLong(0);
            for (int i = 0; i < sentence.length(); i++) {
                if ((sentence.posTags[i] & only) != 0L) {
                    ret.add(new WordSpan(i, i + 1));
                    matches++;
                }
            }
            return matches;
        }

        for (int start = 0; start <= sentence.length() - patternLength; start++) {
            if (!tagsMatchAt(sentence.posTags, start)) {
                continue;
            }

            // No comma may occur inside the match, except after its last word
            int commaPos = sentence.nextCommaPos(start);
            if (commaPos < start + patternLength - 1) {
                // The loop increment then moves us to just after the comma
                start = commaPos;
                continue;
            }

            ret.add(new WordSpan(start, start + patternLength));
            matches++;
        }

        return matches;
    }

    /** Returns true if this pattern matches the sentence starting exactly at {@code pos}. */
    public boolean isMatch(DocumentSentence sentence, int pos) {
        final int patternLength = pattern.size();

        if (pos + patternLength > sentence.length()) {
            return false;
        }

        if ((sentence.posTags[pos] & pattern.getLong(0)) == 0) {
            return false;
        }
        if (patternLength == 1) {
            return true;
        }

        if (sentence.nextCommaPos(pos) < pos + patternLength - 1) {
            return false;
        }

        for (int offset = 1; offset < patternLength; offset++) {
            if ((sentence.posTags[pos + offset] & pattern.getLong(offset)) == 0L) {
                return false;
            }
        }
        return true;
    }

    /** Return a bit set for every position where this pattern matches the tag sequence provided */
    public BitSet matchTagPattern(long[] tags) {
        BitSet hits = new BitSet(tags.length);

        for (int start = 0; start <= tags.length - pattern.size(); start++) {
            if (tagsMatchAt(tags, start)) {
                hits.set(start);
            }
        }

        return hits;
    }

    /** Returns true if every pattern position shares a bit with the tag at the
     * corresponding offset from {@code start}; the caller guarantees the range is in bounds. */
    private boolean tagsMatchAt(long[] tags, int start) {
        for (int offset = 0; offset < pattern.size(); offset++) {
            if ((tags[start + offset] & pattern.getLong(offset)) == 0L) {
                return false;
            }
        }
        return true;
    }
}
/**
 * Parser for part-of-speech pattern expressions.
 *
 * <p>An expression is a whitespace separated list of positions. Each position is either a
 * single tag name, or a parenthesized group of alternative tag names. A tag name ending in
 * {@code *} expands to every known tag with that prefix, and {@code !} before a tag or
 * group inverts it, i.e. accepts every known tag except the ones listed.
 */
class PosTagPatternParser {
    private boolean inverted;
    private boolean inParen;

    /** One list of accepted tag names per pattern position. */
    private final List<List<String>> variants = new ArrayList<>();
    private final List<String> allTags;

    public PosTagPatternParser(PosTagger posTagger) {
        allTags = Collections.unmodifiableList(posTagger.tags());
    }

    /**
     * Parses an expression into one list of accepted tag names per pattern position.
     *
     * @param posTagger supplies the set of known tags, used for wildcards and inversion
     * @param expression the pattern expression
     * @return the accepted tag names for each position, in order
     */
    public static List<List<String>> parse(PosTagger posTagger, String expression) {
        PosTagPatternParser patternBuilder = new PosTagPatternParser(posTagger);

        for (String token : tokenize(expression)) {
            switch (token) {
                case "!" -> patternBuilder.invert();
                case "(" -> patternBuilder.parenOpen();
                case ")" -> patternBuilder.parenClose();
                default -> patternBuilder.addToken(token);
            }
        }

        return patternBuilder.variants;
    }

    /** Splits an expression into {@code (}, {@code )}, {@code !} and tag name tokens. */
    private static List<String> tokenize(String expression) {
        List<String> tokens = new ArrayList<>();
        int pos = 0;

        while (pos < expression.length()) {
            char c = expression.charAt(pos);

            if ("()!".indexOf(c) >= 0) {
                tokens.add(expression.substring(pos, pos + 1));
                pos++;
            }
            else if (isSeparator(c)) {
                pos++;
            }
            else {
                int end = pos + 1;
                while (end < expression.length()) {
                    char ce = expression.charAt(end);
                    if (ce == '(' || ce == ')' || isSeparator(ce)) {
                        break;
                    }
                    end++;
                }
                tokens.add(expression.substring(pos, end));
                pos = end;
            }
        }

        return tokens;
    }

    /** Returns true for any character that separates tokens. Tabs and line breaks count too,
     * since expressions come from XML text content that may be wrapped or indented. */
    private static boolean isSeparator(char c) {
        return Character.isWhitespace(c) || Character.isSpaceChar(c);
    }

    /** Marks the next tag or group as inverted. */
    public void invert() {
        inverted = true;
    }

    /** Starts a group; all tags up to the closing parenthesis go into a single position. */
    public void parenOpen() {
        inParen = true;
        beginToken();
    }

    /** Ends the current group and clears any inversion. */
    public void parenClose() {
        inParen = false;
        inverted = false;
    }

    /** Starts a new position; an inverted position starts from all tags and has tags removed. */
    private void beginToken() {
        variants.add(new ArrayList<>());
        if (inverted)
            variants.getLast().addAll(allTags);
    }

    /** Adds a tag name (or wildcard) to the current position, or removes it when inverted. */
    public void addToken(String token) {
        // Outside of a group, every tag is a position of its own
        if (!inParen) beginToken();

        List<String> tokensExpanded;
        if (token.endsWith("*")) {
            String prefix = token.substring(0, token.length() - 1);
            tokensExpanded = allTags.stream().filter(str -> prefix.isEmpty() || str.startsWith(prefix)).toList();
        }
        else {
            tokensExpanded = List.of(token);
        }

        if (inverted) {
            variants.getLast().removeAll(tokensExpanded);
        }
        else {
            variants.getLast().addAll(tokensExpanded);
        }

        // Outside of a group an inversion applies to a single tag only
        if (!inParen) {
            inverted = false;
        }
    }
}

View File

@@ -0,0 +1,9 @@
package nu.marginalia.language.pos;
/** Categories that part-of-speech patterns are grouped under. In the language configuration
 * these are selected by the ngrams type names "name", "noun", "keyword", "title" and
 * "subject-suffix" respectively.
 */
public enum PosPatternCategory {
NAME,
NOUN,
KEYWORD,
TITLE,
SUBJECT_SUFFIX
}

View File

@@ -0,0 +1,130 @@
package nu.marginalia.language.pos;
import com.github.datquocnguyen.RDRPOSTagger;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntList;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
public class PosTagger {
private final RDRPOSTagger rdrposTagger;
public final Map<String, Integer> tagDict;
public final List<String> tagNames;
private final String isoCode;
public PosTagger(String isoCode, Path dictFilePath, Path rdrFilePath) throws IOException {
this.isoCode = isoCode;
rdrposTagger = new RDRPOSTagger(dictFilePath, rdrFilePath);
List<String> tagNames = new ArrayList<>();
HashMap<String, Integer> tags = new HashMap<>();
try (var linesStream = Files.lines(dictFilePath)) {
linesStream.map(line -> StringUtils.split(line, " ", 2))
.filter(line -> line.length==2)
.map(line -> line[1])
.distinct()
.forEach(tag -> {
tags.putIfAbsent(tag, tagNames.size());
tagNames.add(tag);
});
} catch (IOException e) {
throw new RuntimeException(e);
}
this.tagDict = Collections.unmodifiableMap(tags);
this.tagNames = Collections.unmodifiableList(tagNames);
}
/** Alternate constructor for tests */
public PosTagger(String isoCode, List<String> tags) {
this.isoCode = isoCode;
this.tagNames = tags.stream().distinct().toList();
this.tagDict = tags.stream().distinct().collect(Collectors.toMap(Function.identity(), tagNames::indexOf, (a,b)->a));
this.rdrposTagger = null;
}
public long[] tagSentence(String[] words) {
String[] tags;
// Unclear if this is necessary, but the library does have a different function for tagging English
if ("en".equalsIgnoreCase(isoCode)) {
tags = rdrposTagger.tagsForEnSentence(words);
}
else {
tags = rdrposTagger.tagSentence(words);
}
// Encode the tags as a bit mask. These will just have one (or zero) bits set
// but will match against more complex masks
long[] encodedTags = new long[tags.length];
for (int i = 0; i < encodedTags.length; i++) {
encodedTags[i] = encodeTagName(tags[i]);
}
return encodedTags;
}
public long encodeTagName(String tagName) {
Integer tag = tagDict.get(tagName);
if (tag == null) {
return 0L;
}
return 1L << tag;
}
public long encodeTagNames(List<String> tagNames) {
long ret = 0;
for (String tagName : tagNames) {
ret |= encodeTagName(tagName);
}
return ret;
}
public String decodeTagName(long encodedTag) {
if (encodedTag == 0)
return "?";
return tagName(Long.numberOfTrailingZeros(encodedTag));
}
public String tagName(int tagId) {
if (tagId < 0 || tagId >= tagNames.size())
return "?";
return tagNames.get(tagId);
}
public OptionalInt tagId(String tagName) {
Integer id = tagDict.get(tagName);
if (id == null)
return OptionalInt.empty();
return OptionalInt.of(id);
}
public List<String> tags() {
var ret = new ArrayList<>(tagDict.keySet());
ret.sort(Comparator.naturalOrder());
return ret;
}
/** Find the ids of all tags whose name starts with the given prefix.
 *
 * @param tagNamePrefix prefix to match against the tag names
 * @return the ids of the matching tags, in the iteration order of the tag dictionary
 */
public IntList tagIdsForPrefix(String tagNamePrefix) {
    final IntArrayList matchingIds = new IntArrayList();
    for (Map.Entry<String, Integer> entry : tagDict.entrySet()) {
        if (entry.getKey().startsWith(tagNamePrefix)) {
            matchingIds.add(entry.getValue().intValue());
        }
    }
    return matchingIds;
}
/** Debug representation listing the tag dictionary. */
@Override
public String toString() {
    return new StringBuilder("PosTaggingData{ tags=")
            .append(tagDict)
            .append('}')
            .toString();
}
}

View File

@@ -1,17 +1,20 @@
package nu.marginalia.language.sentence;
import com.github.datquocnguyen.RDRPOSTagger;
import com.google.inject.Inject;
import nu.marginalia.LanguageModels;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.model.UnsupportedLanguageException;
import nu.marginalia.language.pos.PosPattern;
import nu.marginalia.language.sentence.tag.HtmlStringTagger;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.language.sentence.tag.HtmlTaggedString;
import nu.marginalia.language.stemming.Stemmer;
import nu.marginalia.segmentation.NgramLexicon;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.stemmer.PorterStemmer;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
@@ -29,12 +32,11 @@ import java.util.*;
*/
public class SentenceExtractor {
private final LanguageConfiguration languageConfiguration;
private SentenceDetectorME sentenceDetector;
private static RDRPOSTagger rdrposTagger;
private static NgramLexicon ngramLexicon = null;
private final PorterStemmer porterStemmer = new PorterStemmer();
private static final Logger logger = LoggerFactory.getLogger(SentenceExtractor.class);
private static final SentencePreCleaner sentencePrecleaner = new SentencePreCleaner();
@@ -46,8 +48,10 @@ public class SentenceExtractor {
static final int MAX_SENTENCE_COUNT = 1000;
@Inject
public SentenceExtractor(LanguageModels models)
public SentenceExtractor(LanguageConfiguration languageConfiguration, LanguageModels models)
{
this.languageConfiguration = languageConfiguration;
try (InputStream modelIn = new FileInputStream(models.openNLPSentenceDetectionData.toFile())) {
var sentenceModel = new SentenceModel(modelIn);
sentenceDetector = new SentenceDetectorME(sentenceModel);
@@ -61,21 +65,14 @@ public class SentenceExtractor {
if (ngramLexicon == null) {
ngramLexicon = new NgramLexicon(models);
}
if (rdrposTagger == null) {
try {
rdrposTagger = new RDRPOSTagger(models.posDict, models.posRules);
} catch (Exception ex) {
throw new IllegalStateException(ex);
}
}
}
}
public DocumentLanguageData extractSentences(Document doc) {
public DocumentLanguageData extractSentences(Document doc) throws UnsupportedLanguageException {
var language = languageConfiguration.identifyLanguage(doc).orElseThrow(UnsupportedLanguageException::new);
final List<DocumentSentence> textSentences = new ArrayList<>();
final List<HtmlTaggedString> taggedStrings = HtmlStringTagger.tagDocumentStrings(doc);
final int totalTextLength = taggedStrings.stream().mapToInt(HtmlTaggedString::length).sum();
@@ -85,7 +82,7 @@ public class SentenceExtractor {
String text = taggedString.string();
textSentences.addAll(
extractSentencesFromString(text, taggedString.tags())
extractSentencesFromString(language, text, taggedString.tags())
);
if (documentText.isEmpty()) {
@@ -96,32 +93,62 @@ public class SentenceExtractor {
}
}
return new DocumentLanguageData(textSentences, documentText.toString());
return new DocumentLanguageData(language, textSentences, documentText.toString());
}
public DocumentLanguageData extractSentences(String text, String title) {
var textSentences = extractSentencesFromString(text, EnumSet.noneOf(HtmlTag.class));
var titleSentences = extractSentencesFromString(title.toLowerCase(), EnumSet.of(HtmlTag.TITLE));
LanguageDefinition language = languageConfiguration.identifyLanguage(text, "en")
.orElseThrow(() -> new RuntimeException("Language not found for default isoCode 'en'"));
var textSentences = extractSentencesFromString(language, text, EnumSet.noneOf(HtmlTag.class));
var titleSentences = extractSentencesFromString(language, title.toLowerCase(), EnumSet.of(HtmlTag.TITLE));
List<DocumentSentence> combined = new ArrayList<>(textSentences.size() + titleSentences.size());
combined.addAll(titleSentences);
combined.addAll(textSentences);
return new DocumentLanguageData(
language,
combined,
text);
}
public DocumentSentence extractSentence(String text, EnumSet<HtmlTag> htmlTags) {
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(text, MAX_SENTENCE_LENGTH);
/** Extract sentences from a plain text string that has no HTML tag context.
 *
 * The language is identified from the text, with "en" passed as the default
 * ISO code to the language configuration.
 *
 * @param text the text to split into sentences
 * @return the identified language, the extracted sentences and the original text
 * @throws RuntimeException if no language definition could be resolved
 */
public DocumentLanguageData extractSentences(String text) {
    final LanguageDefinition language = languageConfiguration
            .identifyLanguage(text, "en")
            .orElseThrow(() -> new RuntimeException("Language not found for default isoCode 'en'"));

    final EnumSet<HtmlTag> noTags = EnumSet.noneOf(HtmlTag.class);

    return new DocumentLanguageData(
            language,
            extractSentencesFromString(language, text, noTags),
            text);
}
public DocumentSentence extractSentence(LanguageDefinition language,
String text,
EnumSet<HtmlTag> htmlTags) {
final Stemmer stemmer = language.stemmer();
var wordsAndSeps = new SentenceSegmentSplitter(language).splitSegment(text, MAX_SENTENCE_LENGTH);
String[] words = wordsAndSeps.words();
BitSet seps = wordsAndSeps.separators();
String[] lc = new String[words.length];
String[] stemmed = new String[words.length];
long[] posTags = language.posTagSentence(words);
BitSet isCapitalized = new BitSet(words.length);
BitSet isAllCaps = new BitSet(words.length);
BitSet includeInStemming;
PosPattern inclusionPattern = stemmer.inclusionPatten();
if (inclusionPattern == null) {
includeInStemming = new BitSet(lc.length);
includeInStemming.set(0, lc.length);
}
else {
includeInStemming = inclusionPattern.matchTagPattern(posTags);
}
for (int i = 0; i < words.length; i++) {
lc[i] = stripPossessive(words[i].toLowerCase());
@@ -134,7 +161,7 @@ public class SentenceExtractor {
}
try {
stemmed[i] = porterStemmer.stem(lc[i]);
stemmed[i] = stemmer.stem(lc[i]);
}
catch (Exception ex) {
stemmed[i] = "NN"; // ???
@@ -144,16 +171,18 @@ public class SentenceExtractor {
return new DocumentSentence(
seps,
lc,
rdrposTagger.tagsForEnSentence(words),
posTags,
stemmed,
htmlTags,
isCapitalized,
isAllCaps
isAllCaps,
includeInStemming
);
}
public List<DocumentSentence> extractSentencesFromString(String text, EnumSet<HtmlTag> htmlTags) {
String[] sentences;
public List<DocumentSentence> extractSentencesFromString(LanguageDefinition language, String text, EnumSet<HtmlTag> htmlTags) {
final Stemmer stemmer = language.stemmer();
// Safety net against malformed data DOS attacks,
// found 5+ MB <p>-tags in the wild that just break
@@ -167,7 +196,7 @@ public class SentenceExtractor {
text = normalizeSpaces(text);
// Split into sentences
String[] sentences;
try {
sentences = sentenceDetector.sentDetect(text);
}
@@ -189,22 +218,34 @@ public class SentenceExtractor {
List<DocumentSentence> ret = new ArrayList<>(sentences.length);
SentenceSegmentSplitter sentenceSegmentSplitter = new SentenceSegmentSplitter(language);
if (isNaturalLanguage) {
// Natural language text; do POS tagging and stemming
for (String sent : sentences) {
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
var wordsAndSeps = sentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
var tokens = wordsAndSeps.words();
var separators = wordsAndSeps.separators();
var posTags = rdrposTagger.tagsForEnSentence(tokens);
var posTags = language.posTagSentence(tokens);
var tokensLc = new String[tokens.length];
var stemmed = new String[tokens.length];
BitSet isCapitalized = new BitSet(tokens.length);
BitSet isAllCaps = new BitSet(tokens.length);
BitSet includeInStemming;
PosPattern inclusionPattern = stemmer.inclusionPatten();
if (inclusionPattern == null) {
includeInStemming = new BitSet(tokens.length);
includeInStemming.set(0, tokens.length);
}
else {
includeInStemming = inclusionPattern.matchTagPattern(posTags);
}
for (int i = 0; i < tokens.length; i++) {
if (tokens[i].length() > 0 && Character.isUpperCase(tokens[i].charAt(0))) {
if (!tokens[i].isEmpty() && Character.isUpperCase(tokens[i].charAt(0))) {
isCapitalized.set(i);
}
if (StringUtils.isAllUpperCase(tokens[i])) {
@@ -221,13 +262,13 @@ public class SentenceExtractor {
}
try {
stemmed[i] = porterStemmer.stem(tokens[i]);
stemmed[i] = stemmer.stem(tokens[i]);
}
catch (Exception ex) {
stemmed[i] = "NN"; // ???
}
}
ret.add(new DocumentSentence(separators, tokensLc, posTags, stemmed, htmlTags, isCapitalized, isAllCaps));
ret.add(new DocumentSentence(separators, tokensLc, posTags, stemmed, htmlTags, isCapitalized, isAllCaps, includeInStemming));
}
}
else {
@@ -235,21 +276,22 @@ public class SentenceExtractor {
// as this is not likely to be useful
for (String sent : sentences) {
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
var wordsAndSeps = sentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
var tokens = wordsAndSeps.words();
var separators = wordsAndSeps.separators();
var posTags = new String[tokens.length];
Arrays.fill(posTags, "X"); // Placeholder POS tag
var posTags = new long[tokens.length];
var tokensLc = new String[tokens.length];
var stemmed = new String[tokens.length];
BitSet isCapitalized = new BitSet(tokens.length);
BitSet isAllCaps = new BitSet(tokens.length);
BitSet includeInStemming = new BitSet(tokens.length);
includeInStemming.set(0, tokens.length);
for (int i = 0; i < tokensLc.length; i++) {
var originalVal = tokens[i];
if (tokens[i].length() > 0 && Character.isUpperCase(tokens[i].charAt(0))) {
if (!tokens[i].isEmpty() && Character.isUpperCase(tokens[i].charAt(0))) {
isCapitalized.set(i);
}
if (StringUtils.isAllUpperCase(tokens[i])) {
@@ -264,7 +306,7 @@ public class SentenceExtractor {
stemmed[i] = tokensLc[i]; // we don't stem non-language words
}
ret.add(new DocumentSentence(separators, tokensLc, posTags, stemmed, htmlTags, isAllCaps, isCapitalized));
ret.add(new DocumentSentence(separators, tokensLc, posTags, stemmed, htmlTags, isAllCaps, isCapitalized, includeInStemming));
}
}

View File

@@ -2,7 +2,8 @@ package nu.marginalia.language.sentence;
import com.google.common.base.CharMatcher;
import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.language.encoding.UnicodeNormalization;
import nu.marginalia.language.model.LanguageDefinition;
import java.util.ArrayList;
import java.util.BitSet;
@@ -13,10 +14,11 @@ import static nu.marginalia.language.WordPatterns.MAX_WORD_LENGTH;
public class SentenceSegmentSplitter {
private final UnicodeNormalization unicodeNormalization;
public record SeparatedSentence(String[] words, BitSet separators) { }
private static final CharMatcher noiseCharacterMatcher = CharMatcher.anyOf("/*-");
private static final Pattern wordBreakPattern;
static {
@@ -31,13 +33,17 @@ public class SentenceSegmentSplitter {
}
}
SentenceSegmentSplitter(LanguageDefinition languageDefinition) {
this.unicodeNormalization = languageDefinition.unicodeNormalization();
}
/** Split a sentence into words and separators.
*
* @param segment The sentence to split
* @return A list of words and separators
*/
public static SeparatedSentence splitSegment(String segment, int maxLength) {
String flatSegment = AsciiFlattener.flattenUnicode(segment);
public SeparatedSentence splitSegment(String segment, int maxLength) {
String flatSegment = unicodeNormalization.flattenUnicode(segment);
var matcher = wordBreakPattern.matcher(flatSegment);

View File

@@ -3,14 +3,15 @@ package nu.marginalia.language.sentence;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.LanguageModels;
import nu.marginalia.language.config.LanguageConfiguration;
@Singleton
public class ThreadLocalSentenceExtractorProvider {
private final ThreadLocal<SentenceExtractor> sentenceExtractorThreadLocal;
@Inject
public ThreadLocalSentenceExtractorProvider(LanguageModels languageModels) {
sentenceExtractorThreadLocal = ThreadLocal.withInitial(() -> new SentenceExtractor(languageModels));
public ThreadLocalSentenceExtractorProvider(LanguageConfiguration languageConfiguration, LanguageModels languageModels) {
sentenceExtractorThreadLocal = ThreadLocal.withInitial(() -> new SentenceExtractor(languageConfiguration, languageModels));
}
public SentenceExtractor get() {

View File

@@ -0,0 +1,68 @@
package nu.marginalia.language.stemming;
import nu.marginalia.language.pos.PosPattern;
import opennlp.tools.stemmer.snowball.SnowballStemmer;
import javax.annotation.Nullable;
/** Strategy for reducing a word to its stem.
 *
 * Implementations wrap a concrete stemming algorithm, and optionally carry a
 * part-of-speech pattern restricting which words should be included in stemming.
 */
public sealed interface Stemmer {
    /** Stem a single word.
     *
     * @param input the word to stem
     * @return the stemmed form of the word
     */
    String stem(String input);

    /** Part-of-speech pattern selecting the words to include in stemming.
     *
     * (The method name is misspelled, but it is part of the interface that
     * callers use, so it is kept as-is.)
     *
     * @return the pattern, or null if no restriction is configured
     */
    @Nullable PosPattern inclusionPatten();

    /** Stemmer backed by the Porter stemming algorithm. */
    final class Porter implements Stemmer {
        private static final ca.rmen.porterstemmer.PorterStemmer porterStemmerImpl = new ca.rmen.porterstemmer.PorterStemmer();

        @Nullable
        private final PosPattern inclusionPattern;

        /** @param inclusionPattern pattern of words to include in stemming, or null for no restriction */
        public Porter(@Nullable PosPattern inclusionPattern) {
            this.inclusionPattern = inclusionPattern;
        }

        @Override
        @Nullable
        public PosPattern inclusionPatten() {
            return inclusionPattern;
        }

        @Override
        public String stem(String input) {
            return porterStemmerImpl.stemWord(input);
        }
    }

    /** Stemmer backed by one of the OpenNLP Snowball algorithms. */
    final class Snowball implements Stemmer {
        private final SnowballStemmer snowballStemmer;

        @Nullable
        private final PosPattern inclusionPattern;

        /**
         * @param algorithmName    name of a {@code SnowballStemmer.ALGORITHM} constant, case-insensitive
         * @param inclusionPattern pattern of words to include in stemming, or null for no restriction
         * @throws IllegalArgumentException if the name does not match any algorithm constant
         */
        public Snowball(String algorithmName, @Nullable PosPattern inclusionPattern) {
            this.inclusionPattern = inclusionPattern;

            // Upper-case with a fixed locale: with the platform default locale the
            // result can differ from the enum constant names (e.g. a Turkish locale
            // maps 'i' to a dotted capital I), which would make valueOf() fail.
            SnowballStemmer.ALGORITHM algorithm =
                    SnowballStemmer.ALGORITHM.valueOf(algorithmName.toUpperCase(java.util.Locale.ROOT));
            snowballStemmer = new SnowballStemmer(algorithm);
        }

        @Override
        @Nullable
        public PosPattern inclusionPatten() {
            return inclusionPattern;
        }

        @Override
        public String stem(String input) {
            // Snowball impl declares return value as CharSequence;
            // toString() is a no-op when it is already a String, and
            // stays correct should the implementation return something else
            return snowballStemmer.stem(input).toString();
        }
    }

    /** Stemmer that returns its input unchanged and has no inclusion pattern. */
    final class NoOpStemmer implements Stemmer {

        @Override
        @Nullable
        public PosPattern inclusionPatten() {
            return null;
        }

        @Override
        public String stem(String input) {
            return input;
        }
    }
}

View File

@@ -1,7 +1,6 @@
package nu.marginalia.segmentation;
import it.unimi.dsi.fastutil.longs.*;
import nu.marginalia.util.SimpleBlockingThreadPool;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.openzim.ZIMTypes.ZIMFile;
@@ -11,7 +10,7 @@ import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.ForkJoinPool;
public class NgramExtractorMain {
public static void main(String... args) throws IOException, InterruptedException {
@@ -112,50 +111,45 @@ public class NgramExtractorMain {
var orderedHasher = HasherGroup.ordered();
var pool = new SimpleBlockingThreadPool("ngram-extractor",
Math.clamp(2, Runtime.getRuntime().availableProcessors(), 32),
32
);
try (var pool = new ForkJoinPool(Math.clamp(2, Runtime.getRuntime().availableProcessors(), 32))) {
reader.forEachTitles((title) -> {
pool.submitQuietly(() -> {
LongArrayList orderedHashesTitle = new LongArrayList();
reader.forEachTitles((title) -> {
pool.submit(() -> {
LongArrayList orderedHashesTitle = new LongArrayList();
String normalizedTitle = title.replace('_', ' ');
String normalizedTitle = title.replace('_', ' ');
for (var sent : getNgramTitleTerms(normalizedTitle)) {
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
orderedHashesTitle.add(orderedHasher.rollingHash(terms));
}
synchronized (lexicon) {
for (var hash : orderedHashesTitle) {
lexicon.incOrderedTitle(hash);
for (var sent : getNgramTitleTerms(normalizedTitle)) {
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
orderedHashesTitle.add(orderedHasher.rollingHash(terms));
}
}
synchronized (lexicon) {
for (var hash : orderedHashesTitle) {
lexicon.incOrderedTitle(hash);
}
}
});
});
});
reader.forEachArticles((title, body) -> {
pool.submit(() -> {
LongArrayList orderedHashesBody = new LongArrayList();
reader.forEachArticles((title, body) -> {
pool.submitQuietly(() -> {
LongArrayList orderedHashesBody = new LongArrayList();
for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
orderedHashesBody.add(orderedHasher.rollingHash(terms));
}
synchronized (lexicon) {
for (var hash : orderedHashesBody) {
lexicon.incOrderedBody(hash);
for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
orderedHashesBody.add(orderedHasher.rollingHash(terms));
}
}
});
}, p -> true);
synchronized (lexicon) {
for (var hash : orderedHashesBody) {
lexicon.incOrderedBody(hash);
}
}
});
pool.shutDown();
pool.awaitTermination(10, TimeUnit.DAYS);
}, p -> true);
}
lexicon.saveCounts(countsOutputFile);
}

View File

@@ -5,16 +5,19 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import nu.marginalia.LanguageModels;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
/** Dictionary with term frequency information for (stemmed) words.
*
@@ -38,15 +41,23 @@ public class TermFrequencyDict {
}
private static Long2IntOpenHashMap load(Path file) throws IOException {
try (LongArray array = LongArrayFactory.mmapForReadingConfined(file)) {
try (Arena arena = Arena.ofConfined();
FileChannel fileChannel = (FileChannel) Files.newByteChannel(file, StandardOpenOption.READ)) {
int size = (int) Files.size(file) / 16;
long fileSizeBytes = Files.size(file);
MemorySegment mappedFile = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileSizeBytes, arena);
int size = (int) fileSizeBytes / 16;
var ret = new Long2IntOpenHashMap(size, 0.5f);
ret.defaultReturnValue(0);
for (int i = 0; i < size; i++) {
ret.put(array.get(2 * i), (int) array.get(2 * i + 1));
long key = mappedFile.getAtIndex(ValueLayout.JAVA_LONG, 2 * i);
long val = mappedFile.getAtIndex(ValueLayout.JAVA_LONG, 2 * i + 1);
ret.put(key, (int) val);
}
return ret;

View File

@@ -0,0 +1,109 @@
<?xml version="1.0"?>
<!DOCTYPE languages [
<!ELEMENT languages (language*,resource*)>
<!ELEMENT language (keywordHash,unicodeNormalization,stemmer,sentenceDetector,rdrTagger?,ngrams*)>
<!ELEMENT resource EMPTY>
<!ATTLIST resource
id ID #REQUIRED
md5 CDATA #REQUIRED
path CDATA #REQUIRED
href CDATA #REQUIRED
>
<!ATTLIST language
isoCode ID #REQUIRED
name CDATA #REQUIRED
display (rtl|ltr) #REQUIRED
disabled (true|false) "false"
>
<!ELEMENT unicodeNormalization EMPTY>
<!ATTLIST unicodeNormalization
algorithm (minimal|e-accents|german|maximal-latin) #REQUIRED
>
<!ELEMENT stemmer (pospattern?)>
<!ATTLIST stemmer
algorithm (porter|snowball|none) #REQUIRED
variant CDATA #IMPLIED
>
<!ELEMENT keywordHash (#PCDATA)>
<!ATTLIST keywordHash
algorithm (asciish|utf8) #REQUIRED
>
<!ELEMENT rdrTagger EMPTY>
<!ATTLIST rdrTagger
dictId IDREF #REQUIRED
rdrId IDREF #REQUIRED
>
<!ELEMENT ngrams (pospattern*)>
<!ATTLIST ngrams type (noun|name|subject-suffix|title|keyword) #REQUIRED>
<!ELEMENT pospattern (#PCDATA)>
<!ELEMENT sentenceDetector EMPTY>
<!ATTLIST sentenceDetector algorithm (none|opennlp) #REQUIRED>
]>
<languages>
<language isoCode="en" name="English" display="ltr">
<keywordHash algorithm="asciish" />
<stemmer algorithm="porter">
<pospattern>!(IN TO CC DT)</pospattern>
</stemmer>
<sentenceDetector algorithm="opennlp"/>
<unicodeNormalization algorithm="maximal-latin" />
<rdrTagger dictId="pos-dict-en" rdrId="pos-rdr-en" />
<ngrams type="name">
<pospattern>NNP*</pospattern>
<pospattern>NNP* NNP*</pospattern>
<pospattern>NNP* (NNP* IN DT CC) NNP*</pospattern>
<pospattern>NNP* (NNP* IN DT CC) (NNP* IN DT CC) NNP*</pospattern>
</ngrams>
<ngrams type="noun">
<pospattern>VBG</pospattern>
<pospattern>RB VBG</pospattern>
<pospattern>(NNP* JJ)</pospattern>
<pospattern>(NN* JJ) NN*</pospattern>
<pospattern>(NN* JJ) (NN* JJ) NN*</pospattern>
<pospattern>(NN* JJ) (NN* JJ) (NN* JJ) NN*</pospattern>
<pospattern>(NNP* JJ) (NNP* IN TO CC) NNP*</pospattern>
<pospattern>(NNP* JJ) (NNP* IN TO CC) DT NNP*</pospattern>
<pospattern>(NNP* JJ) (NNP* IN TO CC) (NNP* IN TO CC) NNP*</pospattern>
</ngrams>
<ngrams type="subject-suffix">
<pospattern>(VBD VBZ)</pospattern>
<pospattern>MD VB</pospattern>
<pospattern>VBZ DT</pospattern>
<pospattern>(DT RB VBD VBP VBN JJ*) (VBD VBG VBP VBN VBZ)</pospattern>
</ngrams>
<ngrams type="title">
<pospattern>!(CC IN DT TO)</pospattern>
<pospattern>!CC !(IN DT TO)</pospattern>
<pospattern>!CC * !(IN DT TO)</pospattern>
<pospattern>!CC * * !(IN DT TO)</pospattern>
</ngrams>
<ngrams type="keyword">
<!-- length = 1 -->
<pospattern>(N* VBG VBN JJ* R* VBG)</pospattern>
<!-- length = 2 -->
<pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
<pospattern>(N* VBG VBN) CD</pospattern>
<!-- length = 3 -->
<pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
<pospattern>NNP* (IN TO CC NNP*) (N* VBG VBN)</pospattern>
<pospattern>(N* VBG VBN) (N* VBG VBN) CD</pospattern>
<!-- length = 4 -->
<pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
<pospattern>NNP* (DT IN TO CC) (IN TO CC) NNP*</pospattern>
</ngrams>
</language>
<resource id="pos-dict-en" md5="" path="rdr/English.DICT" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.DICT" />
<resource id="pos-rdr-en" md5="" path="rdr/English.RDR" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.RDR" />
</languages>

View File

@@ -0,0 +1,135 @@
<?xml version="1.0"?>
<!DOCTYPE languages [
<!ELEMENT languages (language*,resource*)>
<!ELEMENT language (keywordHash,unicodeNormalization,stemmer,sentenceDetector,rdrTagger?,ngrams*)>
<!ELEMENT resource EMPTY>
<!ATTLIST resource
id ID #REQUIRED
md5 CDATA #REQUIRED
path CDATA #REQUIRED
href CDATA #REQUIRED
>
<!ATTLIST language
isoCode ID #REQUIRED
name CDATA #REQUIRED
display (rtl|ltr) #REQUIRED
disabled (true|false) "false"
>
<!ELEMENT unicodeNormalization EMPTY>
<!ATTLIST unicodeNormalization
algorithm (minimal|e-accents|german|maximal-latin) #REQUIRED
>
<!ELEMENT stemmer (pospattern?)>
<!ATTLIST stemmer
algorithm (porter|snowball|none) #REQUIRED
variant CDATA #IMPLIED
>
<!ELEMENT keywordHash (#PCDATA)>
<!ATTLIST keywordHash
algorithm (asciish|utf8) #REQUIRED
>
<!ELEMENT rdrTagger EMPTY>
<!ATTLIST rdrTagger
dictId IDREF #REQUIRED
rdrId IDREF #REQUIRED
>
<!ELEMENT ngrams (pospattern*)>
<!ATTLIST ngrams type (noun|name|subject-suffix|title|keyword) #REQUIRED>
<!ELEMENT pospattern (#PCDATA)>
<!ELEMENT sentenceDetector EMPTY>
<!ATTLIST sentenceDetector algorithm (none|opennlp) #REQUIRED>
]>
<languages>
<language isoCode="en" name="English" display="ltr">
<keywordHash algorithm="asciish" />
<stemmer algorithm="porter">
<pospattern>!(IN TO CC DT)</pospattern>
</stemmer>
<sentenceDetector algorithm="opennlp"/>
<unicodeNormalization algorithm="maximal-latin" />
<rdrTagger dictId="pos-dict-en" rdrId="pos-rdr-en" />
<ngrams type="name">
<pospattern>NNP*</pospattern>
<pospattern>NNP* NNP*</pospattern>
<pospattern>NNP* (NNP* IN DT CC) NNP*</pospattern>
<pospattern>NNP* (NNP* IN DT CC) (NNP* IN DT CC) NNP*</pospattern>
</ngrams>
<ngrams type="noun">
<pospattern>VBG</pospattern>
<pospattern>RB VBG</pospattern>
<pospattern>(NNP* JJ)</pospattern>
<pospattern>(NN* JJ) NN*</pospattern>
<pospattern>(NN* JJ) (NN* JJ) NN*</pospattern>
<pospattern>(NN* JJ) (NN* JJ) (NN* JJ) NN*</pospattern>
<pospattern>(NNP* JJ) (NNP* IN TO CC) NNP*</pospattern>
<pospattern>(NNP* JJ) (NNP* IN TO CC) DT NNP*</pospattern>
<pospattern>(NNP* JJ) (NNP* IN TO CC) (NNP* IN TO CC) NNP*</pospattern>
</ngrams>
<ngrams type="subject-suffix">
<pospattern>(VBD VBZ)</pospattern>
<pospattern>MD VB</pospattern>
<pospattern>VBZ DT</pospattern>
<pospattern>(DT RB VBD VBP VBN JJ*) (VBD VBG VBP VBN VBZ)</pospattern>
</ngrams>
<ngrams type="title">
<pospattern>!(CC IN DT TO)</pospattern>
<pospattern>!CC !(IN DT TO)</pospattern>
<pospattern>!CC * !(IN DT TO)</pospattern>
<pospattern>!CC * * !(IN DT TO)</pospattern>
</ngrams>
<ngrams type="keyword">
<!-- length = 1 -->
<pospattern>(N* VBG VBN JJ* R* VBG)</pospattern>
<!-- length = 2 -->
<pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
<pospattern>(N* VBG VBN) CD</pospattern>
<!-- length = 3 -->
<pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
<pospattern>NNP* (IN TO CC NNP*) (N* VBG VBN)</pospattern>
<pospattern>(N* VBG VBN) (N* VBG VBN) CD</pospattern>
<!-- length = 4 -->
<pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
<pospattern>NNP* (DT IN TO CC) (IN TO CC) NNP*</pospattern>
</ngrams>
</language>
<language isoCode="sv" name="Swedish" display="ltr">
<keywordHash algorithm="asciish" />
<stemmer algorithm="snowball" variant="SWEDISH" />
<sentenceDetector algorithm="opennlp"/>
<unicodeNormalization algorithm="e-accents" />
<rdrTagger dictId="pos-dict-sv" rdrId="pos-rdr-sv" />
<ngrams type="name">
<pospattern>PROPN</pospattern>
<pospattern>PROPN PROPN</pospattern>
<pospattern>PROPN PROPN PROPN</pospattern>
<pospattern>PROPN PROPN PROPN PROPN</pospattern>
</ngrams>
</language>
<language isoCode="fr" name="French" display="ltr">
<keywordHash algorithm="asciish" />
<stemmer algorithm="snowball" variant="FRENCH" />
<sentenceDetector algorithm="opennlp"/>
</language>
<language isoCode="de" name="German" display="ltr">
    <keywordHash algorithm="asciish" />
    <!-- Use the German Snowball variant; this previously said FRENCH,
         which applied French stemming rules to German text -->
    <stemmer algorithm="snowball" variant="GERMAN" />
    <sentenceDetector algorithm="opennlp"/>
    <unicodeNormalization algorithm="german" />
</language>
<resource id="pos-dict-en" md5="" path="rdr/English.DICT" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.DICT" />
<resource id="pos-rdr-en" md5="" path="rdr/English.RDR" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.RDR" />
<resource id="pos-dict-sv" md5="" path="rdr/Swedish.DICT" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/ud-treebanks-v2.4/UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu.UPOS.DICT" />
<resource id="pos-rdr-sv" md5="" path="rdr/Swedish.RDR" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/ud-treebanks-v2.4/UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu.UPOS.RDR" />
</languages>

View File

@@ -0,0 +1,229 @@
@import nu.marginalia.language.model.WordRep
@import nu.marginalia.language.model.DocumentSentence
@import nu.marginalia.language.model.LanguageDefinition
@import java.util.*
@import java.util.stream.IntStream
@param String textSample
@param LanguageDefinition language
@param List<DocumentSentence> sentences
@param Map<Long, String> tagColors
@param Collection<WordRep> tfIdfReps
@param Collection<WordRep> titleReps
@param Collection<WordRep> nameLikeReps
@param Collection<WordRep> subjectLikeReps
@param Collection<String> artifacts
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>NLP Debug Tool</title>
<script src="https://cdn.tailwindcss.com"></script>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
<style>
.sentence-boundary {
border-left: 3px solid #3b82f6;
}
ruby rt {
font-size: 0.65em;
color: #6b7280;
}
</style>
</head>
<body class="bg-gray-50 min-h-screen">
<div class="container mx-auto px-4 py-8 max-w-6xl">
<!-- Header -->
<div class="mb-8">
<h1 class="text-3xl font-bold text-gray-900 mb-2">
<i class="fas fa-microscope text-blue-600 mr-3"></i>
Language Processing Debug Tool
</h1>
<p class="text-gray-600">Inspect and debug text processing pipeline components</p>
</div>
<!-- Input Section -->
<div class="bg-white rounded-lg shadow-sm border border-gray-200 mb-6">
<form method="post">
<div class="p-4 border-b border-gray-200">
<h2 class="text-lg font-semibold text-gray-900 mb-3">
<i class="fas fa-edit text-green-600 mr-2"></i>
Input Text
</h2>
<textarea name="textSample"
class="w-full p-4 border border-gray-300 rounded-md focus:ring-2 focus:ring-blue-500 focus:border-blue-500 resize-none"
rows="4"
placeholder="Enter your text here to analyze...">${textSample}</textarea>
<div class="flex justify-between items-center mt-3">
<button class="px-4 py-2 bg-blue-600 text-white rounded-md hover:bg-blue-700 transition-colors">
<i class="fas fa-cog mr-2"></i>Analyze
</button>
</div>
</div>
</form>
</div>
<!-- Results Grid -->
<div class="space-y-6">
<!-- Sentence Breakdown with POS Tags -->
<div class="bg-white rounded-lg shadow-sm border border-gray-200">
<div class="p-4 border-b border-gray-200">
<h2 class="text-lg font-semibold text-gray-900">
<i class="fas fa-list-ol text-purple-600 mr-2"></i>
Sentence Breakdown &amp; POS Tags
</h2>
@if (language != null)
<div class="text-sm text-gray-500 mt-1">Auto-detected: ${language.displayName()} (${language.isoCode()})</div>
@endif
</div>
@if (sentences != null)
@for (DocumentSentence sentence : sentences)
<div class="p-4 space-y-4">
<div class="sentence-boundary pl-4 py-4 rounded">
@for (int pos : IntStream.range(0, sentence.length()).toArray())
<ruby class="p-4">
@if (language.hasPosParsing())
<span class="text-xl font-serif ${tagColors.get(sentence.posTags[pos])}">
${sentence.wordsLowerCase[pos]}
</span>
<rt>
${language.decodePosTagName(sentence.posTags[pos])}
@if (sentence.isAllCaps(pos))
<i class="fa-solid fa-angles-up"></i>
@elseif (sentence.isCapitalized(pos))
<i class="fa-solid fa-arrow-up"></i>
@endif
</rt>
@else <!-- pos tags disabled -->
<span class="text-xl font-serif">
${sentence.wordsLowerCase[pos]}
</span>
<rt>
@if (sentence.isAllCaps(pos))
<i class="fa-solid fa-angles-up"></i>
@elseif (sentence.isCapitalized(pos))
<i class="fa-solid fa-arrow-up"></i>
@endif
</rt>
@endif
</ruby>
@if (sentence.isSeparatorComma(pos))
<i class="fa-regular fa-circle"></i>
@endif
@endfor
</div>
</div>
@endfor
@endif
</div>
<!-- Keywords & N-grams -->
<div class="bg-white rounded-lg shadow-sm border border-gray-200">
<div class="p-4 border-b border-gray-200">
<h2 class="text-lg font-semibold text-gray-900">
<i class="fas fa-key text-indigo-600 mr-2"></i>
Keywords &amp; N-grams
</h2>
</div>
<div class="p-4">
<div class="grid grid-cols-1 md:grid-cols-3 gap-6">
<!-- Keywords -->
@if (tfIdfReps != null && !tfIdfReps.isEmpty())
<div>
<h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
<i class="fas fa-star text-yellow-500 mr-2"></i>
Keywords (TF-IDF)
</h3>
<div class="space-y-2">
@for (WordRep rep : tfIdfReps)
<div class="flex justify-between items-center p-2 bg-gray-50 rounded">
<span class="text-sm font-medium">${rep.word}</span>
@if (rep.length > 1)
<span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">${rep.length}</span>
@endif
</div>
@endfor
</div>
</div>
@endif
@if (nameLikeReps != null && !nameLikeReps.isEmpty())
<div>
<h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
<i class="fas fa-star text-yellow-500 mr-2"></i>
Name-Like
</h3>
<div class="space-y-2">
@for (WordRep rep : nameLikeReps)
<div class="flex justify-between items-center p-2 bg-gray-50 rounded">
<span class="text-sm font-medium">${rep.word}</span>
@if (rep.length > 1)
<span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">${rep.length}</span>
@endif
</div>
@endfor
</div>
</div>
@endif
<!-- Subject-Like: rendered only when subjectLikeReps is non-null and non-empty.
     One row per WordRep showing rep.word; the badge with rep.length is only
     emitted when rep.length is greater than 1. -->
@if (subjectLikeReps != null && !subjectLikeReps.isEmpty())
<div>
<h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
<i class="fas fa-star text-yellow-500 mr-2"></i>
Subject-Like
</h3>
<div class="space-y-2">
@for (WordRep rep : subjectLikeReps)
<div class="flex justify-between items-center p-2 bg-gray-50 rounded">
<span class="text-sm font-medium">${rep.word}</span>
@if (rep.length > 1)
<span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">${rep.length}</span>
@endif
</div>
@endfor
</div>
</div>
@endif
<!-- Title: rendered only when titleReps is non-null and non-empty.
     One row per WordRep showing rep.word; the badge with rep.length is only
     emitted when rep.length is greater than 1. -->
@if (titleReps != null && !titleReps.isEmpty())
<div>
<h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
<i class="fas fa-star text-yellow-500 mr-2"></i>
Title
</h3>
<div class="space-y-2">
@for (WordRep rep : titleReps)
<div class="flex justify-between items-center p-2 bg-gray-50 rounded">
<span class="text-sm font-medium">${rep.word}</span>
@if (rep.length > 1)
<span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">${rep.length}</span>
@endif
</div>
@endfor
</div>
</div>
@endif
<!-- Artifacts: rendered only when artifacts is non-null and non-empty.
     One row per String in the list; unlike the WordRep sections there is no
     length badge. The heading previously read "Title", duplicating the label
     of the titleReps section. -->
@if (artifacts != null && !artifacts.isEmpty())
    <div>
        <h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
            <i class="fas fa-star text-yellow-500 mr-2"></i>
            Artifacts
        </h3>
        <div class="space-y-2">
            @for (String word : artifacts)
                <div class="flex justify-between items-center p-2 bg-gray-50 rounded">
                    <span class="text-sm font-medium">${word}</span>
                </div>
            @endfor
        </div>
    </div>
@endif
</div>
</div>
</div>
</div>
</div>
</body>
</html>

View File

@@ -1,19 +1,23 @@
package nu.marginalia.keyword;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.dom.DomPruningFilter;
import nu.marginalia.language.config.LanguageConfigLocation;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.UnsupportedLanguageException;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
@@ -23,10 +27,16 @@ import java.util.Set;
class DocumentKeywordExtractorTest {
static DocumentKeywordExtractor extractor = new DocumentKeywordExtractor();
static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
static SentenceExtractor se;
@BeforeAll
public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
se = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental()), WmsaHome.getLanguageModels());
}
@Test
public void testKeyboards2() throws IOException, URISyntaxException {
public void testKeyboards2() throws IOException, URISyntaxException, UnsupportedLanguageException {
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
"Could not load word frequency table");
String html = new String(resource.readAllBytes(), Charset.defaultCharset());
@@ -44,7 +54,7 @@ class DocumentKeywordExtractorTest {
@Test
public void testMadonna() throws IOException, URISyntaxException {
public void testMadonna() throws IOException, URISyntaxException, UnsupportedLanguageException {
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/madonna.html"),
"Could not load word frequency table");
String html = new String(resource.readAllBytes(), Charset.defaultCharset());
@@ -56,19 +66,19 @@ class DocumentKeywordExtractorTest {
new LinkTexts(), new EdgeUrl("https://encyclopedia.marginalia.nu/article/Don't_Tell_Me_(Madonna_song)")
);
var keywordsBuilt = keywords.build(ByteBuffer.allocate(1024));
var keywordsBuilt = keywords.build();
Map<String, Byte> flags = new HashMap<>();
Map<String, CodedSequence> positions = new HashMap<>();
for (int i = 0; i < keywordsBuilt.size(); i++) {
String keyword = keywordsBuilt.keywords.get(i);
byte metadata = keywordsBuilt.metadata[i]
String keyword = keywordsBuilt.keywords().get(i);
byte metadata = keywordsBuilt.metadata()[i]
;
if (Set.of("dirty", "blues").contains(keyword)) {
flags.put(keyword, metadata);
positions.put(keyword, keywordsBuilt.positions.get(i));
positions.put(keyword, keywordsBuilt.positions().get(i));
}
}
@@ -81,17 +91,4 @@ class DocumentKeywordExtractorTest {
);
}
@Test
public void testSpam() throws IOException, URISyntaxException {
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/spam.html"),
"Could not load word frequency table");
String html = new String(resource.readAllBytes(), Charset.defaultCharset());
var doc = Jsoup.parse(html);
doc.filter(new DomPruningFilter(0.5));
DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(
new TermFrequencyDict(WmsaHome.getLanguageModels()));
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
}
}

View File

@@ -5,14 +5,23 @@ import gnu.trove.list.array.TIntArrayList;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.WmsaHome;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.keyword.model.DocumentWordSpan;
import nu.marginalia.language.config.LanguageConfigLocation;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.sentence.tag.HtmlTag;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.List;
@@ -20,8 +29,21 @@ import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
class DocumentPositionMapperTest {
private final DocumentPositionMapper positionMapper = new DocumentPositionMapper();
static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
private static LanguageDefinition english;
private DocumentPositionMapper positionMapper;
static SentenceExtractor se;
@BeforeAll
public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
var config = new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental());
se = new SentenceExtractor(config, WmsaHome.getLanguageModels());
english = config.getLanguage("en");
}
@BeforeEach
public void setUp() {
positionMapper = new DocumentPositionMapper();
}
@Test
public void testWordPattern() {
@@ -43,8 +65,8 @@ class DocumentPositionMapperTest {
@Test
public void testBasic() {
DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
DocumentLanguageData dld = new DocumentLanguageData(
se.extractSentencesFromString("I am a teapot, short and stout", EnumSet.of(HtmlTag.CODE)),
DocumentLanguageData dld = new DocumentLanguageData(english,
se.extractSentencesFromString(english, "I am a teapot, short and stout", EnumSet.of(HtmlTag.CODE)),
"I am a teapot"
);
@@ -72,7 +94,7 @@ class DocumentPositionMapperTest {
public void testLinksSingleWord1Rep() {
DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
var sentences = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
var sentences = se.extractSentencesFromString(english, "Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
assertEquals(1, sentences.size());
TIntList counts = new TIntArrayList(new int[] { 1 });
@@ -93,7 +115,7 @@ class DocumentPositionMapperTest {
public void testLinksSingleWord2Reps() {
DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
var sentences = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
var sentences = se.extractSentencesFromString(english, "Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
assertEquals(1, sentences.size());
TIntList counts = new TIntArrayList(new int[] { 4 }); // This will become 2 repetitions, formula is ~ sqrt(counts)
@@ -105,7 +127,7 @@ class DocumentPositionMapperTest {
var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
assertEquals(2, linkTextSpans.size());
DocumentKeywordsBuilder.DocumentWordSpan span;
DocumentWordSpan span;
span = linkTextSpans.get(0);
assertEquals(6, span.start());
@@ -121,7 +143,7 @@ class DocumentPositionMapperTest {
public void testLinksTwoWords2Reps() {
DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
var sentences = se.extractSentencesFromString("Zelda II", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
var sentences = se.extractSentencesFromString(english, "Zelda II", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
assertEquals(1, sentences.size());
TIntList counts = new TIntArrayList(new int[] { 4 });
@@ -134,7 +156,7 @@ class DocumentPositionMapperTest {
var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
assertEquals(2, linkTextSpans.size());
DocumentKeywordsBuilder.DocumentWordSpan span;
DocumentWordSpan span;
span = linkTextSpans.get(0);
assertEquals(6, span.start());
@@ -151,8 +173,8 @@ class DocumentPositionMapperTest {
public void testLinksTwoSent1Word1Rep() {
DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
var sentences1 = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
var sentences2 = se.extractSentencesFromString("Link", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
var sentences1 = se.extractSentencesFromString(english, "Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
var sentences2 = se.extractSentencesFromString(english, "Link", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
assertEquals(1, sentences1.size());
assertEquals(1, sentences2.size());
TIntList counts = new TIntArrayList(new int[] { 1, 1 });
@@ -170,7 +192,7 @@ class DocumentPositionMapperTest {
var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
assertEquals(2, linkTextSpans.size());
DocumentKeywordsBuilder.DocumentWordSpan span;
DocumentWordSpan span;
span = linkTextSpans.get(0);
assertEquals(6, span.start());

View File

@@ -2,15 +2,22 @@ package nu.marginalia.keyword;
import nu.marginalia.LanguageModels;
import nu.marginalia.WmsaHome;
import nu.marginalia.language.config.LanguageConfigLocation;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.model.UnsupportedLanguageException;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.test.util.TestLanguageModels;
import nu.marginalia.util.TestLanguageModels;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
@@ -23,9 +30,19 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
class SentenceExtractorTest {
static final LanguageModels lm = TestLanguageModels.getLanguageModels();
static SentenceExtractor se = new SentenceExtractor(lm);
static SentenceExtractor se;
private static LanguageDefinition english;
public static void main(String... args) throws IOException, URISyntaxException {
@BeforeAll
public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
var config = new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental());
se = new SentenceExtractor(config, WmsaHome.getLanguageModels());
english = config.getLanguage("en");
}
public static void main(String... args) throws IOException, URISyntaxException, UnsupportedLanguageException {
final LanguageModels lm = TestLanguageModels.getLanguageModels();
var data = WmsaHome.getHomePath().resolve("test-data/");
@@ -58,7 +75,7 @@ class SentenceExtractorTest {
@Test
public void testACDC() {
var ret = se.extractSentence("AC/DC is a rock band.", EnumSet.noneOf(HtmlTag.class));
var ret = se.extractSentence(english, "AC/DC is a rock band.", EnumSet.noneOf(HtmlTag.class));
assertEquals("ac/dc", ret.wordsLowerCase[0]);
}

View File

@@ -0,0 +1,28 @@
package nu.marginalia.keyword.extractors;
import nu.marginalia.language.config.LanguageConfigLocation;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.util.TestLanguageModels;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
 * Verifies that {@link ArtifactKeywords} reports the pieces of an e-mail
 * address that appears inside ordinary text.
 */
class ArtifactKeywordsTest {

    /**
     * Runs a sentence containing {@code <vlofgren@marginalia.nu>} through a
     * {@link SentenceExtractor} built from the experimental language
     * configuration, then expects the local part, the domain, the
     * {@code @domain} form and the full address among the extracted words.
     */
    @Test
    public void testExtractArtifacts() throws IOException, ParserConfigurationException, SAXException {
        var languageConfiguration = new LanguageConfiguration(
                TestLanguageModels.getLanguageModels(),
                new LanguageConfigLocation.Experimental());
        var sentenceExtractor = new SentenceExtractor(
                languageConfiguration,
                TestLanguageModels.getLanguageModels());

        var extracted = sentenceExtractor.extractSentences(
                "Hello I'm <vlofgren@marginalia.nu>, what's up?",
                "hello!");
        var artifacts = new ArtifactKeywords(extracted);

        // Dump the word set to stdout to ease debugging of a failing run
        System.out.println(artifacts.getWords());

        String[] expectedWords = {
                "vlofgren",
                "marginalia.nu",
                "@marginalia.nu",
                "vlofgren@marginalia.nu"
        };
        for (String expectedWord : expectedWords) {
            assertTrue(artifacts.getWords().contains(expectedWord));
        }
    }
}

View File

@@ -2,13 +2,18 @@ package nu.marginalia.keyword.extractors;
import com.google.common.collect.Sets;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.dom.DomPruningFilter;
import nu.marginalia.language.config.LanguageConfigLocation;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.model.UnsupportedLanguageException;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.test.util.TestLanguageModels;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Collections;
@@ -44,37 +49,43 @@ class NameLikeKeywordsTest {
later known as Augustus, rose to sole power after defeating his opponents in the last civil war of
the Roman Republic. Octavian set about solidifying his power, and the era of the Roman Empire began.
""";
static SentenceExtractor se;
static LanguageConfiguration lc;
static LanguageDefinition en;
@BeforeAll
public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
se = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental()), WmsaHome.getLanguageModels());
lc = new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental());
en = lc.getLanguage("en");
}
@Test
public void test() {
SentenceExtractor se = new SentenceExtractor(TestLanguageModels.getLanguageModels());
NameLikeKeywords keywords = new NameLikeKeywords(new KeywordExtractor(), se.extractSentences(text, "Julius Caesar"), 2);
NameLikeKeywords keywords = new NameLikeKeywords(se.extractSentences(text, "Julius Caesar"), 2);
Set<String> actual = keywords.getReps().stream().map(rep -> rep.word).collect(Collectors.toSet());
Set<String> expected = Set.of("caesar", "senate", "roman", "republic", "roman_republic");
// rome isn't counted because PorterStemmer is derp
System.out.println(actual);
System.out.println(expected);
assertEquals(Collections.emptySet(), Sets.symmetricDifference(actual, expected));
}
@Test
public void testWikiArticle() throws IOException {
public void testWikiArticle() throws IOException, UnsupportedLanguageException {
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/java.html"),
"Could not load word frequency table");
String html = new String(resource.readAllBytes(), Charset.defaultCharset());
var doc = Jsoup.parse(html);
doc.filter(new DomPruningFilter(0));
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
var ke = new KeywordExtractor();
var nameWords = new NameLikeKeywords(ke, se.extractSentences(doc), 2);
var nameWords = new NameLikeKeywords(se.extractSentences(doc), 2);
System.out.println("Names: " + nameWords.words());
}
@Test
public void testWikiArticleP1() {
public void testWikiArticleP1() throws UnsupportedLanguageException {
String html = """
<p><b>Java</b> is a high-level, class-based, object-oriented programming language that is designed to have as few implementation dependencies as possible. It is a general-purpose programming language intended to let programmers <i>write once, run anywhere</i> (WORA), meaning that compiled Java code can run on all platforms that support Java without the need to recompile. Java applications are typically compiled to bytecode that can run on any Java virtual machine (JVM) regardless of the underlying computer architecture. The syntax of Java is similar to C and C++, but has fewer low-level facilities than either of them. The Java runtime provides dynamic capabilities (such as reflection and runtime code modification) that are typically not available in traditional compiled languages. As of 2019 , Java was one of the most popular programming languages in use according to GitHub, particularly for clientserver web applications, with a reported 9 million developers.</p>
<p>Java was originally developed by James Gosling at Sun Microsystems. It was released in May 1995 as a core component of Sun Microsystems' Java platform. The original and reference implementation Java compilers, virtual machines, and class libraries were originally released by Sun under proprietary licenses. As of May 2007, in compliance with the specifications of the Java Community Process, Sun had relicensed most of its Java technologies under the GPL-2.0-only license. Oracle offers its own HotSpot Java Virtual Machine, however the official reference implementation is the OpenJDK JVM which is free open-source software and used by most developers and is the default JVM for almost all Linux distributions.</p>
@@ -82,11 +93,7 @@ class NameLikeKeywordsTest {
var doc = Jsoup.parse(html);
doc.filter(new DomPruningFilter(0));
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
var ke = new KeywordExtractor();
var nameWords = new NameLikeKeywords(ke, se.extractSentences(doc), 2);
var nameWords = new NameLikeKeywords(se.extractSentences(doc), 2);
System.out.println("Names: " + nameWords.words());
}
}

View File

@@ -1,12 +1,17 @@
package nu.marginalia.keyword.extractors;
import com.google.common.collect.Sets;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.WmsaHome;
import nu.marginalia.language.config.LanguageConfigLocation;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.test.util.TestLanguageModels;
import nu.marginalia.util.TestLanguageModels;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.util.Collections;
import java.util.Set;
@@ -41,21 +46,28 @@ class SubjectLikeKeywordsTest {
the Roman Republic. Octavian set about solidifying his power, and the era of the Roman Empire began.
""";
static SentenceExtractor se;
@BeforeAll
public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
se = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental()), WmsaHome.getLanguageModels());
}
@Test
public void test() throws IOException {
var lm = TestLanguageModels.getLanguageModels();
var dict = new TermFrequencyDict(lm);
SentenceExtractor se = new SentenceExtractor(lm);
var dld = se.extractSentences(text, "Julius Caesar");
WordsTfIdfCounts tfIdfCounts = new WordsTfIdfCounts(dict, new KeywordExtractor(), dld);
SubjectLikeKeywords keywords = new SubjectLikeKeywords(new KeywordExtractor(),
tfIdfCounts,
dld);
WordsTfIdfCounts tfIdfCounts = new WordsTfIdfCounts(dict, dld);
SubjectLikeKeywords keywords = new SubjectLikeKeywords(tfIdfCounts, dld);
Set<String> actual = keywords.getReps().stream().map(rep -> rep.word).collect(Collectors.toSet());
Set<String> expected = Set.of("republic", "authoritarian_reforms", "political_alliance_that", "power_as_populares", "caesar", "reforms", "populares", "senate", "sole_power_after", "pompey", "civil_wars", "wars", "governmental_reforms", "government_of_the_republic");
Set<String> expected = Set.of("populares", "republic", "authoritarian_reforms", "senate", "pompey", "civil_wars", "octavian", "caesar");
System.out.println(actual);
System.out.println(expected);
assertEquals(Collections.emptySet(), Sets.symmetricDifference(actual, expected));
}

View File

@@ -1,13 +1,18 @@
package nu.marginalia.keyword.extractors;
import com.google.common.collect.Sets;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.config.LanguageConfigLocation;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.UnsupportedLanguageException;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.test.util.TestLanguageModels;
import nu.marginalia.util.TestLanguageModels;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.util.Collections;
import java.util.Set;
import java.util.stream.Collectors;
@@ -187,12 +192,13 @@ class TitleKeywordsTest {
""";
@Test
public void extractTitleWords() {
var se = new SentenceExtractor(TestLanguageModels.getLanguageModels());
public void extractTitleWords() throws IOException, ParserConfigurationException, SAXException, UnsupportedLanguageException {
var languageConfiguration = new LanguageConfiguration(TestLanguageModels.getLanguageModels(), new LanguageConfigLocation.Experimental());
var se = new SentenceExtractor(languageConfiguration, TestLanguageModels.getLanguageModels());
var dld = se.extractSentences(Jsoup.parse(document));
var reps = new TitleKeywords(new KeywordExtractor(), dld).getReps();
var reps = new TitleKeywords(dld).getReps();
var words = reps.stream().map(rep -> rep.word).collect(Collectors.toSet());
Set<String> expected = Set.of(

View File

@@ -0,0 +1,78 @@
package nu.marginalia.language.config;
import it.unimi.dsi.fastutil.longs.LongList;
import nu.marginalia.language.filter.TestLanguageModels;
import nu.marginalia.language.pos.PosPattern;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import static org.junit.jupiter.api.Assertions.assertEquals;
/**
 * Exercises {@link LanguageConfiguration} loaded from the experimental
 * configuration location: language lookup, stemmers, POS taggers and
 * {@link PosPattern} compilation.
 */
public class LanguageConfigurationTestFile {
    private static LanguageConfiguration languageConfiguration;

    /** Loads the shared configuration once for all tests in this class. */
    @BeforeAll
    public static void setUpAll() throws IOException, SAXException, ParserConfigurationException {
        var models = TestLanguageModels.getLanguageModels();
        var location = new LanguageConfigLocation.Experimental();

        languageConfiguration = new LanguageConfiguration(models, location);
    }

    /** "en", "sv" and "xx" must resolve to a language; "!!" must yield null. */
    @Test
    void testBasic() {
        for (String isoCode : new String[] { "en", "sv", "xx" }) {
            Assertions.assertNotNull(languageConfiguration.getLanguage(isoCode));
        }

        Assertions.assertNull(languageConfiguration.getLanguage("!!"));
    }

    /**
     * Each language's stemmer must reduce a word of its own language and
     * leave the other language's word untouched.
     */
    @Test
    public void testStemming() {
        var swedish = languageConfiguration.getLanguage("sv").stemmer();
        var english = languageConfiguration.getLanguage("en").stemmer();

        Assertions.assertNotNull(swedish);
        Assertions.assertNotNull(english);

        Assertions.assertEquals("bil", swedish.stem("bilar"));
        Assertions.assertEquals("dogged", swedish.stem("dogged"));

        Assertions.assertEquals("bilar", english.stem("bilar"));
        Assertions.assertEquals("dog", english.stem("dogged"));
    }

    /** Both languages must have a POS tagger, and their tag dictionaries must differ. */
    @Test
    public void testPosData() {
        var swedishTagger = languageConfiguration.getLanguage("sv").posTagger();
        var englishTagger = languageConfiguration.getLanguage("en").posTagger();

        Assertions.assertNotNull(swedishTagger);
        Assertions.assertNotNull(englishTagger);

        System.out.println(englishTagger);
        System.out.println(swedishTagger);

        Assertions.assertNotEquals(swedishTagger.tagDict, englishTagger.tagDict);
    }

    /**
     * Dumps a handful of compiled patterns and checks the relationships
     * between wildcard, grouped, unknown and negated patterns.
     */
    @Test
    public void testPosPattern() {
        var tagger = languageConfiguration.getLanguage("en").posTagger();

        // NOTE(review): "NNP" is dumped twice; likely redundant -- confirm before removing
        String[] dumpedPatterns = {
                "NNP",
                "NNP",
                "NNP NNPS",
                "NNPS (NNPS DT) DT",
                "(NNP NNPS) (NNP NNPS IN DT CC) (NNP NNPS IN DT CC) (NNP NNPS)"
        };
        for (String expression : dumpedPatterns) {
            System.out.println(new PosPattern(tagger, expression).pattern);
        }

        // A trailing '*' must compile to the same pattern as the explicit group
        var wildcard = new PosPattern(tagger, "NNP*").pattern;
        var explicitGroup = new PosPattern(tagger, "(NNP NNPS)").pattern;
        Assertions.assertEquals(wildcard, explicitGroup);

        // A tag name the tagger does not know compiles to a single zero entry
        var unknownTag = new PosPattern(tagger, "Hello").pattern;
        Assertions.assertEquals(LongList.of(0L), unknownTag);

        // A group and its negation share no bits
        var groupBits = new PosPattern(tagger, "(NNP NNPS)").pattern.getFirst();
        var negatedBits = new PosPattern(tagger, "!(NNP NNPS)").pattern.getFirst();
        Assertions.assertEquals(0, (groupBits & negatedBits));

        // Removing the negated group's bits from the match-all pattern leaves the group itself
        var expectedGroup = new PosPattern(tagger, "(NNP NNPS)").pattern.getFirst().longValue();
        var everything = new PosPattern(tagger, "*").pattern.getFirst();
        var everythingButGroup = new PosPattern(tagger, "!(NNP NNPS)").pattern.getFirst();
        Assertions.assertEquals(expectedGroup, everything ^ everythingButGroup);
    }
}

View File

@@ -0,0 +1,41 @@
package nu.marginalia.language.encoding;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertSame;
/**
 * Tests for the {@link UnicodeNormalization.FlattenAllLatin} normalization:
 * ASCII passes through untouched, accented Latin characters are flattened,
 * and characters with no flattened form are dropped.
 */
class UnicodeNormalizationTest {
    UnicodeNormalization unicodeNormalization = new UnicodeNormalization.FlattenAllLatin();

    /** Pure ASCII input must come back as the very same String instance, i.e. without a copy. */
    @Test
    void flattenUnicodePlainAscii() {
        String asciiInput = "abc";

        String flattened = unicodeNormalization.flattenUnicode(asciiInput);

        assertSame(asciiInput, flattened);
    }

    /** An umlaut loses its diacritic and sharp s expands to "ss". */
    @Test
    void flattenUnicode() {
        assertEquals("Stulpnagelstrasse",
                unicodeNormalization.flattenUnicode("Stülpnagelstraße"));
    }

    /** A caron-accented letter is replaced with its base letter. */
    @Test
    void flattenUnicode2() {
        assertEquals("Koncevicius",
                unicodeNormalization.flattenUnicode("Koncevičius"));
    }

    /** Characters that cannot be flattened are removed; the ASCII brackets remain. */
    @Test
    void omitNonFlattenable() {
        assertEquals("[]",
                unicodeNormalization.flattenUnicode("[アグレッシブ烈子]"));
    }
}

Some files were not shown because too many files have changed in this diff Show More