Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
Synced 2025-10-05 21:22:39 +02:00

Compare commits: v24.10.0...deploy-022 (599 commits)
.github/FUNDING.yml (vendored, 1 line changed)
@@ -1,5 +1,6 @@
 # These are supported funding model platforms
 
+polar: marginalia-search
 github: MarginaliaSearch
 patreon: marginalia_nu
 open_collective: # Replace with a single Open Collective username
.gitignore (vendored, 1 line changed)
@@ -7,3 +7,4 @@ build/
 lombok.config
 Dockerfile
 run
+jte-classes
ROADMAP.md (83 lines changed)
@@ -1,4 +1,4 @@
-# Roadmap 2024-2025
+# Roadmap 2025
 
 This is a roadmap with major features planned for Marginalia Search.
 
@@ -8,20 +8,10 @@ be implemented as well.
 Major goals:
 
 * Reach 1 billion pages indexed
-* Improve technical ability of indexing and search. Although this area has improved a bit, the
-search engine is still not very good at dealing with longer queries.
-
-## Proper Position Index (COMPLETED 2024-09)
-
-The search engine uses a fixed width bit mask to indicate word positions. It has the benefit
-of being very fast to evaluate and works well for what it is, but is inaccurate and has the
-drawback of making support for quoted search terms inaccurate and largely reliant on indexing
-word n-grams known beforehand. This limits the ability to interpret longer queries.
-
-The positions mask should be supplemented or replaced with a more accurate (e.g.) gamma coded positions
-list, as is the civilized way of doing this.
-
-Completed with PR https://github.com/MarginaliaSearch/MarginaliaSearch/pull/99
+* Improve technical ability of indexing and search. ~~Although this area has improved a bit, the
+search engine is still not very good at dealing with longer queries.~~ (As of PR [#129](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/129), this has improved significantly. There is still more work to be done )
 
 ## Hybridize crawler w/ Common Crawl data
 
@@ -37,8 +27,7 @@ Retaining the ability to independently crawl the web is still strongly desirable
 
 ## Safe Search
 
-The search engine has a bit of a problem showing spicy content mixed in with the results. It would be desirable
-to have a way to filter this out. It's likely something like a URL blacklist (e.g. [UT1](https://dsi.ut-capitole.fr/blacklists/index_en.php) )
+The search engine has a bit of a problem showing spicy content mixed in with the results. It would be desirable to have a way to filter this out. It's likely something like a URL blacklist (e.g. [UT1](https://dsi.ut-capitole.fr/blacklists/index_en.php) )
 combined with naive bayesian filter would go a long way, or something more sophisticated...?
 
 ## Additional Language Support
 
@@ -49,21 +38,6 @@ associated with each language added, at least a models file or two, as well as s
 
 It would be very helpful to find a speaker of a large language other than English to help in the fine tuning.
 
-## Finalize RSS support
-
-Marginalia has experimental RSS preview support for a few domains. This works well and
-it should be extended to all domains. It would also be interesting to offer search of the
-RSS data itself, or use the RSS set to feed a special live index that updates faster than the
-main dataset.
-
-## Support for binary formats like PDF
-
-The crawler needs to be modified to retain them, and the conversion logic needs to parse them.
-The documents database probably should have some sort of flag indicating it's a PDF as well.
-
-PDF parsing is known to be a bit of a security liability so some thought needs to be put in
-that direction as well.
-
 ## Custom ranking logic
 
 Stract does an interesting thing where they have configurable search filters.
 
@@ -72,5 +46,50 @@ This looks like a good idea that wouldn't just help clean up the search filters
 website, but might be cheap enough we might go as far as to offer a number of ad-hoc custom search
 filter for any API consumer.
 
-I've talked to the stract dev and he does not think it's a good idea to mimic their optics language,
-which is quite ad-hoc, but instead to work together to find some new common description language for this.
+I've talked to the stract dev and he does not think it's a good idea to mimic their optics language, which is quite ad-hoc, but instead to work together to find some new common description language for this.
+
+## Show favicons next to search results
+
+This is expected from search engines. Basic proof of concept sketch of fetching this data has been done, but the feature is some way from being reality.
+
+## Specialized crawler for github
+
+One of the search engine's biggest limitations right now is that it does not index github at all. A specialized crawler that fetches at least the readme.md would go a long way toward providing search capabilities in this domain.
+
+# Completed
+
+## Support for binary formats like PDF (COMPLETED 2025-05)
+
+The crawler needs to be modified to retain them, and the conversion logic needs to parse them.
+The documents database probably should have some sort of flag indicating it's a PDF as well.
+
+PDF parsing is known to be a bit of a security liability so some thought needs to be put in
+that direction as well.
+
+## Web Design Overhaul (COMPLETED 2025-01)
+
+The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
+
+PR [#127](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/127)
+
+## Finalize RSS support (COMPLETED 2024-11)
+
+Marginalia has experimental RSS preview support for a few domains. This works well and
+it should be extended to all domains. It would also be interesting to offer search of the
+RSS data itself, or use the RSS set to feed a special live index that updates faster than the
+main dataset.
+
+Completed with PR [#122](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/122) and PR [#125](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/125)
+
+## Proper Position Index (COMPLETED 2024-09)
+
+The search engine uses a fixed width bit mask to indicate word positions. It has the benefit
+of being very fast to evaluate and works well for what it is, but is inaccurate and has the
+drawback of making support for quoted search terms inaccurate and largely reliant on indexing
+word n-grams known beforehand. This limits the ability to interpret longer queries.
+
+The positions mask should be supplemented or replaced with a more accurate (e.g.) gamma coded positions
+list, as is the civilized way of doing this.
+
+Completed with PR [#99](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/99)
build.gradle
@@ -1,12 +1,11 @@
 plugins {
     id 'java'
     id("org.jetbrains.gradle.plugin.idea-ext") version "1.0"
-    id "io.freefair.lombok" version "8.3"
     id "me.champeau.jmh" version "0.6.6"
 
     // This is a workaround for a bug in the Jib plugin that causes it to stall randomly
     // https://github.com/GoogleContainerTools/jib/issues/3347
-    id 'com.google.cloud.tools.jib' version '3.4.3' apply(false)
+    id 'com.google.cloud.tools.jib' version '3.4.5' apply(false)
 }
 
 group 'marginalia'
 
@@ -44,11 +43,11 @@ subprojects.forEach {it ->
 }
 
 ext {
-    jvmVersion=22
-    dockerImageBase='container-registry.oracle.com/graalvm/jdk:22'
+    jvmVersion = 24
+    dockerImageBase='container-registry.oracle.com/graalvm/jdk:24'
     dockerImageTag='latest'
     dockerImageRegistry='marginalia'
-    jibVersion = '3.4.3'
+    jibVersion = '3.4.5'
 }
 
 idea {
LanguageModels.java
@@ -1,17 +1,13 @@
 package nu.marginalia;
 
-import lombok.Builder;
-
 import java.nio.file.Path;
 
-@Builder
 public class LanguageModels {
     public final Path termFrequencies;
 
     public final Path openNLPSentenceDetectionData;
     public final Path posRules;
     public final Path posDict;
-    public final Path openNLPTokenData;
     public final Path fasttextLanguageModel;
     public final Path segments;
 
@@ -19,14 +15,12 @@ public class LanguageModels {
                           Path openNLPSentenceDetectionData,
                           Path posRules,
                           Path posDict,
-                          Path openNLPTokenData,
                           Path fasttextLanguageModel,
                           Path segments) {
         this.termFrequencies = termFrequencies;
         this.openNLPSentenceDetectionData = openNLPSentenceDetectionData;
         this.posRules = posRules;
         this.posDict = posDict;
-        this.openNLPTokenData = openNLPTokenData;
         this.fasttextLanguageModel = fasttextLanguageModel;
         this.segments = segments;
     }
UserAgent.java
@@ -1,3 +1,8 @@
 package nu.marginalia;
 
+/**
+ * A record representing a User Agent.
+ * @param uaString - the header value of the User Agent
+ * @param uaIdentifier - what we look for in robots.txt
+ */
 public record UserAgent(String uaString, String uaIdentifier) {}
WmsaHome.java
@@ -75,6 +75,10 @@ public class WmsaHome {
         return ret;
     }
 
+    public static Path getDataPath() {
+        return getHomePath().resolve("data");
+    }
+
     public static Path getAdsDefinition() {
         return getHomePath().resolve("data").resolve("adblock.txt");
     }
 
@@ -100,7 +104,6 @@ public class WmsaHome {
                 home.resolve("model/opennlp-sentence.bin"),
                 home.resolve("model/English.RDR"),
                 home.resolve("model/English.DICT"),
-                home.resolve("model/opennlp-tok.bin"),
                 home.resolve("model/lid.176.ftz"),
                 home.resolve("model/segments.bin")
         );
NodeConfigurationService.java
@@ -3,6 +3,7 @@ package nu.marginalia.nodecfg;
 import com.google.inject.Inject;
 import com.zaxxer.hikari.HikariDataSource;
 import nu.marginalia.nodecfg.model.NodeConfiguration;
+import nu.marginalia.nodecfg.model.NodeProfile;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -20,10 +21,10 @@ public class NodeConfigurationService {
         this.dataSource = dataSource;
     }
 
-    public NodeConfiguration create(int id, String description, boolean acceptQueries, boolean keepWarcs) throws SQLException {
+    public NodeConfiguration create(int id, String description, boolean acceptQueries, boolean keepWarcs, NodeProfile nodeProfile) throws SQLException {
         try (var conn = dataSource.getConnection();
              var is = conn.prepareStatement("""
-                     INSERT IGNORE INTO NODE_CONFIGURATION(ID, DESCRIPTION, ACCEPT_QUERIES, KEEP_WARCS) VALUES(?, ?, ?, ?)
+                     INSERT IGNORE INTO NODE_CONFIGURATION(ID, DESCRIPTION, ACCEPT_QUERIES, KEEP_WARCS, NODE_PROFILE) VALUES(?, ?, ?, ?, ?)
                      """)
         )
         {
@@ -31,6 +32,7 @@ public class NodeConfigurationService {
             is.setString(2, description);
             is.setBoolean(3, acceptQueries);
             is.setBoolean(4, keepWarcs);
+            is.setString(5, nodeProfile.name());
 
             if (is.executeUpdate() <= 0) {
                 throw new IllegalStateException("Failed to insert configuration");
@@ -43,7 +45,7 @@ public class NodeConfigurationService {
     public List<NodeConfiguration> getAll() {
         try (var conn = dataSource.getConnection();
              var qs = conn.prepareStatement("""
-                     SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, DISABLED
+                     SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, NODE_PROFILE, DISABLED
                      FROM NODE_CONFIGURATION
                      """)) {
             var rs = qs.executeQuery();
@@ -58,6 +60,7 @@ public class NodeConfigurationService {
                         rs.getBoolean("AUTO_CLEAN"),
                         rs.getBoolean("PRECESSION"),
                         rs.getBoolean("KEEP_WARCS"),
+                        NodeProfile.valueOf(rs.getString("NODE_PROFILE")),
                         rs.getBoolean("DISABLED")
                 ));
             }
@@ -72,7 +75,7 @@ public class NodeConfigurationService {
     public NodeConfiguration get(int nodeId) throws SQLException {
         try (var conn = dataSource.getConnection();
              var qs = conn.prepareStatement("""
-                     SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, DISABLED
+                     SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, NODE_PROFILE, DISABLED
                      FROM NODE_CONFIGURATION
                      WHERE ID=?
                      """)) {
@@ -86,6 +89,7 @@ public class NodeConfigurationService {
                     rs.getBoolean("AUTO_CLEAN"),
                     rs.getBoolean("PRECESSION"),
                     rs.getBoolean("KEEP_WARCS"),
+                    NodeProfile.valueOf(rs.getString("NODE_PROFILE")),
                     rs.getBoolean("DISABLED")
             );
         }
@@ -98,7 +102,7 @@ public class NodeConfigurationService {
         try (var conn = dataSource.getConnection();
             var us = conn.prepareStatement("""
                     UPDATE NODE_CONFIGURATION
-                    SET DESCRIPTION=?, ACCEPT_QUERIES=?, AUTO_CLEAN=?, PRECESSION=?, KEEP_WARCS=?, DISABLED=?
+                    SET DESCRIPTION=?, ACCEPT_QUERIES=?, AUTO_CLEAN=?, PRECESSION=?, KEEP_WARCS=?, DISABLED=?, NODE_PROFILE=?
                     WHERE ID=?
                     """))
         {
@@ -108,7 +112,8 @@ public class NodeConfigurationService {
             us.setBoolean(4, config.includeInPrecession());
             us.setBoolean(5, config.keepWarcs());
             us.setBoolean(6, config.disabled());
-            us.setInt(7, config.node());
+            us.setString(7, config.profile().name());
+            us.setInt(8, config.node());
 
             if (us.executeUpdate() <= 0)
                 throw new IllegalStateException("Failed to update configuration");
NodeConfiguration.java
@@ -6,6 +6,7 @@ public record NodeConfiguration(int node,
                                 boolean autoClean,
                                 boolean includeInPrecession,
                                 boolean keepWarcs,
+                                NodeProfile profile,
                                 boolean disabled
 )
 {
NodeProfile.java (new file)
@@ -0,0 +1,28 @@
package nu.marginalia.nodecfg.model;

public enum NodeProfile {
    BATCH_CRAWL,
    REALTIME,
    MIXED,
    SIDELOAD;

    public boolean isBatchCrawl() {
        return this == BATCH_CRAWL;
    }
    public boolean isRealtime() {
        return this == REALTIME;
    }
    public boolean isMixed() {
        return this == MIXED;
    }
    public boolean isSideload() {
        return this == SIDELOAD;
    }

    public boolean permitBatchCrawl() {
        return isBatchCrawl() || isMixed();
    }
    public boolean permitSideload() {
        return isMixed() || isSideload();
    }
}
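The permit* helpers encode which work each profile accepts: batch crawling on BATCH_CRAWL and MIXED nodes, sideloading on MIXED and SIDELOAD nodes, and neither on REALTIME nodes, which only serve queries. Combined with the NodeConfiguration record above, selecting eligible nodes becomes a short stream pipeline; a sketch using only methods shown in these diffs:

    import java.sql.SQLException;
    import java.util.List;

    class EligibleNodes {
        // Which nodes may run a batch crawl right now?
        static List<NodeConfiguration> batchCrawlNodes(NodeConfigurationService svc) throws SQLException {
            return svc.getAll().stream()
                    .filter(cfg -> !cfg.disabled())                      // skip disabled nodes
                    .filter(cfg -> cfg.profile().permitBatchCrawl())     // BATCH_CRAWL or MIXED
                    .toList();
        }
    }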
NodeConfigurationServiceTest.java
@@ -2,6 +2,7 @@ package nu.marginalia.nodecfg;
 
 import com.zaxxer.hikari.HikariConfig;
 import com.zaxxer.hikari.HikariDataSource;
+import nu.marginalia.nodecfg.model.NodeProfile;
 import nu.marginalia.test.TestMigrationLoader;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Tag;
 
@@ -46,8 +47,8 @@ public class NodeConfigurationServiceTest {
 
     @Test
     public void test() throws SQLException {
-        var a = nodeConfigurationService.create(1, "Test", false, false);
-        var b = nodeConfigurationService.create(2, "Foo", true, false);
+        var a = nodeConfigurationService.create(1, "Test", false, false, NodeProfile.MIXED);
+        var b = nodeConfigurationService.create(2, "Foo", true, false, NodeProfile.MIXED);
 
         assertEquals(1, a.node());
         assertEquals("Test", a.description());
DbDomainQueries.java
@@ -7,18 +7,24 @@ import com.google.common.util.concurrent.UncheckedExecutionException;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import com.zaxxer.hikari.HikariDataSource;
-import lombok.SneakyThrows;
 import nu.marginalia.model.EdgeDomain;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.util.NoSuchElementException;
-import java.util.Optional;
-import java.util.OptionalInt;
+import java.sql.SQLException;
+import java.util.*;
 import java.util.concurrent.ExecutionException;
 
 @Singleton
 public class DbDomainQueries {
     private final HikariDataSource dataSource;
+
+    private static final Logger logger = LoggerFactory.getLogger(DbDomainQueries.class);
+
     private final Cache<EdgeDomain, Integer> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
+    private final Cache<EdgeDomain, DomainIdWithNode> domainWithNodeCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
+    private final Cache<Integer, EdgeDomain> domainNameCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
+    private final Cache<String, List<DomainWithNode>> siblingsCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
 
     @Inject
     public DbDomainQueries(HikariDataSource dataSource)
@@ -27,27 +33,61 @@ public class DbDomainQueries {
     }
 
-    @SneakyThrows
-    public Integer getDomainId(EdgeDomain domain) {
-        try (var connection = dataSource.getConnection()) {
-
+    public Integer getDomainId(EdgeDomain domain) throws NoSuchElementException {
+        try {
             return domainIdCache.get(domain, () -> {
-                try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
+                try (var connection = dataSource.getConnection();
+                     var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
+
                     stmt.setString(1, domain.toString());
                     var rsp = stmt.executeQuery();
                     if (rsp.next()) {
                         return rsp.getInt(1);
                     }
                 }
+                catch (SQLException ex) {
+                    throw new RuntimeException(ex);
+                }
+
                 throw new NoSuchElementException();
             });
         }
         catch (UncheckedExecutionException ex) {
-            throw ex.getCause();
+            throw new NoSuchElementException();
+        }
+        catch (ExecutionException ex) {
+            throw new RuntimeException(ex.getCause());
         }
     }
 
+    public DomainIdWithNode getDomainIdWithNode(EdgeDomain domain) throws NoSuchElementException {
+        try {
+            return domainWithNodeCache.get(domain, () -> {
+                try (var connection = dataSource.getConnection();
+                     var stmt = connection.prepareStatement("SELECT ID, NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
+
+                    stmt.setString(1, domain.toString());
+                    var rsp = stmt.executeQuery();
+                    if (rsp.next()) {
+                        return new DomainIdWithNode(rsp.getInt(1), rsp.getInt(2));
+                    }
+                }
+                catch (SQLException ex) {
+                    throw new RuntimeException(ex);
+                }
+
+                throw new NoSuchElementException();
+            });
+        }
+        catch (UncheckedExecutionException ex) {
+            throw new NoSuchElementException();
+        }
+        catch (ExecutionException ex) {
+            throw new RuntimeException(ex.getCause());
+        }
+    }
 
-    @SneakyThrows
     public OptionalInt tryGetDomainId(EdgeDomain domain) {
 
         Integer maybeId = domainIdCache.getIfPresent(domain);
 
@@ -70,22 +110,70 @@ public class DbDomainQueries {
             return OptionalInt.empty();
         }
         catch (UncheckedExecutionException ex) {
-            return OptionalInt.empty();
+            throw new RuntimeException(ex.getCause());
+        }
+        catch (SQLException ex) {
+            throw new RuntimeException(ex);
         }
     }
 
-    @SneakyThrows
     public Optional<EdgeDomain> getDomain(int id) {
-        try (var connection = dataSource.getConnection()) {
 
+        EdgeDomain existing = domainNameCache.getIfPresent(id);
+        if (existing != null) {
+            return Optional.of(existing);
+        }
+
+        try (var connection = dataSource.getConnection()) {
             try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) {
                 stmt.setInt(1, id);
                 var rsp = stmt.executeQuery();
                 if (rsp.next()) {
-                    return Optional.of(new EdgeDomain(rsp.getString(1)));
+                    var val = new EdgeDomain(rsp.getString(1));
+                    domainNameCache.put(id, val);
+                    return Optional.of(val);
                 }
                 return Optional.empty();
             }
         }
+        catch (SQLException ex) {
+            throw new RuntimeException(ex);
+        }
     }
 
+    public List<DomainWithNode> otherSubdomains(EdgeDomain domain, int cnt) throws ExecutionException {
+        String topDomain = domain.topDomain;
+
+        return siblingsCache.get(topDomain, () -> {
+            List<DomainWithNode> ret = new ArrayList<>();
+
+            try (var conn = dataSource.getConnection();
+                 var stmt = conn.prepareStatement("SELECT DOMAIN_NAME, NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_TOP = ? LIMIT ?")) {
+                stmt.setString(1, topDomain);
+                stmt.setInt(2, cnt);
+
+                var rs = stmt.executeQuery();
+                while (rs.next()) {
+                    var sibling = new EdgeDomain(rs.getString(1));
+
+                    if (sibling.equals(domain))
+                        continue;
+
+                    ret.add(new DomainWithNode(sibling, rs.getInt(2)));
+                }
+            } catch (SQLException e) {
+                logger.error("Failed to get domain neighbors");
+            }
+            return ret;
+        });
+    }
+
+    public record DomainWithNode (EdgeDomain domain, int nodeAffinity) {
+        public boolean isIndexed() {
+            return nodeAffinity > 0;
+        }
+    }
+
+    public record DomainIdWithNode (int domainId, int nodeAffinity) { }
 }
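The reworked getDomainId pulls the connection inside the cache loader and surfaces a missing row as NoSuchElementException instead of sneaking the checked SQLException out via Lombok. A caller sketch (obtaining the instance via Guice injection is elided):

    import java.util.NoSuchElementException;
    import nu.marginalia.model.EdgeDomain;

    class DomainLookupExample {
        int lookup(DbDomainQueries queries) {
            try {
                // Served from the 10k-entry Guava cache after the first hit
                return queries.getDomainId(new EdgeDomain("www.marginalia.nu"));
            }
            catch (NoSuchElementException e) {
                return -1; // domain not present in EC_DOMAIN
            }
        }
    }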
DbDomainStatsExportMultitool.java (file deleted)
@@ -1,118 +0,0 @@
package nu.marginalia.db;

import com.zaxxer.hikari.HikariDataSource;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.OptionalInt;

/** Class used in exporting data. This is intended to be used for a brief time
 * and then discarded, not kept around as a service.
 */
public class DbDomainStatsExportMultitool implements AutoCloseable {
    private final Connection connection;
    private final int nodeId;
    private final PreparedStatement knownUrlsQuery;
    private final PreparedStatement visitedUrlsQuery;
    private final PreparedStatement goodUrlsQuery;
    private final PreparedStatement domainNameToId;

    private final PreparedStatement allDomainsQuery;
    private final PreparedStatement crawlQueueDomains;
    private final PreparedStatement indexedDomainsQuery;

    public DbDomainStatsExportMultitool(HikariDataSource dataSource, int nodeId) throws SQLException {
        this.connection = dataSource.getConnection();
        this.nodeId = nodeId;

        knownUrlsQuery = connection.prepareStatement("""
                SELECT KNOWN_URLS
                FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
                ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                WHERE DOMAIN_NAME=?
                """);
        visitedUrlsQuery = connection.prepareStatement("""
                SELECT VISITED_URLS
                FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
                ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                WHERE DOMAIN_NAME=?
                """);
        goodUrlsQuery = connection.prepareStatement("""
                SELECT GOOD_URLS
                FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
                ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                WHERE DOMAIN_NAME=?
                """);
        domainNameToId = connection.prepareStatement("""
                SELECT ID
                FROM EC_DOMAIN
                WHERE DOMAIN_NAME=?
                """);
        allDomainsQuery = connection.prepareStatement("""
                SELECT DOMAIN_NAME
                FROM EC_DOMAIN
                """);
        crawlQueueDomains = connection.prepareStatement("""
                SELECT DOMAIN_NAME
                FROM CRAWL_QUEUE
                """);
        indexedDomainsQuery = connection.prepareStatement("""
                SELECT DOMAIN_NAME
                FROM EC_DOMAIN
                WHERE INDEXED > 0
                """);
    }

    public OptionalInt getVisitedUrls(String domainName) throws SQLException {
        return executeNameToIntQuery(domainName, visitedUrlsQuery);
    }

    public OptionalInt getDomainId(String domainName) throws SQLException {
        return executeNameToIntQuery(domainName, domainNameToId);
    }

    public List<String> getCrawlQueueDomains() throws SQLException {
        return executeListQuery(crawlQueueDomains, 100);
    }
    public List<String> getAllIndexedDomains() throws SQLException {
        return executeListQuery(indexedDomainsQuery, 100_000);
    }

    private OptionalInt executeNameToIntQuery(String domainName, PreparedStatement statement)
            throws SQLException {
        statement.setString(1, domainName);
        var rs = statement.executeQuery();

        if (rs.next()) {
            return OptionalInt.of(rs.getInt(1));
        }

        return OptionalInt.empty();
    }

    private List<String> executeListQuery(PreparedStatement statement, int sizeHint) throws SQLException {
        List<String> ret = new ArrayList<>(sizeHint);

        var rs = statement.executeQuery();

        while (rs.next()) {
            ret.add(rs.getString(1));
        }

        return ret;
    }

    @Override
    public void close() throws SQLException {
        knownUrlsQuery.close();
        goodUrlsQuery.close();
        visitedUrlsQuery.close();
        allDomainsQuery.close();
        crawlQueueDomains.close();
        domainNameToId.close();
        connection.close();
    }
}
DomainRankingSetsService.java
@@ -2,7 +2,6 @@ package nu.marginalia.db;
 
 import com.google.inject.Inject;
 import com.zaxxer.hikari.HikariDataSource;
-import lombok.With;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -115,23 +114,23 @@ public class DomainRankingSetsService {
         }
     }
 
-    /** Defines a domain ranking set, parameters for the ranking algorithms.
+    /**
+     * Defines a domain ranking set, parameters for the ranking algorithms.
      *
-     * @param name Key and name of the set
+     * @param name        Key and name of the set
      * @param description Human-readable description
-     * @param depth Depth of the algorithm
-     * @param definition Definition of the set, typically a list of domains or globs for domain-names
-     * */
-    @With
+     * @param depth       Depth of the algorithm
+     * @param definition  Definition of the set, typically a list of domains or globs for domain-names
+     */
     public record DomainRankingSet(String name,
                                    String description,
                                    int depth,
-                                   String definition)
-    {
+                                   String definition) {
 
         public Path fileName(Path base) {
             return base.resolve(name().toLowerCase() + ".dat");
         }
 
         public String[] domains() {
             return Arrays.stream(definition().split("\n+"))
                     .map(String::trim)
 
@@ -144,5 +143,20 @@ public class DomainRankingSetsService {
             return name().equals("BLOGS") || name().equals("NONE") || name().equals("RANK");
         }
 
+        public DomainRankingSet withName(String name) {
+            return this.name == name ? this : new DomainRankingSet(name, description, depth, definition);
+        }
+
+        public DomainRankingSet withDescription(String description) {
+            return this.description == description ? this : new DomainRankingSet(name, description, depth, definition);
+        }
+
+        public DomainRankingSet withDepth(int depth) {
+            return this.depth == depth ? this : new DomainRankingSet(name, description, depth, definition);
+        }
+
+        public DomainRankingSet withDefinition(String definition) {
+            return this.definition == definition ? this : new DomainRankingSet(name, description, depth, definition);
+        }
     }
 }
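The hand-rolled withX methods reproduce what Lombok's @With generated, including the reference-equality shortcut. Note that for the String fields the == check compares references, so an equal-but-distinct string still allocates a new record; a small usage sketch:

    class WitherDemo {
        public static void main(String[] args) {
            var set = new DomainRankingSetsService.DomainRankingSet("RANK", "Ranking set", 5, "*");

            System.out.println(set.withDepth(10).depth());                // 10, new instance
            System.out.println(set.withDepth(5) == set);                  // true: same int value, shortcut returns this
            System.out.println(set.withName(new String("RANK")) == set);  // false: equal but distinct String reference
        }
    }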
New migration:
@@ -0,0 +1 @@
ALTER TABLE WMSA_prod.NODE_CONFIGURATION ADD COLUMN NODE_PROFILE VARCHAR(255) DEFAULT 'MIXED';
New migration:
@@ -0,0 +1,5 @@
CREATE TABLE IF NOT EXISTS WMSA_prod.NSFW_DOMAINS (
    ID INT NOT NULL AUTO_INCREMENT,
    TIER INT NOT NULL,
    PRIMARY KEY (ID)
);
New migration:
@@ -0,0 +1,213 @@
-- Create metadata tables for domain ping status and security information

-- These are not ICMP pings, but rather HTTP(S) pings to check the availability and security
-- of web servers associated with domains, to assess uptime and changes in security configurations
-- indicating ownership changes or security issues.

-- Note: DOMAIN_ID and NODE_ID are used to identify the domain and the node that performed the ping.
-- These are strictly speaking foreign keys to the EC_DOMAIN table, but as it
-- is strictly append-only, we do not need to enforce foreign key constraints.

CREATE TABLE IF NOT EXISTS DOMAIN_AVAILABILITY_INFORMATION (
    DOMAIN_ID INT NOT NULL PRIMARY KEY,
    NODE_ID INT NOT NULL,

    SERVER_AVAILABLE BOOLEAN NOT NULL,       -- Indicates if the server is available (true) or not (false)
    SERVER_IP VARBINARY(16),                 -- IP address of the server (IPv4 or IPv6)
    SERVER_IP_ASN INTEGER,                   -- Autonomous System number

    DATA_HASH BIGINT,                        -- Hash of the data for integrity checks
    SECURITY_CONFIG_HASH BIGINT,             -- Hash of the security configuration for integrity checks

    HTTP_SCHEMA ENUM('HTTP', 'HTTPS'),       -- HTTP or HTTPS protocol used
    HTTP_ETAG VARCHAR(255),                  -- ETag of the resource as per HTTP headers
    HTTP_LAST_MODIFIED VARCHAR(255),         -- Last modified date of the resource as per HTTP headers
    HTTP_STATUS INT,                         -- HTTP status code (e.g., 200, 404, etc.)
    HTTP_LOCATION VARCHAR(255),              -- If the server redirects, this is the location of the redirect
    HTTP_RESPONSE_TIME_MS SMALLINT UNSIGNED, -- Response time in milliseconds

    ERROR_CLASSIFICATION ENUM('NONE', 'TIMEOUT', 'SSL_ERROR', 'DNS_ERROR', 'CONNECTION_ERROR', 'HTTP_CLIENT_ERROR', 'HTTP_SERVER_ERROR', 'UNKNOWN'), -- Classification of the error if the server is not available
    ERROR_MESSAGE VARCHAR(255),              -- Error message if the server is not available

    TS_LAST_PING TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, -- Timestamp of the last ping
    TS_LAST_AVAILABLE TIMESTAMP,             -- Timestamp of the last time the server was available
    TS_LAST_ERROR TIMESTAMP,                 -- Timestamp of the last error encountered

    NEXT_SCHEDULED_UPDATE TIMESTAMP NOT NULL,
    BACKOFF_CONSECUTIVE_FAILURES INT NOT NULL DEFAULT 0, -- Number of consecutive failures to ping the server
    BACKOFF_FETCH_INTERVAL INT NOT NULL DEFAULT 60       -- Interval in seconds for the next scheduled ping
) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;

CREATE INDEX IF NOT EXISTS DOMAIN_AVAILABILITY_INFORMATION__NODE_ID__DOMAIN_ID_IDX ON DOMAIN_AVAILABILITY_INFORMATION (NODE_ID, DOMAIN_ID);
CREATE INDEX IF NOT EXISTS DOMAIN_AVAILABILITY_INFORMATION__NEXT_SCHEDULED_UPDATE_IDX ON DOMAIN_AVAILABILITY_INFORMATION (NODE_ID, NEXT_SCHEDULED_UPDATE);

CREATE TABLE IF NOT EXISTS DOMAIN_SECURITY_INFORMATION (
    DOMAIN_ID INT NOT NULL PRIMARY KEY,
    NODE_ID INT NOT NULL,

    ASN INTEGER,                             -- Autonomous System Number (ASN) of the server
    HTTP_SCHEMA ENUM('HTTP', 'HTTPS'),       -- HTTP or HTTPS protocol used
    HTTP_VERSION VARCHAR(10),                -- HTTP version used (e.g., HTTP/1.1, HTTP/2)
    HTTP_COMPRESSION VARCHAR(50),            -- Compression method used (e.g., gzip, deflate, br)
    HTTP_CACHE_CONTROL TEXT,                 -- Cache control directives from HTTP headers

    SSL_CERT_NOT_BEFORE TIMESTAMP,           -- Valid from date (usually same as issued)
    SSL_CERT_NOT_AFTER TIMESTAMP,            -- Valid until date (usually same as expires)

    SSL_CERT_ISSUER VARCHAR(255),            -- CA that issued the cert
    SSL_CERT_SUBJECT VARCHAR(255),           -- Certificate subject/CN

    SSL_CERT_PUBLIC_KEY_HASH BINARY(32),     -- SHA-256 hash of the public key
    SSL_CERT_SERIAL_NUMBER VARCHAR(100),     -- Unique cert serial number
    SSL_CERT_FINGERPRINT_SHA256 BINARY(32),  -- SHA-256 fingerprint for exact identification
    SSL_CERT_SAN TEXT,                       -- Subject Alternative Names (JSON array)
    SSL_CERT_WILDCARD BOOLEAN,               -- Wildcard certificate (*.example.com)

    SSL_PROTOCOL VARCHAR(20),                -- TLS 1.2, TLS 1.3, etc.
    SSL_CIPHER_SUITE VARCHAR(100),           -- e.g., TLS_AES_256_GCM_SHA384
    SSL_KEY_EXCHANGE VARCHAR(50),            -- ECDHE, RSA, etc.
    SSL_CERTIFICATE_CHAIN_LENGTH TINYINT,    -- Number of certs in chain

    SSL_CERTIFICATE_VALID BOOLEAN,           -- Valid cert chain

    HEADER_CORS_ALLOW_ORIGIN TEXT,             -- Could be *, specific domains, or null
    HEADER_CORS_ALLOW_CREDENTIALS BOOLEAN,     -- Credential handling
    HEADER_CONTENT_SECURITY_POLICY_HASH INT,   -- CSP header, hash of the policy
    HEADER_STRICT_TRANSPORT_SECURITY VARCHAR(255), -- HSTS header
    HEADER_REFERRER_POLICY VARCHAR(50),        -- Referrer handling
    HEADER_X_FRAME_OPTIONS VARCHAR(50),        -- Clickjacking protection
    HEADER_X_CONTENT_TYPE_OPTIONS VARCHAR(50), -- MIME sniffing protection
    HEADER_X_XSS_PROTECTION VARCHAR(50),       -- XSS protection header

    HEADER_SERVER VARCHAR(255),              -- Server header (e.g., Apache, Nginx, etc.)
    HEADER_X_POWERED_BY VARCHAR(255),        -- X-Powered-By header (if present)

    TS_LAST_UPDATE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP -- Timestamp of the last SSL check
) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;

CREATE INDEX IF NOT EXISTS DOMAIN_SECURITY_INFORMATION__NODE_ID__DOMAIN_ID_IDX ON DOMAIN_SECURITY_INFORMATION (NODE_ID, DOMAIN_ID);

CREATE TABLE IF NOT EXISTS DOMAIN_SECURITY_EVENTS (
    CHANGE_ID BIGINT AUTO_INCREMENT PRIMARY KEY, -- Unique identifier for the change
    DOMAIN_ID INT NOT NULL,                      -- Domain ID, used as a foreign key to EC_DOMAIN
    NODE_ID INT NOT NULL,

    TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, -- Timestamp of the change

    CHANGE_ASN BOOLEAN NOT NULL DEFAULT FALSE,                     -- Indicates if the change is related to ASN (Autonomous System Number)
    CHANGE_CERTIFICATE_FINGERPRINT BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to SSL certificate fingerprint
    CHANGE_CERTIFICATE_PROFILE BOOLEAN NOT NULL DEFAULT FALSE,     -- Indicates if the change is related to SSL certificate profile (e.g., algorithm, exchange)
    CHANGE_CERTIFICATE_SAN BOOLEAN NOT NULL DEFAULT FALSE,         -- Indicates if the change is related to SSL certificate SAN (Subject Alternative Name)
    CHANGE_CERTIFICATE_PUBLIC_KEY BOOLEAN NOT NULL DEFAULT FALSE,  -- Indicates if the change is related to SSL certificate public key
    CHANGE_SECURITY_HEADERS BOOLEAN NOT NULL DEFAULT FALSE,        -- Indicates if the change is related to security headers
    CHANGE_IP_ADDRESS BOOLEAN NOT NULL DEFAULT FALSE,              -- Indicates if the change is related to IP address
    CHANGE_SOFTWARE BOOLEAN NOT NULL DEFAULT FALSE,                -- Indicates if the change is related to the generator (e.g., web server software)
    OLD_CERT_TIME_TO_EXPIRY INT,                                   -- Time to expiry of the old certificate in hours, if applicable

    SECURITY_SIGNATURE_BEFORE BLOB NOT NULL, -- Security signature before the change, gzipped json record
    SECURITY_SIGNATURE_AFTER BLOB NOT NULL   -- Security signature after the change, gzipped json record
) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;

CREATE INDEX IF NOT EXISTS DOMAIN_SECURITY_EVENTS__NODE_ID__DOMAIN_ID_IDX ON DOMAIN_SECURITY_EVENTS (NODE_ID, DOMAIN_ID);
CREATE INDEX IF NOT EXISTS DOMAIN_SECURITY_EVENTS__TS_CHANGE_IDX ON DOMAIN_SECURITY_EVENTS (TS_CHANGE);

CREATE TABLE IF NOT EXISTS DOMAIN_AVAILABILITY_EVENTS (
    DOMAIN_ID INT NOT NULL,
    NODE_ID INT NOT NULL,

    AVAILABLE BOOLEAN NOT NULL,  -- True if the service is available, false if it is not
    OUTAGE_TYPE ENUM('NONE', 'TIMEOUT', 'SSL_ERROR', 'DNS_ERROR', 'CONNECTION_ERROR', 'HTTP_CLIENT_ERROR', 'HTTP_SERVER_ERROR', 'UNKNOWN') NOT NULL,
    HTTP_STATUS_CODE INT,        -- HTTP status code if available (e.g., 200, 404, etc.)
    ERROR_MESSAGE VARCHAR(255),  -- Specific error details

    TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, -- Timestamp of the last update

    AVAILABILITY_RECORD_ID BIGINT AUTO_INCREMENT,
    P_KEY_MONTH TINYINT NOT NULL DEFAULT MONTH(TS_CHANGE), -- Month of the change for partitioning
    PRIMARY KEY (AVAILABILITY_RECORD_ID, P_KEY_MONTH)
)
CHARACTER SET utf8mb4 COLLATE utf8mb4_bin
PARTITION BY RANGE (P_KEY_MONTH) (
    PARTITION p0 VALUES LESS THAN (1),   -- January
    PARTITION p1 VALUES LESS THAN (2),   -- February
    PARTITION p2 VALUES LESS THAN (3),   -- March
    PARTITION p3 VALUES LESS THAN (4),   -- April
    PARTITION p4 VALUES LESS THAN (5),   -- May
    PARTITION p5 VALUES LESS THAN (6),   -- June
    PARTITION p6 VALUES LESS THAN (7),   -- July
    PARTITION p7 VALUES LESS THAN (8),   -- August
    PARTITION p8 VALUES LESS THAN (9),   -- September
    PARTITION p9 VALUES LESS THAN (10),  -- October
    PARTITION p10 VALUES LESS THAN (11), -- November
    PARTITION p11 VALUES LESS THAN (12)  -- December
);

CREATE INDEX DOMAIN_AVAILABILITY_EVENTS__DOMAIN_ID_TS_IDX ON DOMAIN_AVAILABILITY_EVENTS (DOMAIN_ID, TS_CHANGE);
CREATE INDEX DOMAIN_AVAILABILITY_EVENTS__TS_CHANGE_IDX ON DOMAIN_AVAILABILITY_EVENTS (TS_CHANGE);

CREATE TABLE IF NOT EXISTS DOMAIN_DNS_INFORMATION (
    DNS_ROOT_DOMAIN_ID INT AUTO_INCREMENT PRIMARY KEY,
    ROOT_DOMAIN_NAME VARCHAR(255) NOT NULL UNIQUE,
    NODE_AFFINITY INT NOT NULL,    -- Node ID that performs the DNS check, assign randomly across nodes

    DNS_A_RECORDS TEXT,            -- JSON array of IPv4 addresses
    DNS_AAAA_RECORDS TEXT,         -- JSON array of IPv6 addresses
    DNS_CNAME_RECORD VARCHAR(255), -- Canonical name (if applicable)
    DNS_MX_RECORDS TEXT,           -- JSON array of mail exchange records
    DNS_CAA_RECORDS TEXT,          -- Certificate Authority Authorization
    DNS_TXT_RECORDS TEXT,          -- TXT records (SPF, DKIM, verification, etc.)
    DNS_NS_RECORDS TEXT,           -- Name servers (JSON array)
    DNS_SOA_RECORD TEXT,           -- Start of Authority (JSON object)

    TS_LAST_DNS_CHECK TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    TS_NEXT_DNS_CHECK TIMESTAMP NOT NULL,
    DNS_CHECK_PRIORITY TINYINT DEFAULT 0 -- Priority of the DNS check, in case we want to schedule a refresh sooner
) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;

CREATE INDEX DOMAIN_DNS_INFORMATION__PRIORITY_NEXT_CHECK_IDX ON DOMAIN_DNS_INFORMATION (NODE_AFFINITY, DNS_CHECK_PRIORITY DESC, TS_NEXT_DNS_CHECK);

CREATE TABLE IF NOT EXISTS DOMAIN_DNS_EVENTS (
    DNS_ROOT_DOMAIN_ID INT NOT NULL,
    NODE_ID INT NOT NULL,

    TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,

    -- DNS change type flags
    CHANGE_A_RECORDS BOOLEAN NOT NULL DEFAULT FALSE,    -- IPv4 address changes
    CHANGE_AAAA_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- IPv6 address changes
    CHANGE_CNAME BOOLEAN NOT NULL DEFAULT FALSE,        -- CNAME changes
    CHANGE_MX_RECORDS BOOLEAN NOT NULL DEFAULT FALSE,   -- Mail server changes
    CHANGE_CAA_RECORDS BOOLEAN NOT NULL DEFAULT FALSE,  -- Certificate authority changes
    CHANGE_TXT_RECORDS BOOLEAN NOT NULL DEFAULT FALSE,  -- TXT record changes (SPF, DKIM, etc.)
    CHANGE_NS_RECORDS BOOLEAN NOT NULL DEFAULT FALSE,   -- Name server changes (big red flag!)
    CHANGE_SOA_RECORD BOOLEAN NOT NULL DEFAULT FALSE,   -- Start of Authority changes

    DNS_SIGNATURE_BEFORE BLOB NOT NULL, -- Compressed JSON snapshot of DNS records before change
    DNS_SIGNATURE_AFTER BLOB NOT NULL,  -- Compressed JSON snapshot of DNS records after change

    DNS_EVENT_ID BIGINT AUTO_INCREMENT,
    P_KEY_MONTH TINYINT NOT NULL DEFAULT MONTH(TS_CHANGE), -- Month of the change for partitioning
    PRIMARY KEY (DNS_EVENT_ID, P_KEY_MONTH)
)
CHARACTER SET utf8mb4 COLLATE utf8mb4_bin
PARTITION BY RANGE (P_KEY_MONTH) (
    PARTITION p0 VALUES LESS THAN (1),   -- January
    PARTITION p1 VALUES LESS THAN (2),   -- February
    PARTITION p2 VALUES LESS THAN (3),   -- March
    PARTITION p3 VALUES LESS THAN (4),   -- April
    PARTITION p4 VALUES LESS THAN (5),   -- May
    PARTITION p5 VALUES LESS THAN (6),   -- June
    PARTITION p6 VALUES LESS THAN (7),   -- July
    PARTITION p7 VALUES LESS THAN (8),   -- August
    PARTITION p8 VALUES LESS THAN (9),   -- September
    PARTITION p9 VALUES LESS THAN (10),  -- October
    PARTITION p10 VALUES LESS THAN (11), -- November
    PARTITION p11 VALUES LESS THAN (12)  -- December
);

CREATE INDEX DOMAIN_DNS_EVENTS__DNS_ROOT_DOMAIN_ID_TS_IDX ON DOMAIN_DNS_EVENTS (DNS_ROOT_DOMAIN_ID, TS_CHANGE);
CREATE INDEX DOMAIN_DNS_EVENTS__TS_CHANGE_IDX ON DOMAIN_DNS_EVENTS (TS_CHANGE);
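The BACKOFF_* columns imply the ping worker backs off failing domains and records when to retry. A sketch of what the failure path might look like under an exponential-backoff reading; the table and column names come from the migration, but the doubling policy and the 24-hour cap are assumptions:

    import java.sql.Connection;
    import java.sql.SQLException;

    class PingScheduler {
        // Sketch only: the doubling/cap policy is an assumption, not taken from the migration.
        void recordFailure(Connection conn, int domainId, int nodeId) throws SQLException {
            try (var ps = conn.prepareStatement("""
                    UPDATE DOMAIN_AVAILABILITY_INFORMATION
                    SET SERVER_AVAILABLE = FALSE,
                        BACKOFF_CONSECUTIVE_FAILURES = BACKOFF_CONSECUTIVE_FAILURES + 1,
                        BACKOFF_FETCH_INTERVAL = LEAST(BACKOFF_FETCH_INTERVAL * 2, 86400),
                        NEXT_SCHEDULED_UPDATE = TIMESTAMPADD(SECOND, BACKOFF_FETCH_INTERVAL, NOW())
                    WHERE DOMAIN_ID = ? AND NODE_ID = ?
                    """)) {
                ps.setInt(1, domainId);
                ps.setInt(2, nodeId);
                ps.executeUpdate();
                // MySQL applies SET clauses left to right, so NEXT_SCHEDULED_UPDATE
                // sees the already-doubled BACKOFF_FETCH_INTERVAL.
            }
        }
    }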
DocumentFormat.java (new file)
@@ -0,0 +1,24 @@
package nu.marginalia.model;

public enum DocumentFormat {
    PLAIN(0, 1, "text"),
    PDF(0, 1, "pdf"),
    UNKNOWN(0, 1, "???"),
    HTML123(0, 1, "html"),
    HTML4(-0.1, 1.05, "html"),
    XHTML(-0.1, 1.05, "html"),
    HTML5(0.5, 1.1, "html");

    /** Used to tune quality score */
    public final double offset;
    /** Used to tune quality score */
    public final double scale;
    public final String shortFormat;

    DocumentFormat(double offset, double scale, String shortFormat) {
        this.offset = offset;
        this.scale = scale;
        this.shortFormat = shortFormat;
    }
}
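The enum documents offset and scale only as quality-score tuning knobs; this diff does not show how the converter combines them. One plausible reading, purely as illustration:

    class QualityExample {
        // Hypothetical combination; the converter's actual formula is not in this diff.
        static double adjust(double baseQuality, DocumentFormat format) {
            return baseQuality * format.scale + format.offset;
        }
        // Under this reading HTML5 (offset 0.5, scale 1.1) outranks HTML4 (-0.1, 1.05)
        // for the same base quality, nudging results toward modern markup.
    }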
@@ -1,15 +1,12 @@
|
||||
package nu.marginalia.model;
|
||||
|
||||
import lombok.*;
|
||||
|
||||
import javax.annotation.Nonnull;
|
||||
import java.io.Serializable;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@AllArgsConstructor
|
||||
@Getter @Setter @Builder
|
||||
public class EdgeDomain implements Serializable {
|
||||
|
||||
@Nonnull
|
||||
@@ -17,8 +14,7 @@ public class EdgeDomain implements Serializable {
|
||||
@Nonnull
|
||||
public final String topDomain;
|
||||
|
||||
@SneakyThrows
|
||||
public EdgeDomain(String host) {
|
||||
public EdgeDomain(@Nonnull String host) {
|
||||
Objects.requireNonNull(host, "domain name must not be null");
|
||||
|
||||
host = host.toLowerCase();
|
||||
@@ -34,28 +30,23 @@ public class EdgeDomain implements Serializable {
|
||||
if (dot < 0 || looksLikeAnIp(host)) { // IPV6 >.>
|
||||
subDomain = "";
|
||||
topDomain = host;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
int dot2 = host.substring(0, dot).lastIndexOf('.');
|
||||
if (dot2 < 0) {
|
||||
subDomain = "";
|
||||
topDomain = host;
|
||||
}
|
||||
else {
|
||||
if (looksLikeGovTld(host))
|
||||
{ // Capture .ac.jp, .co.uk
|
||||
} else {
|
||||
if (looksLikeGovTld(host)) { // Capture .ac.jp, .co.uk
|
||||
int dot3 = host.substring(0, dot2).lastIndexOf('.');
|
||||
if (dot3 >= 0) {
|
||||
dot2 = dot3;
|
||||
subDomain = host.substring(0, dot2);
|
||||
topDomain = host.substring(dot2 + 1);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
subDomain = "";
|
||||
topDomain = host;
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
subDomain = host.substring(0, dot2);
|
||||
topDomain = host.substring(dot2 + 1);
|
||||
}
|
||||
@@ -64,6 +55,16 @@ public class EdgeDomain implements Serializable {
|
||||
}
|
||||
|
||||
private static final Predicate<String> govListTest = Pattern.compile(".*\\.(id|ac|co|org|gov|edu|com)\\.[a-z]{2}").asMatchPredicate();
|
||||
|
||||
public EdgeDomain(@Nonnull String subDomain, @Nonnull String topDomain) {
|
||||
this.subDomain = subDomain;
|
||||
this.topDomain = topDomain;
|
||||
}
|
||||
|
||||
public static String getTopDomain(String host) {
|
||||
return new EdgeDomain(host).topDomain;
|
||||
}
|
||||
|
||||
private boolean looksLikeGovTld(String host) {
|
||||
if (host.length() < 8)
|
||||
return false;
|
||||
@@ -91,11 +92,11 @@ public class EdgeDomain implements Serializable {
|
||||
}
|
||||
|
||||
|
||||
|
||||
public EdgeUrl toRootUrlHttp() {
|
||||
// Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http
|
||||
return new EdgeUrl("http", this, null, "/", null);
|
||||
}
|
||||
|
||||
public EdgeUrl toRootUrlHttps() {
|
||||
return new EdgeUrl("https", this, null, "/", null);
|
||||
}
|
||||
@@ -111,31 +112,16 @@ public class EdgeDomain implements Serializable {
|
||||
return topDomain;
|
||||
}
|
||||
|
||||
public String getDomainKey() {
|
||||
int cutPoint = topDomain.indexOf('.');
|
||||
if (cutPoint < 0) {
|
||||
return topDomain;
|
||||
/** If possible, try to provide an alias domain,
|
||||
* i.e. a domain name that is very likely to link to this one
|
||||
* */
|
||||
public Optional<EdgeDomain> aliasDomain() {
|
||||
if (subDomain.equals("www")) {
|
||||
return Optional.of(new EdgeDomain("", topDomain));
|
||||
} else if (subDomain.isBlank()){
|
||||
return Optional.of(new EdgeDomain("www", topDomain));
|
||||
}
|
||||
return topDomain.substring(0, cutPoint).toLowerCase();
|
||||
}
|
||||
|
||||
public String getLongDomainKey() {
|
||||
StringBuilder ret = new StringBuilder();
|
||||
|
||||
int cutPoint = topDomain.indexOf('.');
|
||||
if (cutPoint < 0) {
|
||||
ret.append(topDomain);
|
||||
}
|
||||
else {
|
||||
ret.append(topDomain, 0, cutPoint);
|
||||
}
|
||||
|
||||
if (!subDomain.isEmpty() && !"www".equals(subDomain)) {
|
||||
ret.append(":");
|
||||
ret.append(subDomain);
|
||||
}
|
||||
|
||||
return ret.toString().toLowerCase();
|
||||
else return Optional.empty();
|
||||
}
|
||||
|
||||
|
||||
@@ -155,16 +141,14 @@ public class EdgeDomain implements Serializable {
|
||||
|
||||
if (govListTest.test(topDomain)) {
|
||||
dot = topDomain.indexOf('.', Math.max(0, length - ".edu.uk".length()));
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
dot = topDomain.lastIndexOf('.');
|
||||
}
|
||||
|
||||
|
||||
if (dot < 0 || dot == topDomain.length() - 1) {
|
||||
return "-";
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
return topDomain.substring(dot + 1);
|
||||
}
|
||||
}
|
||||
@@ -174,10 +158,10 @@ public class EdgeDomain implements Serializable {
|
||||
if (!(o instanceof EdgeDomain other)) return false;
|
||||
final String this$subDomain = this.getSubDomain();
|
||||
final String other$subDomain = other.getSubDomain();
|
||||
if (!Objects.equals(this$subDomain,other$subDomain)) return false;
|
||||
if (!Objects.equals(this$subDomain, other$subDomain)) return false;
|
||||
final String this$domain = this.getTopDomain();
|
||||
final String other$domain = other.getTopDomain();
|
||||
if (!Objects.equals(this$domain,other$domain)) return false;
|
||||
if (!Objects.equals(this$domain, other$domain)) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -191,4 +175,13 @@ public class EdgeDomain implements Serializable {
|
||||
return result;
|
||||
}
|
||||
|
||||
@Nonnull
|
||||
public String getSubDomain() {
|
||||
return this.subDomain;
|
||||
}
|
||||
|
||||
@Nonnull
|
||||
public String getTopDomain() {
|
||||
return this.topDomain;
|
||||
}
|
||||
}
|
||||
|
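A minimal usage sketch of the domain splitting and aliasing above (example domains are illustrative; the behavior follows the constructor and aliasDomain() shown in this diff):

    EdgeDomain d = new EdgeDomain("www.example.co.uk");
    d.getSubDomain();   // "www": looksLikeGovTld() matches ".co.uk", so the split moves one label left
    d.getTopDomain();   // "example.co.uk"

    new EdgeDomain("www.example.com").aliasDomain();  // Optional of the bare "example.com" domain
    new EdgeDomain("example.com").aliasDomain();      // Optional of "www.example.com"
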
@@ -1,21 +1,15 @@
package nu.marginalia.model;

import lombok.Builder;
import lombok.Getter;
import lombok.Setter;
import nu.marginalia.util.QueryParams;
import org.apache.commons.lang3.StringUtils;

import javax.annotation.Nullable;
import java.io.Serializable;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.*;
import java.nio.charset.StandardCharsets;
import java.util.Objects;
import java.util.Optional;
import java.util.regex.Pattern;

@Getter @Setter @Builder
public class EdgeUrl implements Serializable {
public final String proto;
public final EdgeDomain domain;
@@ -37,10 +31,9 @@ public class EdgeUrl implements Serializable {

private static URI parseURI(String url) throws URISyntaxException {
try {
return new URI(urlencodeFixer(url));
}
catch (URISyntaxException ex) {
throw new URISyntaxException(STR."Failed to parse URI '\{url}'", ex.getMessage());
return EdgeUriFactory.parseURILenient(url);
} catch (URISyntaxException ex) {
throw new URISyntaxException("Failed to parse URI '" + url + "'", ex.getMessage());
}
}

@@ -56,61 +49,6 @@ public class EdgeUrl implements Serializable {
}
}

private static Pattern badCharPattern = Pattern.compile("[ \t\n\"<>\\[\\]()',|]");

/* Java's URI parser is a bit too strict in throwing exceptions when there's an error.

Here on the Internet, standards are like the picture on the box of the frozen pizza,
and what you get is more like what's on the inside, we try to patch things instead,
just give it a best-effort attempt att cleaning out broken or unnecessary constructions
like bad or missing URLEncoding
*/
public static String urlencodeFixer(String url) throws URISyntaxException {
var s = new StringBuilder();
String goodChars = "&.?:/-;+$#";
String hexChars = "0123456789abcdefABCDEF";

int pathIdx = findPathIdx(url);
if (pathIdx < 0) { // url looks like http://marginalia.nu
return url + "/";
}
s.append(url, 0, pathIdx);

// We don't want the fragment, and multiple fragments breaks the Java URIParser for some reason
int end = url.indexOf("#");
if (end < 0) end = url.length();

for (int i = pathIdx; i < end; i++) {
int c = url.charAt(i);

if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <='Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
s.appendCodePoint(c);
}
else if (c == '%' && i+2<end) {
int cn = url.charAt(i+1);
int cnn = url.charAt(i+2);
if (hexChars.indexOf(cn) >= 0 && hexChars.indexOf(cnn) >= 0) {
s.appendCodePoint(c);
}
else {
s.append("%25");
}
}
else {
s.append(String.format("%%%02X", c));
}
}

return s.toString();
}

private static int findPathIdx(String url) throws URISyntaxException {
int colonIdx = url.indexOf(':');
if (colonIdx < 0 || colonIdx + 2 >= url.length()) {
throw new URISyntaxException(url, "Lacking protocol");
}
return url.indexOf('/', colonIdx+2);
}

public EdgeUrl(URI URI) {
try {
@@ -125,8 +63,7 @@ public class EdgeUrl implements Serializable {
this.proto = URI.getScheme().toLowerCase();
this.port = port(URI.getPort(), proto);
this.param = QueryParams.queryParamsSanitizer(this.path, URI.getQuery());
}
catch (Exception ex) {
} catch (Exception ex) {
System.err.println("Failed to parse " + URI);
throw ex;
}
@@ -145,8 +82,7 @@ public class EdgeUrl implements Serializable {
this.proto = URL.getProtocol().toLowerCase();
this.port = port(URL.getPort(), proto);
this.param = QueryParams.queryParamsSanitizer(this.path, URL.getQuery());
}
catch (Exception ex) {
} catch (Exception ex) {
System.err.println("Failed to parse " + URL);
throw ex;
}
@@ -158,8 +94,7 @@ public class EdgeUrl implements Serializable {
}
if (protocol.equals("http") && port == 80) {
return null;
}
else if (protocol.equals("https") && port == 443) {
} else if (protocol.equals("https") && port == 443) {
return null;
}
return port;
@@ -177,11 +112,32 @@ public class EdgeUrl implements Serializable {
sb.append(port);
}

EdgeUriFactory.urlencodePath(sb, path);

if (param != null) {
EdgeUriFactory.urlencodeQuery(sb, param);
}

return sb.toString();
}

public String toDisplayString() {
StringBuilder sb = new StringBuilder(256);

sb.append(proto);
sb.append("://");
sb.append(domain);

if (port != null) {
sb.append(':');
sb.append(port);
}

sb.append(path);

if (param != null) {
sb.append('?');
sb.append(param);
sb.append('?').append(param);
}

return sb.toString();
@@ -190,12 +146,13 @@ public class EdgeUrl implements Serializable {
public String dir() {
return path.replaceAll("/[^/]+$", "/");
}

public String fileName() {
return path.replaceAll(".*/", "");
}

public int depth() {
return (int) path.chars().filter(c -> c=='/').count();
return (int) path.chars().filter(c -> c == '/').count();
}

public EdgeUrl withPathAndParam(String path, String param) {
@@ -207,8 +164,8 @@ public class EdgeUrl implements Serializable {
if (other == this) return true;
if (other instanceof EdgeUrl e) {
return Objects.equals(e.domain, domain)
&& Objects.equals(e.path, path)
&& Objects.equals(e.param, param);
&& Objects.equals(e.path, path)
&& Objects.equals(e.param, param);
}

return true;
@@ -235,8 +192,7 @@ public class EdgeUrl implements Serializable {
public URL asURL() throws MalformedURLException {
try {
return asURI().toURL();
}
catch (URISyntaxException e) {
} catch (URISyntaxException e) {
throw new MalformedURLException(e.getMessage());
}
}
@@ -248,4 +204,254 @@ public class EdgeUrl implements Serializable {

return new URI(this.proto, this.domain.toString(), this.path, this.param, null);
}

public EdgeDomain getDomain() {
return this.domain;
}

public String getProto() {
return this.proto;
}

}

class EdgeUriFactory {
public static URI parseURILenient(String url) throws URISyntaxException {

if (shouldOmitUrlencodeRepair(url)) {
try {
return new URI(url);
}
catch (URISyntaxException ex) {
// ignore and run the lenient parser
}
}

var s = new StringBuilder(url.length()+8);

int pathIdx = findPathIdx(url);
if (pathIdx < 0) { // url looks like http://marginalia.nu
return new URI(url + "/");
}
s.append(url, 0, pathIdx);

// We don't want the fragment, and multiple fragments breaks the Java URIParser for some reason
int end = url.indexOf("#");
if (end < 0) end = url.length();

int queryIdx = url.indexOf('?');
if (queryIdx < 0) queryIdx = end;

urlencodePath(s, url.substring(pathIdx, queryIdx));
if (queryIdx < end) {
urlencodeQuery(s, url.substring(queryIdx + 1, end));
}
return new URI(s.toString());
}

/** Break apart the path element of an URI into its components, and then
* urlencode any component that needs it, and recombine it into a single
* path element again.
*/
public static void urlencodePath(StringBuilder sb, String path) {
if (path == null || path.isEmpty()) {
return;
}

String[] pathParts = StringUtils.split(path, '/');
if (pathParts.length == 0) {
sb.append('/');
return;
}

boolean shouldUrlEncode = false;
for (String pathPart : pathParts) {
if (pathPart.isEmpty()) continue;

if (needsUrlEncode(pathPart)) {
shouldUrlEncode = true;
break;
}
}

for (String pathPart : pathParts) {
if (pathPart.isEmpty()) continue;

if (shouldUrlEncode) {
sb.append('/');
sb.append(URLEncoder.encode(pathPart, StandardCharsets.UTF_8).replace("+", "%20"));
} else {
sb.append('/');
sb.append(pathPart);
}
}

if (path.endsWith("/")) {
sb.append('/');
}

}

/** Break apart the query element of a URI into its components, and then
* urlencode any component that needs it, and recombine it into a single
* query element again.
*/
public static void urlencodeQuery(StringBuilder sb, String param) {
if (param == null || param.isEmpty()) {
return;
}

String[] queryParts = StringUtils.split(param, '&');

boolean shouldUrlEncode = false;
for (String queryPart : queryParts) {
if (queryPart.isEmpty()) continue;

if (needsUrlEncode(queryPart)) {
shouldUrlEncode = true;
break;
}
}

boolean first = true;
for (String queryPart : queryParts) {
if (queryPart.isEmpty()) continue;

if (first) {
sb.append('?');
first = false;
} else {
sb.append('&');
}

if (shouldUrlEncode) {
int idx = queryPart.indexOf('=');
if (idx < 0) {
sb.append(URLEncoder.encode(queryPart, StandardCharsets.UTF_8));
} else {
sb.append(URLEncoder.encode(queryPart.substring(0, idx), StandardCharsets.UTF_8));
sb.append('=');
sb.append(URLEncoder.encode(queryPart.substring(idx + 1), StandardCharsets.UTF_8));
}
} else {
sb.append(queryPart);
}
}
}

/** Test if the url element needs URL encoding.
* <p></p>
* Note we may have been given an already encoded path element,
* so we include % and + in the list of good characters
*/
static boolean needsUrlEncode(String urlElement) {
for (int i = 0; i < urlElement.length(); i++) {
char c = urlElement.charAt(i);

if (isUrlSafe(c)) continue;
if ("+".indexOf(c) >= 0) continue;
if (c == '%' && i + 2 < urlElement.length()) {
char c1 = urlElement.charAt(i + 1);
char c2 = urlElement.charAt(i + 2);
if (isHexDigit(c1) && isHexDigit(c2)) {
i += 2;
continue;
}
}

return true;
}

return false;
}

static boolean isUrlSafe(int c) {
if (c >= 'a' && c <= 'z') return true;
if (c >= 'A' && c <= 'Z') return true;
if (c >= '0' && c <= '9') return true;
if (c == '-' || c == '_' || c == '.' || c == '~') return true;

return false;
}

/** Test if the URL is a valid URL that does not need to be
* urlencoded.
* <p></p>
* This is a very simple heuristic test that does not guarantee
* that the URL is valid, but it will identify cases where we
* are fairly certain that the URL does not need encoding,
* so we can skip a bunch of allocations and string operations
* that would otherwise be needed to fix the URL.
*/
static boolean shouldOmitUrlencodeRepair(String url) {
int idx = 0;
final int len = url.length();

// Validate the scheme
while (idx < len - 2) {
char c = url.charAt(idx++);
if (c == ':') break;
if (!isAsciiAlphabetic(c)) return false;
}
if (url.charAt(idx++) != '/') return false;
if (url.charAt(idx++) != '/') return false;

// Validate the authority
while (idx < len) {
char c = url.charAt(idx++);
if (c == '/') break;
if (c == ':') continue;
if (c == '@') continue;
if (!isUrlSafe(c)) return false;
}

// Validate the path
if (idx >= len) return true;

while (idx < len) {
char c = url.charAt(idx++);
if (c == '?') break;
if (c == '/') continue;
if (c == '#') return true;
if (!isUrlSafe(c)) return false;
}

if (idx >= len) return true;

// Validate the query
while (idx < len) {
char c = url.charAt(idx++);
if (c == '&') continue;
if (c == '=') continue;
if (c == '#') return true;
if (!isUrlSafe(c)) return false;
}

return true;
}

private static boolean isAsciiAlphabetic(int c) {
return (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
}

private static boolean isHexDigit(int c) {
return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
}

/** Find the index of the path element in a URL.
* <p></p>
* The path element starts after the scheme and authority part of the URL,
* which is everything up to and including the first slash after the colon.
*/
private static int findPathIdx(String url) throws URISyntaxException {
int colonIdx = url.indexOf(':');
if (colonIdx < 0 || colonIdx + 3 >= url.length()) {
throw new URISyntaxException(url, "Lacking scheme");
}
return url.indexOf('/', colonIdx + 3);
}

}
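A short sketch of the lenient parsing path above; the expected outputs match the EdgeUrlTest assertions later in this diff:

    // A well-formed URL passes shouldOmitUrlencodeRepair() and goes straight to java.net.URI:
    EdgeUriFactory.parseURILenient("https://www.example.com/").toString();  // "https://www.example.com/"

    // Unencoded characters are repaired component by component:
    EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").toString();
    // "https://en.wikipedia.org/wiki/S%C3%A1mi"
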
@@ -16,6 +16,9 @@ public enum HtmlFeature {
KEBAB_CASE_URL("special:kcurl"), // https://www.example.com/urls-that-look-like-this/
LONG_URL("special:longurl"),

CLOUDFLARE_FEATURE("special:cloudflare"),
CDN_FEATURE("special:cdn"),

VIEWPORT("special:viewport"),

COOKIES("special:cookies"),
@@ -25,6 +28,8 @@ public enum HtmlFeature {

GA_SPAM("special:gaspam"),

PDF("format:pdf"),

/** For fingerprinting and ranking */
OPENGRAPH("special:opengraph"),
OPENGRAPH_IMAGE("special:opengraph:image"),
@@ -60,6 +65,8 @@ public enum HtmlFeature {
DOFOLLOW_LINK("special:dofollow"),
APPLE_TOUCH_ICON("special:appleicon"),

S3_FEATURE("special:s3"),

UNKNOWN("special:uncategorized");

@@ -6,11 +6,20 @@ import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;

import java.net.URISyntaxException;
import java.time.Instant;

public class GsonFactory {
public static Gson get() {
return new GsonBuilder()
.registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
.registerTypeAdapter(Instant.class, (JsonSerializer<Instant>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toEpochMilli()))
.registerTypeAdapter(Instant.class, (JsonDeserializer<Instant>) (json, typeOfT, context) -> {
if (json.isJsonPrimitive() && json.getAsJsonPrimitive().isNumber()) {
return Instant.ofEpochMilli(json.getAsLong());
} else {
throw new JsonParseException("Expected a number for Instant");
}
})
.registerTypeAdapter(EdgeUrl.class, (JsonSerializer<EdgeUrl>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
.registerTypeAdapter(EdgeDomain.class, (JsonSerializer<EdgeDomain>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
.registerTypeAdapter(EdgeUrl.class, (JsonDeserializer<EdgeUrl>) (json, typeOfT, context) -> {
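A brief sketch of what the Instant adapters registered above do, serializing as epoch milliseconds in both directions:

    Gson gson = GsonFactory.get();
    String json = gson.toJson(Instant.ofEpochMilli(1700000000000L));  // "1700000000000"
    Instant back = gson.fromJson(json, Instant.class);                // same instant; non-numeric JSON throws JsonParseException
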
@@ -1,22 +0,0 @@
package nu.marginalia.model.html;

// This class really doesn't belong anywhere, but will squat here for now
public enum HtmlStandard {
PLAIN(0, 1),
UNKNOWN(0, 1),
HTML123(0, 1),
HTML4(-0.1, 1.05),
XHTML(-0.1, 1.05),
HTML5(0.5, 1.1);

/** Used to tune quality score */
public final double offset;
/** Used to tune quality score */
public final double scale;

HtmlStandard(double offset, double scale) {
this.offset = offset;
this.scale = scale;
}

}
@@ -9,7 +9,7 @@ public enum DocumentFlags {
GeneratorForum,
GeneratorWiki,
Sideloaded,
Unused7,
PdfFile,
Unused8,
;

@@ -83,6 +83,11 @@ public class QueryParams {
if (path.endsWith("StoryView.py")) { // folklore.org is neat
return param.startsWith("project=") || param.startsWith("story=");
}

// www.perseus.tufts.edu:
if (param.startsWith("collection=")) return true;
if (param.startsWith("doc=")) return true;

return false;
}
}
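The sanitizer extended above keeps only query parameters believed to affect page content. A sketch of the observable behavior, matching the EdgeUrlTest assertions later in this diff:

    QueryParams.queryParamsSanitizer("/showthread.php", "id=1&count=5&tracking=123");  // "id=1"
    QueryParams.queryParamsSanitizer("/", "follow=123");                               // null (dropped entirely)
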
@@ -8,14 +8,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
class EdgeDomainTest {

@Test
public void testSkepdic() throws URISyntaxException {
var domain = new EdgeUrl("http://www.skepdic.com/astrology.html");
assertEquals("skepdic", domain.getDomain().getDomainKey());
var domain2 = new EdgeUrl("http://skepdic.com/astrology.html");
assertEquals("skepdic", domain2.getDomain().getDomainKey());
}

@Test
public void testHkDomain() throws URISyntaxException {
var domain = new EdgeUrl("http://l7072i3.l7c.net");

@@ -1,6 +1,6 @@
package nu.marginalia.model;

import nu.marginalia.model.EdgeUrl;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

import java.net.URISyntaxException;
@@ -21,25 +21,70 @@ class EdgeUrlTest {
new EdgeUrl("https://memex.marginalia.nu/#here")
);
}

@Test
public void testParam() throws URISyntaxException {
System.out.println(new EdgeUrl("https://memex.marginalia.nu/index.php?id=1").toString());
System.out.println(new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
}
@Test
void urlencodeFixer() throws URISyntaxException {
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/#heredoc"));
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign"));
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%22-sign"));
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/\n \"huh\""));
void testUriFromString() throws URISyntaxException {
// We test these URLs several times as we perform URLEncode-fixing both when parsing the URL and when
// converting it back to a string, we want to ensure there is no changes along the way.

Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/").getPath());
Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/").toString());
Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/").toString());

Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").getPath());
Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").toString());
Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/#heredoc").toString());

Assertions.assertEquals("/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").getPath());
Assertions.assertEquals("https://www.example.com/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").toString());
Assertions.assertEquals("https://www.example.com/trailingslash/", new EdgeUrl("https://www.example.com/trailingslash/").toString());

Assertions.assertEquals("/%-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").getPath());
Assertions.assertEquals("https://www.example.com/%25-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").toString());
Assertions.assertEquals("https://www.example.com/%25-sign", new EdgeUrl("https://www.example.com/%-sign").toString());

Assertions.assertEquals("/%-sign/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com//%-sign/\"-sign").getPath());
Assertions.assertEquals("https://www.example.com/%25-sign/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com//%-sign/\"-sign").toString());
Assertions.assertEquals("https://www.example.com/%25-sign/%22-sign", new EdgeUrl("https://www.example.com//%-sign/\"-sign").toString());

Assertions.assertEquals("/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").getPath());
Assertions.assertEquals("https://www.example.com/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").toString());
Assertions.assertEquals("https://www.example.com/%22-sign", new EdgeUrl("https://www.example.com/%22-sign").toString());

Assertions.assertEquals("/\n \"huh\"", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").getPath());
Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").toString());
Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", new EdgeUrl("https://www.example.com/\n \"huh\"").toString());

Assertions.assertEquals("/wiki/Sámi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").getPath());
Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").toString());
Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", new EdgeUrl("https://en.wikipedia.org/wiki/Sámi").toString());

Assertions.assertEquals("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k", new EdgeUrl("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k").toString());
}

@Test
void testParms() throws URISyntaxException {
System.out.println(new EdgeUrl("https://search.marginalia.nu/?id=123"));
System.out.println(new EdgeUrl("https://search.marginalia.nu/?t=123"));
System.out.println(new EdgeUrl("https://search.marginalia.nu/?v=123"));
System.out.println(new EdgeUrl("https://search.marginalia.nu/?m=123"));
System.out.println(new EdgeUrl("https://search.marginalia.nu/?follow=123"));
Assertions.assertEquals("id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").param);
Assertions.assertEquals("https://search.marginalia.nu/?id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").toString());

Assertions.assertEquals("t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").param);
Assertions.assertEquals("https://search.marginalia.nu/?t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").toString());

Assertions.assertEquals("v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").param);
Assertions.assertEquals("https://search.marginalia.nu/?v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").toString());

Assertions.assertEquals("id=1", new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").param);
Assertions.assertEquals("https://memex.marginalia.nu/showthread.php?id=1",
new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());

Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").param);
Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").toString());

Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").param);
Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").toString());

Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?m=123").param);
Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?follow=123").param);
}
}
@@ -1,7 +0,0 @@
package nu.marginalia.process.control;

public interface ProcessAdHocTaskHeartbeat extends AutoCloseable {
void progress(String step, int progress, int total);

void close();
}
@@ -1,52 +0,0 @@
package nu.marginalia.process.log;

import lombok.SneakyThrows;
import org.jetbrains.annotations.NotNull;

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.Optional;
import java.util.function.Function;

class WorkLoadIterable<T> implements Iterable<T> {

private final Path logFile;
private final Function<WorkLogEntry, Optional<T>> mapper;

WorkLoadIterable(Path logFile, Function<WorkLogEntry, Optional<T>> mapper) {
this.logFile = logFile;
this.mapper = mapper;
}

@NotNull
@Override
@SneakyThrows
public Iterator<T> iterator() {
var stream = Files.lines(logFile);
return new Iterator<>() {
final Iterator<T> iter = stream
.filter(WorkLogEntry::isJobId)
.map(WorkLogEntry::parse)
.map(mapper)
.filter(Optional::isPresent)
.map(Optional::get)
.iterator();

@Override
public boolean hasNext() {
if (iter.hasNext()) {
return true;
} else {
stream.close();
return false;
}
}

@Override
public T next() {
return iter.next();
}
};
}
}
@@ -1,4 +0,0 @@
# Process

Basic functionality for a Process. Processes must include this dependency to ensure
their loggers are configured properly!
@@ -1,9 +0,0 @@
log4j2.isThreadContextMapInheritable=true
status = info
appender.console.type = Console
appender.console.name = LogToConsole
appender.console.layout.type = PatternLayout
appender.console.layout.pattern = %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %c{1}- %msg{nolookups}%n
appender.console.filter.http.type = MarkerFilter
rootLogger.level = info
rootLogger.appenderRef.console.ref = LogToConsole
@@ -4,12 +4,12 @@ import com.github.jknack.handlebars.*;
import com.github.jknack.handlebars.helper.ConditionalHelpers;
import com.github.jknack.handlebars.io.ClassPathTemplateLoader;
import com.github.jknack.handlebars.io.TemplateLoader;
import lombok.SneakyThrows;
import nu.marginalia.renderer.config.HandlebarsConfigurator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.List;
import java.util.Map;

@@ -42,22 +42,35 @@ public class MustacheRenderer<T> {
}
}

@SneakyThrows
public String render(T model) {
return template.apply(model);
try {
return template.apply(model);
}
catch (IOException ex) {
throw new RuntimeException("Failed to render template", ex);
}
}

@SneakyThrows
public <T2> String render(T model, String name, List<T2> children) {
Context ctx = Context.newBuilder(model).combine(name, children).build();

return template.apply(ctx);
try {
return template.apply(ctx);
}
catch (IOException ex) {
throw new RuntimeException("Failed to render template", ex);
}
}

@SneakyThrows
public String render(T model, Map<String, ?> children) {
Context ctx = Context.newBuilder(model).combine(children).build();
return template.apply(ctx);

try {
return template.apply(ctx);
}
catch (IOException ex) {
throw new RuntimeException("Failed to render template", ex);
}
}

}
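The render() changes above trade Lombok's @SneakyThrows for an explicit catch that wraps the checked IOException in a RuntimeException, keeping the public signatures unchecked. A hypothetical caller, with the factory and template name invented for illustration:

    MustacheRenderer<MyModel> renderer = rendererFactory.renderer("mytemplate");  // factory call is assumed, not part of this diff
    String html = renderer.render(myModel);
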
@@ -42,6 +42,12 @@ dependencies {
implementation libs.bundles.curator
implementation libs.bundles.flyway

libs.bundles.jooby.get().each {
implementation dependencies.create(it) {
exclude group: 'org.slf4j'
}
}

testImplementation libs.bundles.slf4j.test
implementation libs.bundles.mariadb

@@ -1,4 +1,4 @@
package nu.marginalia;
package nu.marginalia.process;

import java.util.UUID;

@@ -1,4 +1,4 @@
package nu.marginalia;
package nu.marginalia.process;

import com.google.inject.AbstractModule;
import com.google.inject.name.Names;
@@ -0,0 +1,102 @@
package nu.marginalia.process;

import com.google.gson.Gson;
import nu.marginalia.mq.MessageQueueFactory;
import nu.marginalia.mq.MqMessage;
import nu.marginalia.mq.inbox.MqInboxResponse;
import nu.marginalia.mq.inbox.MqSingleShotInbox;
import nu.marginalia.service.ConfigLoader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.sql.SQLException;
import java.util.Optional;
import java.util.UUID;
import java.util.concurrent.TimeUnit;

public abstract class ProcessMainClass {
private static final Logger logger = LoggerFactory.getLogger(ProcessMainClass.class);

private final MessageQueueFactory messageQueueFactory;
private final int node;
private final String inboxName;

static {
// Load global config ASAP
ConfigLoader.loadConfig(
ConfigLoader.getConfigPath("system")
);
}

private final Gson gson;

public ProcessMainClass(MessageQueueFactory messageQueueFactory,
ProcessConfiguration config,
Gson gson,
String inboxName
) {
this.gson = gson;
new org.mariadb.jdbc.Driver();
this.messageQueueFactory = messageQueueFactory;
this.node = config.node();
this.inboxName = inboxName;
}

protected <T> Instructions<T> fetchInstructions(Class<T> requestType) throws Exception {

var inbox = messageQueueFactory.createSingleShotInbox(inboxName, node, UUID.randomUUID());

logger.info("Waiting for instructions");

var msgOpt = getMessage(inbox, requestType.getSimpleName());
var msg = msgOpt.orElseThrow(() -> new RuntimeException("No message received"));

// for live crawl, request is empty for now
T request = gson.fromJson(msg.payload(), requestType);

return new Instructions<>(msg, inbox, request);
}

private Optional<MqMessage> getMessage(MqSingleShotInbox inbox, String expectedFunction) throws InterruptedException, SQLException {
var opt = inbox.waitForMessage(30, TimeUnit.SECONDS);
if (opt.isPresent()) {
if (!opt.get().function().equals(expectedFunction)) {
throw new RuntimeException("Unexpected function: " + opt.get().function());
}
return opt;
}
else {
var stolenMessage = inbox.stealMessage(msg -> msg.function().equals(expectedFunction));
stolenMessage.ifPresent(mqMessage -> logger.info("Stole message {}", mqMessage));
return stolenMessage;
}
}

protected static class Instructions<T> {
private final MqMessage message;
private final MqSingleShotInbox inbox;
private final T value;
Instructions(MqMessage message, MqSingleShotInbox inbox, T value)
{
this.message = message;
this.inbox = inbox;
this.value = value;
}

public T value() {
return value;
}

public void ok() {
inbox.sendResponse(message, MqInboxResponse.ok());
}
public void err() {
inbox.sendResponse(message, MqInboxResponse.err());
}

}

}
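A sketch of how a subclass might drive the fetch-and-acknowledge cycle above; the request type and work method are invented for illustration:

    var instructions = fetchInstructions(CrawlRequest.class);  // blocks until a message arrives
    try {
        doWork(instructions.value());
        instructions.ok();    // positive response back onto the message queue
    }
    catch (Exception ex) {
        instructions.err();   // negative response on failure
    }
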
@@ -3,6 +3,8 @@ package nu.marginalia.process.control;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Collection;

/** Dummy implementation of ProcessHeartbeat that does nothing */
public class FakeProcessHeartbeat implements ProcessHeartbeat {
private static final Logger logger = LoggerFactory.getLogger(FakeProcessHeartbeat.class);
@@ -30,6 +32,11 @@ public class FakeProcessHeartbeat implements ProcessHeartbeat {
logger.info("Progress: {}, {}/{}", step, progress, total);
}

@Override
public <T> Iterable<T> wrap(String step, Collection<T> collection) {
return collection;
}

@Override
public void close() {}
};
@@ -0,0 +1,12 @@
package nu.marginalia.process.control;

import java.util.Collection;

public interface ProcessAdHocTaskHeartbeat extends AutoCloseable {
void progress(String step, int progress, int total);

/** Wrap a collection to provide heartbeat progress updates as it's iterated through */
<T> Iterable<T> wrap(String step, Collection<T> collection);

void close();
}
@@ -2,11 +2,13 @@ package nu.marginalia.process.control;

import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.ProcessConfiguration;
import nu.marginalia.process.ProcessConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.sql.SQLException;
import java.util.Collection;
import java.util.Iterator;
import java.util.UUID;
import java.util.concurrent.TimeUnit;

@@ -57,16 +59,42 @@ public class ProcessAdHocTaskHeartbeatImpl implements AutoCloseable, ProcessAdHo
*/
@Override
public void progress(String step, int stepProgress, int stepCount) {
int lastProgress = this.progress;
this.step = step;

// off by one since we calculate the progress based on the number of steps,
// and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
// final progress being 80% and not 100%)

this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);

logger.info("ProcessTask {} progress: {}%", taskBase, progress);
if (this.progress / 10 != lastProgress / 10) {
logger.info("ProcessTask {} progress: {}%", taskBase, progress);
}
}

/** Wrap a collection to provide heartbeat progress updates as it's iterated through */
@Override
public <T> Iterable<T> wrap(String step, Collection<T> collection) {
return () -> new Iterator<>() {
private final Iterator<T> base = collection.iterator();
private final int size = collection.size();
private final int updateInterval = Math.max(1, size / 100); // update every 1% of the collection, or at least once
private int pos = 0;

@Override
public boolean hasNext() {
boolean ret = base.hasNext();
if (!ret) {
progress(step, size, size);
}
return ret;
}

@Override
public T next() {
// update every 1% of the collection, to avoid hammering the database with updates
if (pos++ % updateInterval == 0) {
progress(step, pos, size);
}
return base.next();
}
};
}

public void shutDown() {
@@ -185,6 +213,5 @@ public class ProcessAdHocTaskHeartbeatImpl implements AutoCloseable, ProcessAdHo
public void close() {
shutDown();
}

}
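A sketch of the intended use of wrap() above, reporting progress as a collection is consumed; the heartbeat factory call is assumed rather than shown in this diff:

    try (ProcessAdHocTaskHeartbeat heartbeat = processHeartbeat.createAdHocTaskHeartbeat("converting")) {
        for (var item : heartbeat.wrap("items", items)) {
            process(item);  // progress rows are written roughly once per 1% of the collection
        }
    }
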
@@ -0,0 +1,59 @@
package nu.marginalia.process.control;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.process.ProcessConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.sql.SQLException;
import java.util.Objects;
import java.util.UUID;

@Singleton
public class ProcessEventLog {
private final HikariDataSource dataSource;

private final Logger logger = LoggerFactory.getLogger(ProcessEventLog.class);

private final String serviceName;
private final UUID instanceUuid;
private final String serviceBase;

@Inject
public ProcessEventLog(HikariDataSource dataSource, ProcessConfiguration configuration) {
this.dataSource = dataSource;

this.serviceName = configuration.processName() + ":" + configuration.node();
this.instanceUuid = configuration.instanceUuid();
this.serviceBase = configuration.processName();

logger.info("Starting service {} instance {}", serviceName, instanceUuid);

logEvent("PCS-START", serviceName);
}

public void logEvent(Class<?> type, String message) {
logEvent(type.getSimpleName(), message);
}
public void logEvent(String type, String message) {

try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
INSERT INTO SERVICE_EVENTLOG(SERVICE_NAME, SERVICE_BASE, INSTANCE, EVENT_TYPE, EVENT_MESSAGE)
VALUES (?, ?, ?, ?, ?)
""")) {
stmt.setString(1, serviceName);
stmt.setString(2, serviceBase);
stmt.setString(3, instanceUuid.toString());
stmt.setString(4, type);
stmt.setString(5, Objects.requireNonNull(message, ""));

stmt.executeUpdate();
}
catch (SQLException ex) {
logger.error("Failed to log event {}:{}", type, message);
}
}
}
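A minimal sketch of logging through the class above; the event type and message are illustrative:

    eventLog.logEvent("CRAWLER-DONE", "Finished crawling node 1");
    // Each call inserts one row into SERVICE_EVENTLOG; SQL failures are logged and swallowed.
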
@@ -4,17 +4,18 @@ package nu.marginalia.process.control;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.ProcessConfiguration;
import nu.marginalia.process.ProcessConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.Closeable;
import java.sql.SQLException;
import java.util.concurrent.TimeUnit;

/** This service sends a heartbeat to the database every 5 seconds.
*/
@Singleton
public class ProcessHeartbeatImpl implements ProcessHeartbeat {
public class ProcessHeartbeatImpl implements ProcessHeartbeat, Closeable {
private final Logger logger = LoggerFactory.getLogger(ProcessHeartbeatImpl.class);
private final String processName;
private final String processBase;
@@ -169,5 +170,9 @@ public class ProcessHeartbeatImpl implements ProcessHeartbeat {
}
}
}

public void close() {
shutDown();
}
}

@@ -2,7 +2,7 @@ package nu.marginalia.process.control;

import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.ProcessConfiguration;
import nu.marginalia.process.ProcessConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@@ -0,0 +1,56 @@
package nu.marginalia.process.log;

import org.jetbrains.annotations.NotNull;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.Optional;
import java.util.function.Function;

class WorkLoadIterable<T> implements Iterable<T> {

private final Path logFile;
private final Function<WorkLogEntry, Optional<T>> mapper;

WorkLoadIterable(Path logFile, Function<WorkLogEntry, Optional<T>> mapper) {
this.logFile = logFile;
this.mapper = mapper;
}

@NotNull
@Override
public Iterator<T> iterator() {
try {
var stream = Files.lines(logFile);
return new Iterator<>() {
final Iterator<T> iter = stream
.filter(WorkLogEntry::isJobId)
.map(WorkLogEntry::parse)
.map(mapper)
.filter(Optional::isPresent)
.map(Optional::get)
.iterator();

@Override
public boolean hasNext() {
if (iter.hasNext()) {
return true;
} else {
stream.close();
return false;
}
}

@Override
public T next() {
return iter.next();
}
};
}
catch (IOException ex) {
throw new RuntimeException(ex);
}
}
}
@@ -10,7 +10,9 @@ import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.LocalDateTime;
import java.util.*;
import java.util.HashSet;
import java.util.Optional;
import java.util.Set;
import java.util.function.Function;

/** WorkLog is a journal of work done by a process,
@@ -61,6 +63,12 @@ public class WorkLog implements AutoCloseable, Closeable {
return new WorkLoadIterable<>(logFile, mapper);
}

public static int countEntries(Path crawlerLog) throws IOException{
try (var linesStream = Files.lines(crawlerLog)) {
return (int) linesStream.filter(WorkLogEntry::isJobId).count();
}
}

// Use synchro over concurrent set to avoid competing writes
// - correct is better than fast here, it's sketchy enough to use
// a PrintWriter
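A one-line sketch of the new helper above, counting finished jobs in a crawler log without keeping the file in memory:

    int nDone = WorkLog.countEntries(Path.of("crawler.log"));  // counts the lines WorkLogEntry.isJobId() accepts
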
@@ -33,6 +33,6 @@ public record WorkLogEntry(String id, String ts, String path, int cnt) {
|
||||
|
||||
String relPath = fileName();
|
||||
|
||||
return STR."\{relPath.substring(0, 2)}/\{relPath.substring(2, 4)}/\{relPath}";
|
||||
return relPath.substring(0, 2) + "/" + relPath.substring(2, 4) + "/" + relPath;
|
||||
}
|
||||
}
|
@@ -9,11 +9,11 @@ import java.util.Properties;
|
||||
|
||||
public class ConfigLoader {
|
||||
|
||||
static Path getConfigPath(String configName) {
|
||||
public static Path getConfigPath(String configName) {
|
||||
return WmsaHome.getHomePath().resolve("conf/properties/" + configName + ".properties");
|
||||
}
|
||||
|
||||
static void loadConfig(Path configPath) {
|
||||
public static void loadConfig(Path configPath) {
|
||||
if (!Files.exists(configPath)) {
|
||||
System.err.println("No config file found at " + configPath);
|
||||
return;
|
||||
|
@@ -2,7 +2,6 @@ package nu.marginalia.service;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@@ -26,7 +25,6 @@ public class NodeConfigurationWatcher {
|
||||
watcherThread.start();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private void pollConfiguration() {
|
||||
for (;;) {
|
||||
List<Integer> goodNodes = new ArrayList<>();
|
||||
@@ -34,7 +32,7 @@ public class NodeConfigurationWatcher {
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT ID FROM NODE_CONFIGURATION
|
||||
WHERE ACCEPT_QUERIES AND NOT DISABLED
|
||||
WHERE ACCEPT_QUERIES AND NOT DISABLED
|
||||
""");
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
@@ -47,7 +45,12 @@ public class NodeConfigurationWatcher {
|
||||
|
||||
queryNodes = goodNodes;
|
||||
|
||||
TimeUnit.SECONDS.sleep(10);
|
||||
try {
|
||||
TimeUnit.SECONDS.sleep(10);
|
||||
}
|
||||
catch (InterruptedException ex) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -1,20 +0,0 @@
|
||||
package nu.marginalia.service;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
public abstract class ProcessMainClass {
|
||||
private static final Logger logger = LoggerFactory.getLogger(ProcessMainClass.class);
|
||||
|
||||
static {
|
||||
// Load global config ASAP
|
||||
ConfigLoader.loadConfig(
|
||||
ConfigLoader.getConfigPath("system")
|
||||
);
|
||||
}
|
||||
|
||||
public ProcessMainClass() {
|
||||
new org.mariadb.jdbc.Driver();
|
||||
}
|
||||
|
||||
}
|
@@ -12,7 +12,11 @@ public enum ServiceId {
|
||||
Control("control-service"),
|
||||
|
||||
Dating("dating-service"),
|
||||
Explorer("explorer-service");
|
||||
Status("setatus-service"),
|
||||
Explorer("explorer-service"),
|
||||
|
||||
NOT_A_SERVICE("NOT_A_SERVICE")
|
||||
;
|
||||
|
||||
public final String serviceName;
|
||||
|
||||
|
@@ -4,13 +4,13 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import io.grpc.ManagedChannel;
|
||||
import io.grpc.ManagedChannelBuilder;
|
||||
import nu.marginalia.util.NamedExecutorFactory;
|
||||
import nu.marginalia.service.NodeConfigurationWatcher;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.discovery.property.PartitionTraits;
|
||||
import nu.marginalia.service.discovery.property.ServiceEndpoint.InstanceAddress;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import nu.marginalia.util.NamedExecutorFactory;
|
||||
|
||||
import java.util.concurrent.Executor;
|
||||
import java.util.function.Function;
|
||||
@@ -48,7 +48,12 @@ public class GrpcChannelPoolFactory {
|
||||
public <STUB> GrpcSingleNodeChannelPool<STUB> createSingle(ServiceKey<? extends PartitionTraits.Unicast> key,
|
||||
Function<ManagedChannel, STUB> stubConstructor)
|
||||
{
|
||||
return new GrpcSingleNodeChannelPool<>(serviceRegistryIf, key, this::createChannel, stubConstructor);
|
||||
try {
|
||||
return new GrpcSingleNodeChannelPool<>(serviceRegistryIf, key, this::createChannel, stubConstructor);
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private ManagedChannel createChannel(InstanceAddress route) {
|
||||
|
@@ -1,18 +1,18 @@
|
||||
package nu.marginalia.service.client;
|
||||
|
||||
import io.grpc.ManagedChannel;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.service.NodeConfigurationWatcher;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.discovery.property.PartitionTraits;
|
||||
import nu.marginalia.service.discovery.property.ServiceEndpoint;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.concurrent.*;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.Executor;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.function.BiFunction;
|
||||
import java.util.function.Function;
|
||||
|
||||
@@ -22,14 +22,13 @@ import java.util.function.Function;
|
||||
public class GrpcMultiNodeChannelPool<STUB> {
|
||||
private final ConcurrentHashMap<Integer, GrpcSingleNodeChannelPool<STUB>> pools =
|
||||
new ConcurrentHashMap<>();
|
||||
private static final Logger logger = LoggerFactory.getLogger(GrpcMultiNodeChannelPool.class);
|
||||
|
||||
private final ServiceRegistryIf serviceRegistryIf;
|
||||
private final ServiceKey<? extends PartitionTraits.Multicast> serviceKey;
|
||||
private final Function<ServiceEndpoint.InstanceAddress, ManagedChannel> channelConstructor;
|
||||
private final Function<ManagedChannel, STUB> stubConstructor;
|
||||
private final NodeConfigurationWatcher nodeConfigurationWatcher;
|
||||
|
||||
@SneakyThrows
|
||||
public GrpcMultiNodeChannelPool(ServiceRegistryIf serviceRegistryIf,
|
||||
ServiceKey<ServicePartition.Multi> serviceKey,
|
||||
Function<ServiceEndpoint.InstanceAddress, ManagedChannel> channelConstructor,
|
||||
@@ -52,11 +51,16 @@ public class GrpcMultiNodeChannelPool<STUB> {
|
||||
}
|
||||
|
||||
private GrpcSingleNodeChannelPool<STUB> newSingleChannelPool(int node) {
|
||||
return new GrpcSingleNodeChannelPool<>(
|
||||
serviceRegistryIf,
|
||||
serviceKey.forPartition(ServicePartition.partition(node)),
|
||||
channelConstructor,
|
||||
stubConstructor);
|
||||
try {
|
||||
return new GrpcSingleNodeChannelPool<>(
|
||||
serviceRegistryIf,
|
||||
serviceKey.forPartition(ServicePartition.partition(node)),
|
||||
channelConstructor,
|
||||
stubConstructor);
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/** Get the list of nodes that are eligible for broadcast-style requests */
|
||||
|
@@ -2,7 +2,6 @@ package nu.marginalia.service.client;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import io.grpc.ManagedChannel;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.discovery.monitor.ServiceChangeMonitor;
|
||||
import nu.marginalia.service.discovery.property.PartitionTraits;
|
||||
@@ -11,10 +10,14 @@ import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.slf4j.Marker;
|
||||
import org.slf4j.MarkerFactory;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.*;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.Executor;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
import java.util.function.BiFunction;
|
||||
import java.util.function.Function;
|
||||
@@ -25,18 +28,19 @@ import java.util.function.Function;
|
||||
public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
|
||||
private final Map<InstanceAddress, ConnectionHolder> channels = new ConcurrentHashMap<>();
|
||||
|
||||
private final Marker grpcMarker = MarkerFactory.getMarker("GRPC");
|
||||
private static final Logger logger = LoggerFactory.getLogger(GrpcSingleNodeChannelPool.class);
|
||||
|
||||
private final ServiceRegistryIf serviceRegistryIf;
|
||||
private final Function<InstanceAddress, ManagedChannel> channelConstructor;
|
||||
private final Function<ManagedChannel, STUB> stubConstructor;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public GrpcSingleNodeChannelPool(ServiceRegistryIf serviceRegistryIf,
|
||||
ServiceKey<? extends PartitionTraits.Unicast> serviceKey,
|
||||
Function<InstanceAddress, ManagedChannel> channelConstructor,
|
||||
Function<ManagedChannel, STUB> stubConstructor) {
|
||||
Function<ManagedChannel, STUB> stubConstructor)
|
||||
throws Exception
|
||||
{
|
||||
super(serviceKey);
|
||||
|
||||
this.serviceRegistryIf = serviceRegistryIf;
|
||||
@@ -46,8 +50,6 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
|
||||
serviceRegistryIf.registerMonitor(this);
|
||||
|
||||
onChange();
|
||||
|
||||
awaitChannel(Duration.ofSeconds(5));
|
||||
}
|
||||
|
||||
|
||||
@@ -60,10 +62,10 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
|
||||
for (var route : Sets.symmetricDifference(oldRoutes, newRoutes)) {
|
||||
ConnectionHolder oldChannel;
|
||||
if (newRoutes.contains(route)) {
|
||||
logger.info("Adding route {}", route);
|
||||
logger.info(grpcMarker, "Adding route {} => {}", serviceKey, route);
|
||||
oldChannel = channels.put(route, new ConnectionHolder(route));
|
||||
} else {
|
||||
logger.info("Expelling route {}", route);
|
||||
logger.info(grpcMarker, "Expelling route {} => {}", serviceKey, route);
|
||||
oldChannel = channels.remove(route);
|
||||
}
|
||||
if (oldChannel != null) {
|
||||
@@ -101,7 +103,7 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
|
||||
}
|
||||
|
||||
try {
|
||||
logger.info("Creating channel for {}:{}", serviceKey, address);
|
||||
logger.info(grpcMarker, "Creating channel for {} => {}", serviceKey, address);
|
||||
value = channelConstructor.apply(address);
|
||||
if (channel.compareAndSet(null, value)) {
|
||||
return value;
|
||||
@@ -112,7 +114,7 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
|
||||
}
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error(STR."Failed to get channel for \{address}", e);
|
||||
logger.error(grpcMarker, "Failed to get channel for " + address, e);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
@@ -204,7 +206,7 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
|
||||
}
|
||||
|
||||
for (var e : exceptions) {
|
||||
logger.error("Failed to call service {}", serviceKey, e);
|
||||
logger.error(grpcMarker, "Failed to call service {}", serviceKey, e);
|
||||
}
|
||||
|
||||
throw new ServiceNotAvailableException(serviceKey);
|
||||
|
@@ -4,6 +4,11 @@ import nu.marginalia.service.discovery.property.ServiceKey;

 public class ServiceNotAvailableException extends RuntimeException {
     public ServiceNotAvailableException(ServiceKey<?> key) {
-        super(STR."Service \{key} not available");
+        super(key.toString());
     }
+
+    @Override
+    public StackTraceElement[] getStackTrace() { // Suppress stack trace
+        return new StackTraceElement[0];
+    }
 }
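Overriding getStackTrace() hides the trace when the exception is logged; a related idiom (shown here as an assumption, not something this diff does) is to skip capturing the trace at construction time via the four-argument RuntimeException constructor, which also avoids the capture cost:

    // Hypothetical alternative sketch: a "quiet" exception that never records a stack trace.
    class QuietServiceException extends RuntimeException {
        QuietServiceException(String message) {
            // enableSuppression = false, writableStackTrace = false
            super(message, null, false, false);
        }
    }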
@@ -11,4 +11,14 @@ public class FakeServiceHeartbeat implements ServiceHeartbeat {
             public void close() {}
         };
     }
+
+    @Override
+    public ServiceAdHocTaskHeartbeat createServiceAdHocTaskHeartbeat(String taskName) {
+        return new ServiceAdHocTaskHeartbeat() {
+            @Override
+            public void progress(String step, int stepProgress, int stepCount) {}
+            @Override
+            public void close() {}
+        };
+    }
 }
@@ -0,0 +1,7 @@
+package nu.marginalia.service.control;
+
+public interface ServiceAdHocTaskHeartbeat extends AutoCloseable {
+    void progress(String step, int progress, int total);
+
+    void close();
+}
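A sketch of how the new interface is meant to be consumed (the task name and step count are illustrative); since it extends AutoCloseable, try-with-resources guarantees the heartbeat is shut down even if a step throws:

    void runSteps(ServiceHeartbeat heartbeat) {
        try (ServiceAdHocTaskHeartbeat task = heartbeat.createServiceAdHocTaskHeartbeat("example-task")) {
            int total = 4;
            for (int i = 0; i < total; i++) {
                task.progress("step-" + (i + 1), i + 1, total); // reported as 25%, 50%, 75%, 100%
                // ... perform the actual work for this step ...
            }
        }
    }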
@@ -0,0 +1,187 @@
+package nu.marginalia.service.control;
+
+
+import com.zaxxer.hikari.HikariDataSource;
+import nu.marginalia.service.module.ServiceConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.sql.SQLException;
+import java.util.UUID;
+import java.util.concurrent.TimeUnit;
+
+/** This object sends a heartbeat to the database every few seconds,
+ * updating with the progress of a task within a service. Unlike the enum-based
+ * task heartbeat, progress is reported ad hoc as a free-form step name together
+ * with a step index and count, so the caller is responsible for passing
+ * consistent values to get accurate progress tracking.
+ */
+public class ServiceAdHocTaskHeartbeatImpl implements AutoCloseable, ServiceAdHocTaskHeartbeat {
+    private final Logger logger = LoggerFactory.getLogger(ServiceAdHocTaskHeartbeatImpl.class);
+    private final String taskName;
+    private final String taskBase;
+    private final int node;
+    private final String instanceUUID;
+    private final HikariDataSource dataSource;
+
+
+    private final Thread runnerThread;
+    private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 1);
+    private final String serviceInstanceUUID;
+    private int progress;
+
+    private volatile boolean running = false;
+    private volatile String step = "-";
+
+    ServiceAdHocTaskHeartbeatImpl(ServiceConfiguration configuration,
+                                  String taskName,
+                                  HikariDataSource dataSource)
+    {
+        this.taskName = configuration.serviceName() + "." + taskName + ":" + configuration.node();
+        this.taskBase = configuration.serviceName() + "." + taskName;
+        this.node = configuration.node();
+        this.dataSource = dataSource;
+
+        this.instanceUUID = UUID.randomUUID().toString();
+        this.serviceInstanceUUID = configuration.instanceUuid().toString();
+
+        heartbeatInit();
+
+        runnerThread = new Thread(this::run);
+        runnerThread.start();
+    }
+
+    /** Update the progress of the task. This is a fast function that doesn't block;
+     * the actual update is done in a separate thread.
+     *
+     * @param step The current step in the task.
+     */
+    @Override
+    public void progress(String step, int stepProgress, int stepCount) {
+        int lastProgress = this.progress;
+        this.step = step;
+        this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);
+
+        if (this.progress / 10 != lastProgress / 10) {
+            logger.info("ProcessTask {} progress: {}%", taskBase, progress);
+        }
+    }
+
+    public void shutDown() {
+        if (!running)
+            return;
+
+        running = false;
+
+        try {
+            runnerThread.join();
+            heartbeatStop();
+        }
+        catch (InterruptedException|SQLException ex) {
+            logger.warn("ServiceHeartbeat shutdown failed", ex);
+        }
+    }
+
+    private void run() {
+        if (!running)
+            running = true;
+        else
+            return;
+
+        try {
+            while (running) {
+                try {
+                    heartbeatUpdate();
+                }
+                catch (SQLException ex) {
+                    logger.warn("ServiceHeartbeat failed to update", ex);
+                }
+
+                TimeUnit.SECONDS.sleep(heartbeatInterval);
+            }
+        }
+        catch (InterruptedException ex) {
+            logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex);
+            System.exit(255);
+        }
+    }
+
+    private void heartbeatInit() {
+        try (var connection = dataSource.getConnection()) {
+            try (var stmt = connection.prepareStatement(
+                    """
+                    INSERT INTO TASK_HEARTBEAT (TASK_NAME, TASK_BASE, NODE, INSTANCE, SERVICE_INSTANCE, HEARTBEAT_TIME, STATUS)
+                    VALUES (?, ?, ?, ?, ?, CURRENT_TIMESTAMP(6), 'STARTING')
+                    ON DUPLICATE KEY UPDATE
+                        INSTANCE = ?,
+                        SERVICE_INSTANCE = ?,
+                        HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
+                        STATUS = 'STARTING'
+                    """
+            ))
+            {
+                stmt.setString(1, taskName);
+                stmt.setString(2, taskBase);
+                stmt.setInt(3, node);
+                stmt.setString(4, instanceUUID);
+                stmt.setString(5, serviceInstanceUUID);
+                stmt.setString(6, instanceUUID);
+                stmt.setString(7, serviceInstanceUUID);
+                stmt.executeUpdate();
+            }
+        }
+        catch (SQLException ex) {
+            logger.error("ServiceHeartbeat failed to initialize", ex);
+            throw new RuntimeException(ex);
+        }
+
+    }
+
+    private void heartbeatUpdate() throws SQLException {
+        try (var connection = dataSource.getConnection()) {
+            try (var stmt = connection.prepareStatement(
+                    """
+                    UPDATE TASK_HEARTBEAT
+                    SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
+                        STATUS = 'RUNNING',
+                        PROGRESS = ?,
+                        STAGE_NAME = ?
+                    WHERE INSTANCE = ?
+                    """)
+            )
+            {
+                stmt.setInt(1, progress);
+                stmt.setString(2, step);
+                stmt.setString(3, instanceUUID);
+                stmt.executeUpdate();
+            }
+        }
+    }
+
+    private void heartbeatStop() throws SQLException {
+        try (var connection = dataSource.getConnection()) {
+            try (var stmt = connection.prepareStatement(
+                    """
+                    UPDATE TASK_HEARTBEAT
+                    SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
+                        STATUS = 'STOPPED',
+                        PROGRESS = ?,
+                        STAGE_NAME = ?
+                    WHERE INSTANCE = ?
+                    """)
+            )
+            {
+                stmt.setInt(1, progress);
+                stmt.setString(2, step);
+                stmt.setString(3, instanceUUID);
+                stmt.executeUpdate();
+            }
+        }
+    }
+
+    @Override
+    public void close() {
+        shutDown();
+    }
+
+}
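The progress() method above deliberately logs only when the percentage crosses into a new decile, so a task with thousands of steps produces at most ten progress lines. Worked through with concrete numbers:

    // progress / 10 != lastProgress / 10 is true only on decile crossings:
    //   9% -> 10% :  9/10 == 0, 10/10 == 1  -> logs
    //  10% -> 14% : 10/10 == 1, 14/10 == 1  -> silent
    //  14% -> 20% : 14/10 == 1, 20/10 == 2  -> logs
    int lastProgress = 9;
    int progress = (int) Math.round(100. * 1 / (double) 10); // == 10
    boolean logsNow = progress / 10 != lastProgress / 10;    // true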
@@ -5,4 +5,5 @@ import com.google.inject.ImplementedBy;
 @ImplementedBy(ServiceHeartbeatImpl.class)
 public interface ServiceHeartbeat {
     <T extends Enum<T>> ServiceTaskHeartbeat<T> createServiceTaskHeartbeat(Class<T> steps, String processName);
+    ServiceAdHocTaskHeartbeat createServiceAdHocTaskHeartbeat(String taskName);
 }
@@ -54,6 +54,11 @@ public class ServiceHeartbeatImpl implements ServiceHeartbeat {
         return new ServiceTaskHeartbeatImpl<>(steps, configuration, processName, eventLog, dataSource);
     }

+    @Override
+    public ServiceAdHocTaskHeartbeat createServiceAdHocTaskHeartbeat(String taskName) {
+        return new ServiceAdHocTaskHeartbeatImpl(configuration, taskName, dataSource);
+    }
+

     public void start() {
         if (!running) {
@@ -1,17 +1,23 @@
 package nu.marginalia.service.discovery;

-import nu.marginalia.service.discovery.monitor.*;
+import com.google.inject.ImplementedBy;
+import nu.marginalia.service.discovery.monitor.ServiceChangeMonitor;
+import nu.marginalia.service.discovery.monitor.ServiceMonitorIf;
 import nu.marginalia.service.discovery.property.ServiceEndpoint;
-import static nu.marginalia.service.discovery.property.ServiceEndpoint.*;
-
 import nu.marginalia.service.discovery.property.ServiceKey;

+import java.util.Collection;
 import java.util.List;
 import java.util.UUID;
+import java.util.function.BiConsumer;
+import java.util.function.Consumer;
+
+import static nu.marginalia.service.discovery.property.ServiceEndpoint.InstanceAddress;

 /** A service registry that allows services to register themselves and
  * be discovered by other services on the network.
  */
+@ImplementedBy(ZkServiceRegistry.class)
 public interface ServiceRegistryIf {
     /**
      * Register a service with the registry.
@@ -57,4 +63,9 @@ public interface ServiceRegistryIf {
      * </ul>
      * */
    void registerMonitor(ServiceMonitorIf monitor) throws Exception;
+
+    void registerProcess(String processName, int nodeId);
+    void deregisterProcess(String processName, int nodeId);
+    void watchProcess(String processName, int nodeId, Consumer<Boolean> callback) throws Exception;
+    void watchProcessAnyNode(String processName, Collection<Integer> nodes, BiConsumer<Boolean, Integer> callback) throws Exception;
 }
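A short sketch of a consumer of the new watch API (the process name and node id are invented; the callback convention follows the ZkServiceRegistry implementation further down, where true means the process's lock node exists):

    void observe(ServiceRegistryIf registry) throws Exception {
        registry.watchProcess("converter", 1, running -> {
            if (running) System.out.println("converter is up on node 1");
            else         System.out.println("converter went away on node 1");
        });
    }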
@@ -2,11 +2,8 @@ package nu.marginalia.service.discovery;

 import com.google.inject.Inject;
 import com.google.inject.Singleton;
-import lombok.SneakyThrows;
-import nu.marginalia.service.discovery.monitor.*;
+import nu.marginalia.service.discovery.monitor.ServiceMonitorIf;
 import nu.marginalia.service.discovery.property.ServiceEndpoint;
-import static nu.marginalia.service.discovery.property.ServiceEndpoint.*;
-
 import nu.marginalia.service.discovery.property.ServiceKey;
 import org.apache.curator.framework.CuratorFramework;
 import org.apache.curator.utils.ZKPaths;
@@ -18,6 +15,10 @@ import org.slf4j.LoggerFactory;
 import java.nio.charset.StandardCharsets;
 import java.util.*;
 import java.util.concurrent.TimeUnit;
+import java.util.function.BiConsumer;
+import java.util.function.Consumer;
+
+import static nu.marginalia.service.discovery.property.ServiceEndpoint.InstanceAddress;

 /** A versatile service registry that uses ZooKeeper to store service endpoints.
  * It is used to register services and to look up the endpoints of other services.
@@ -37,18 +38,22 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
     private final List<String> livenessPaths = new ArrayList<>();

     @Inject
-    @SneakyThrows
     public ZkServiceRegistry(CuratorFramework curatorFramework) {
-        this.curatorFramework = curatorFramework;
+        try {
+            this.curatorFramework = curatorFramework;

-        curatorFramework.start();
-        if (!curatorFramework.blockUntilConnected(30, TimeUnit.SECONDS)) {
-            throw new IllegalStateException("Failed to connect to zookeeper after 30s");
+            curatorFramework.start();
+            if (!curatorFramework.blockUntilConnected(30, TimeUnit.SECONDS)) {
+                throw new IllegalStateException("Failed to connect to zookeeper after 30s");
+            }
+
+            Runtime.getRuntime().addShutdownHook(
+                    new Thread(this::shutDown, "ZkServiceRegistry shutdown hook")
+            );
+        }
+        catch (Exception ex) {
+            throw new RuntimeException("Failed to start ZkServiceRegistry", ex);
         }
-
-        Runtime.getRuntime().addShutdownHook(
-                new Thread(this::shutDown, "ZkServiceRegistry shutdown hook")
-        );
     }

     @Override
@@ -59,8 +64,8 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
     {
         var endpoint = new ServiceEndpoint(externalAddress, requestPort(externalAddress, key));

-        String path = STR."\{key.toPath()}/\{instanceUUID.toString()}";
-        byte[] payload = STR."\{endpoint.host()}:\{endpoint.port()}".getBytes(StandardCharsets.UTF_8);
+        String path = key.toPath() + "/" + instanceUUID.toString();
+        byte[] payload = (endpoint.host() + ":" + endpoint.port()).getBytes(StandardCharsets.UTF_8);

         logger.info("Registering {} -> {}", path, endpoint);

@@ -72,14 +77,18 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
         return endpoint;
     }

-    @SneakyThrows
     @Override
     public void declareFirstBoot() {
         if (!isFirstBoot()) {
-            curatorFramework.create()
-                    .creatingParentsIfNeeded()
-                    .withMode(CreateMode.PERSISTENT)
-                    .forPath("/first-boot");
+            try {
+                curatorFramework.create()
+                        .creatingParentsIfNeeded()
+                        .withMode(CreateMode.PERSISTENT)
+                        .forPath("/first-boot");
+            }
+            catch (Exception ex) {
+                logger.error("Failed to declare first-boot", ex);
+            }
         }
     }

@@ -109,7 +118,7 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
     @Override
     public void announceInstance(UUID instanceUUID) {
         try {
-            String serviceRoot = STR."/running-instances/\{instanceUUID.toString()}";
+            String serviceRoot = "/running-instances/" + instanceUUID.toString();

             livenessPaths.add(serviceRoot);

@@ -128,7 +137,7 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
     */
    public boolean isInstanceRunning(UUID instanceUUID) {
        try {
-            String serviceRoot = STR."/running-instances/\{instanceUUID.toString()}";
+            String serviceRoot = "/running-instances/" + instanceUUID.toString();
            return null != curatorFramework.checkExists().forPath(serviceRoot);
        }
        catch (Exception ex) {
@@ -165,11 +174,11 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
             curatorFramework.create()
                     .creatingParentsIfNeeded()
                     .withMode(CreateMode.EPHEMERAL)
-                    .forPath(STR."/port-registry/\{externalHost}/\{port}", payload);
+                    .forPath("/port-registry/" + externalHost + "/" + port, payload);
             return port;
         }
         catch (Exception ex) {
-            logger.error(STR."Still negotiating port for \{identifier}");
+            logger.error("Still negotiating port for " + identifier);
         }
     }

@@ -246,6 +255,90 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
                 .forPath("/running-instances");
     }

+    @Override
+    public void registerProcess(String processName, int nodeId) {
+        String path = "/process-locks/" + processName + "/" + nodeId;
+        try {
+            curatorFramework.create()
+                    .creatingParentsIfNeeded()
+                    .withMode(CreateMode.EPHEMERAL)
+                    .forPath(path);
+            livenessPaths.add(path);
+        }
+        catch (Exception ex) {
+            logger.error("Failed to register process {} on node {}", processName, nodeId, ex);
+        }
+    }
+
+    @Override
+    public void deregisterProcess(String processName, int nodeId) {
+        String path = "/process-locks/" + processName + "/" + nodeId;
+        try {
+            curatorFramework.delete().forPath(path);
+            livenessPaths.remove(path);
+        }
+        catch (Exception ex) {
+            logger.error("Failed to deregister process {} on node {}", processName, nodeId, ex);
+        }
+    }
+
+    @Override
+    public void watchProcess(String processName, int nodeId, Consumer<Boolean> callback) throws Exception {
+        String path = "/process-locks/" + processName + "/" + nodeId;
+
+        // first check if the path exists and call the callback accordingly
+        if (curatorFramework.checkExists().forPath(path) != null) {
+            callback.accept(true);
+        }
+        else {
+            callback.accept(false);
+        }
+
+        curatorFramework.watchers().add()
+                .usingWatcher((Watcher) change -> {
+                    Watcher.Event.EventType type = change.getType();
+
+                    if (type == Watcher.Event.EventType.NodeCreated) {
+                        callback.accept(true);
+                    }
+                    if (type == Watcher.Event.EventType.NodeDeleted) {
+                        callback.accept(false);
+                    }
+                })
+                .forPath(path);
+    }
+
+    @Override
+    public void watchProcessAnyNode(String processName, Collection<Integer> nodes, BiConsumer<Boolean, Integer> callback) throws Exception {
+        for (int node : nodes) {
+            String path = "/process-locks/" + processName + "/" + node;
+
+            // first check if the path exists and call the callback accordingly
+            if (curatorFramework.checkExists().forPath(path) != null) {
+                callback.accept(true, node);
+            }
+            else {
+                callback.accept(false, node);
+            }
+
+            curatorFramework.watchers().add()
+                    .usingWatcher((Watcher) change -> {
+                        Watcher.Event.EventType type = change.getType();
+
+                        if (type == Watcher.Event.EventType.NodeCreated) {
+                            callback.accept(true, node);
+                        }
+                        if (type == Watcher.Event.EventType.NodeDeleted) {
+                            callback.accept(false, node);
+                        }
+                    })
+                    .forPath(path);
+        }
+    }
+
     /* Exposed for tests */
     public synchronized void shutDown() {
         if (stopped)
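The process locks above are EPHEMERAL znodes, which is what makes the watch callbacks trustworthy: ZooKeeper deletes an ephemeral node when its owning session ends, so a crashed process deregisters itself without any cleanup code. A condensed sketch using the same Curator calls (the lock path is a placeholder):

    import org.apache.curator.framework.CuratorFramework;
    import org.apache.zookeeper.CreateMode;

    void lockDemo(CuratorFramework curator) throws Exception {
        curator.create()
                .creatingParentsIfNeeded()
                .withMode(CreateMode.EPHEMERAL)
                .forPath("/process-locks/demo/0");
        // If this JVM exits or its session times out, ZooKeeper removes the node
        // and watchers on the path receive a NodeDeleted event.
    }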
@@ -48,5 +48,10 @@ public record ServiceEndpoint(String host, int port) {
         public int port() {
             return endpoint.port();
         }
+
+        @Override
+        public String toString() {
+            return endpoint().host() + ":" + endpoint.port() + " [" + instance + "]";
+        }
     }
 }
@@ -38,7 +38,7 @@ public sealed interface ServiceKey<P extends ServicePartition> {

     record Rest(String name) implements ServiceKey<ServicePartition.None> {
         public String toPath() {
-            return STR."/services/rest/\{name}";
+            return "/services/rest/" + name;
         }

         @Override
@@ -48,13 +48,26 @@ public sealed interface ServiceKey<P extends ServicePartition> {
         {
             throw new UnsupportedOperationException();
         }
+
+        @Override
+        public String toString() {
+            final String shortName;
+
+            int periodIndex = name.lastIndexOf('.');
+
+            if (periodIndex >= 0) shortName = name.substring(periodIndex+1);
+            else shortName = name;
+
+            return "rest:" + shortName;
+        }
+
     }
     record Grpc<P extends ServicePartition>(String name, P partition) implements ServiceKey<P> {
         public String baseName() {
-            return STR."/services/grpc/\{name}";
+            return "/services/grpc/" + name;
         }
         public String toPath() {
-            return STR."/services/grpc/\{name}/\{partition.identifier()}";
+            return "/services/grpc/" + name + "/" + partition.identifier();
         }

         @Override
@@ -64,6 +77,18 @@ public sealed interface ServiceKey<P extends ServicePartition> {
         {
             return new Grpc<>(name, partition);
         }
+
+        @Override
+        public String toString() {
+            final String shortName;
+
+            int periodIndex = name.lastIndexOf('.');
+
+            if (periodIndex >= 0) shortName = name.substring(periodIndex+1);
+            else shortName = name;
+
+            return "grpc:" + shortName + "[" + partition.identifier() + "]";
+        }
     }

 }
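The effect of the two new toString() overrides, worked through on an invented fully-qualified service name:

    String name = "nu.marginalia.index.IndexApi";
    int periodIndex = name.lastIndexOf('.');            // 19
    String shortName = (periodIndex >= 0)
            ? name.substring(periodIndex + 1)           // "IndexApi"
            : name;
    // Rest key                                    -> "rest:IndexApi"
    // Grpc key with partition.identifier() == "1" -> "grpc:IndexApi[1]"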
@@ -5,14 +5,12 @@ import com.google.inject.Provides;
 import com.google.inject.Singleton;
 import com.zaxxer.hikari.HikariConfig;
 import com.zaxxer.hikari.HikariDataSource;
-import lombok.SneakyThrows;
 import nu.marginalia.WmsaHome;
 import org.flywaydb.core.Flyway;
-import org.mariadb.jdbc.Driver;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import javax.sql.DataSource;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.nio.file.Files;
@@ -71,14 +69,12 @@ public class DatabaseModule extends AbstractModule {

     }

-    @SneakyThrows
     @Singleton
     @Provides
     public HikariDataSource provideConnection() {
         return getMariaDB();
     }

-    @SneakyThrows
     private HikariDataSource getMariaDB() {
         var connStr = System.getProperty("db.overrideJdbc", dbProperties.getProperty(DB_CONN_KEY));

@@ -93,7 +89,7 @@ public class DatabaseModule extends AbstractModule {
         config.addDataSourceProperty("prepStmtCacheSize", "250");
         config.addDataSourceProperty("prepStmtCacheSqlLimit", "2048");

-        config.setMaximumPoolSize(5);
+        config.setMaximumPoolSize(Integer.getInteger("db.poolSize", 5));
         config.setMinimumIdle(2);

         config.setMaxLifetime(Duration.ofMinutes(9).toMillis());
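The pool size is now tunable at launch time; Integer.getInteger reads a system property and falls back to the given default when it is absent or unparsable:

    int poolSize = Integer.getInteger("db.poolSize", 5);
    // started with -Ddb.poolSize=16  -> poolSize == 16
    // property not set               -> poolSize == 5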
@@ -6,6 +6,10 @@ import nu.marginalia.service.ServiceId;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.NetworkInterface;
+import java.util.Enumeration;
 import java.util.Objects;
 import java.util.UUID;

@@ -69,6 +73,17 @@ public class ServiceConfigurationModule extends AbstractModule {
             return configuredValue;
         }

+        if (Boolean.getBoolean("system.multiFace")) {
+            try {
+                String localNetworkIp = getLocalNetworkIP();
+                if (null != localNetworkIp) {
+                    return localNetworkIp;
+                }
+            }
+            catch (Exception ex) {
+                logger.warn("Failed to get local network IP", ex);
+            }
+        }
         // If we're in docker, we'll use the hostname
         if (Boolean.getBoolean("service.useDockerHostname")) {
             return System.getenv("HOSTNAME");
@@ -84,10 +99,43 @@ public class ServiceConfigurationModule extends AbstractModule {
     private String getBindAddress() {
         String configuredValue = System.getProperty("service.bind-address");
         if (configuredValue != null) {
             logger.info("Using configured bind address {}", configuredValue);
             return configuredValue;
         }

-        return "127.0.0.1";
+        if (Boolean.getBoolean("system.multiFace")) {
+            try {
+                return Objects.requireNonNullElse(getLocalNetworkIP(), "0.0.0.0");
+            } catch (Exception ex) {
+                logger.warn("Failed to get local network IP, falling back to bind to 0.0.0.0", ex);
+                return "0.0.0.0";
+            }
+        }
+        else {
+            return "0.0.0.0";
+        }
     }

+    public static String getLocalNetworkIP() throws IOException {
+        Enumeration<NetworkInterface> nets = NetworkInterface.getNetworkInterfaces();
+
+        while (nets.hasMoreElements()) {
+            NetworkInterface netif = nets.nextElement();
+            logger.info("Considering network interface {}: Up? {}, Loopback? {}", netif.getDisplayName(), netif.isUp(), netif.isLoopback());
+            if (!netif.isUp() || netif.isLoopback()) {
+                continue;
+            }
+
+            Enumeration<InetAddress> inetAddresses = netif.getInetAddresses();
+            while (inetAddresses.hasMoreElements()) {
+                InetAddress addr = inetAddresses.nextElement();
+                logger.info("Considering address {}: SiteLocal? {}, Loopback? {}", addr.getHostAddress(), addr.isSiteLocalAddress(), addr.isLoopbackAddress());
+                if (addr.isSiteLocalAddress() && !addr.isLoopbackAddress()) {
+                    return addr.getHostAddress();
+                }
+            }
+        }
+        return null;
+    }
+
 }
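isSiteLocalAddress() is what limits the scan above to the RFC 1918 private ranges (10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16); loopback addresses are excluded separately. A quick check with made-up addresses:

    import java.net.InetAddress;
    import java.net.UnknownHostException;

    static void check() throws UnknownHostException {
        InetAddress.getByName("192.168.1.10").isSiteLocalAddress(); // true  -> selected
        InetAddress.getByName("8.8.8.8").isSiteLocalAddress();      // false -> skipped
        InetAddress.getByName("127.0.0.1").isLoopbackAddress();     // true  -> skipped
    }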
@@ -1,7 +1,6 @@
 package nu.marginalia.service.server;

 import com.google.inject.Singleton;
-import lombok.SneakyThrows;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -59,13 +58,17 @@ public class Initialization {
         }
     }

-    @SneakyThrows
     public boolean waitReady() {
-        synchronized (this) {
-            while (!initialized) {
-                wait();
+        try {
+            synchronized (this) {
+                while (!initialized) {
+                    wait();
+                }
+                return true;
             }
-            return true;
         }
+        catch (InterruptedException ex) {
+            throw new RuntimeException("Interrupted while waiting for initialization", ex);
+        }
     }
 }
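For the rewritten waitReady() to ever return, some other thread must flip the flag and notify under the same monitor; a sketch of that producer side (the method name is an assumption, the diff doesn't show it):

    public synchronized void setReady() {
        initialized = true;
        notifyAll(); // wakes every thread parked in waitReady()'s wait()
    }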
@@ -0,0 +1,187 @@
+package nu.marginalia.service.server;
+
+import io.jooby.*;
+import io.prometheus.client.Counter;
+import nu.marginalia.mq.inbox.MqInboxIf;
+import nu.marginalia.service.client.ServiceNotAvailableException;
+import nu.marginalia.service.discovery.property.ServiceEndpoint;
+import nu.marginalia.service.discovery.property.ServiceKey;
+import nu.marginalia.service.discovery.property.ServicePartition;
+import nu.marginalia.service.module.ServiceConfiguration;
+import nu.marginalia.service.server.jte.JteModule;
+import nu.marginalia.service.server.mq.ServiceMqSubscription;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.slf4j.Marker;
+import org.slf4j.MarkerFactory;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.List;
+
+public class JoobyService {
+    private final Logger logger = LoggerFactory.getLogger(getClass());
+
+    // Marker for filtering out sensitive content from the persistent logs
+    private final Marker httpMarker = MarkerFactory.getMarker("HTTP");
+
+    private final Initialization initialization;
+
+    private final static Counter request_counter = Counter.build("wmsa_request_counter", "Request Counter")
+            .labelNames("service", "node")
+            .register();
+    private final static Counter request_counter_good = Counter.build("wmsa_request_counter_good", "Good Requests")
+            .labelNames("service", "node")
+            .register();
+    private final static Counter request_counter_bad = Counter.build("wmsa_request_counter_bad", "Bad Requests")
+            .labelNames("service", "node")
+            .register();
+    private final static Counter request_counter_err = Counter.build("wmsa_request_counter_err", "Error Requests")
+            .labelNames("service", "node")
+            .register();
+    private final String serviceName;
+    private static volatile boolean initialized = false;
+
+    protected final MqInboxIf messageQueueInbox;
+    private final int node;
+    private GrpcServer grpcServer;
+
+    private ServiceConfiguration config;
+    private final List<MvcExtension> joobyServices;
+    private final ServiceEndpoint restEndpoint;
+
+    public JoobyService(BaseServiceParams params,
+                        ServicePartition partition,
+                        List<DiscoverableService> grpcServices,
+                        List<MvcExtension> joobyServices
+                        ) throws Exception {
+
+        this.joobyServices = joobyServices;
+        this.initialization = params.initialization;
+        config = params.configuration;
+        node = config.node();
+
+        String inboxName = config.serviceName();
+        logger.info("Inbox name: {}", inboxName);
+
+        var serviceRegistry = params.serviceRegistry;
+
+        restEndpoint = serviceRegistry.registerService(ServiceKey.forRest(config.serviceId(), config.node()),
+                config.instanceUuid(), config.externalAddress());
+
+        var mqInboxFactory = params.messageQueueInboxFactory;
+        messageQueueInbox = mqInboxFactory.createSynchronousInbox(inboxName, config.node(), config.instanceUuid());
+        messageQueueInbox.subscribe(new ServiceMqSubscription(this));
+
+        serviceName = System.getProperty("service-name");
+
+        initialization.addCallback(params.heartbeat::start);
+        initialization.addCallback(messageQueueInbox::start);
+        initialization.addCallback(() -> params.eventLog.logEvent("SVC-INIT", serviceName + ":" + config.node()));
+        initialization.addCallback(() -> serviceRegistry.announceInstance(config.instanceUuid()));
+
+        Thread.setDefaultUncaughtExceptionHandler((t, e) -> {
+            if (e instanceof ServiceNotAvailableException) {
+                // reduce log spam for this common case
+                logger.error("Service not available: {}", e.getMessage());
+            }
+            else {
+                logger.error("Uncaught exception", e);
+            }
+            request_counter_err.labels(serviceName, Integer.toString(node)).inc();
+        });
+
+        if (!initialization.isReady() && ! initialized ) {
+            initialized = true;
+            grpcServer = new GrpcServer(config, serviceRegistry, partition, grpcServices);
+            grpcServer.start();
+        }
+    }
+
+    public void startJooby(Jooby jooby) {
+
+        logger.info("{} Listening to {}:{} ({})", getClass().getSimpleName(),
+                restEndpoint.host(),
+                restEndpoint.port(),
+                config.externalAddress());
+
+        // FIXME: This won't work outside of docker, may need to submit a PR to jooby to allow classpaths here
+        if (Files.exists(Path.of("/app/resources/jte")) || Files.exists(Path.of("/app/classes/jte-precompiled"))) {
+            jooby.install(new JteModule(Path.of("/app/resources/jte"), Path.of("/app/classes/jte-precompiled")));
+        }
+        if (Files.exists(Path.of("/app/resources/static"))) {
+            jooby.assets("/*", Paths.get("/app/resources/static"));
+        }
+        var options = new ServerOptions();
+        options.setHost(config.bindAddress());
+        options.setPort(restEndpoint.port());
+
+        // Enable gzip compression of response data, but set compression to the lowest level
+        // since it doesn't really save much more space to dial it up. It's typically a
+        // single digit percentage difference since HTML already compresses very well with level = 1.
+        options.setCompressionLevel(1);
+
+        // Set a cap on the number of worker threads, as Jooby's default value does not seem to consider
+        // multi-tenant servers with high thread counts, and spins up an exorbitant number of threads in that
+        // scenario
+        options.setWorkerThreads(Math.min(128, options.getWorkerThreads()));
+
+        jooby.setServerOptions(options);
+
+        jooby.get("/internal/ping", ctx -> "pong");
+        jooby.get("/internal/started", this::isInitialized);
+        jooby.get("/internal/ready", this::isReady);
+
+        for (var service : joobyServices) {
+            jooby.mvc(service);
+        }
+
+        jooby.before(this::auditRequestIn);
+        jooby.after(this::auditRequestOut);
+    }
+
+    private Object isInitialized(Context ctx) {
+        if (initialization.isReady()) {
+            return "ok";
+        }
+        else {
+            ctx.setResponseCode(StatusCode.FAILED_DEPENDENCY_CODE);
+            return "bad";
+        }
+    }
+
+    public boolean isReady() {
+        return true;
+    }
+
+    private String isReady(Context ctx) {
+        if (isReady()) {
+            return "ok";
+        }
+        else {
+            ctx.setResponseCode(StatusCode.FAILED_DEPENDENCY_CODE);
+            return "bad";
+        }
+    }
+
+    private void auditRequestIn(Context ctx) {
+        request_counter.labels(serviceName, Integer.toString(node)).inc();
+    }
+
+    private void auditRequestOut(Context ctx, Object result, Throwable failure) {
+        if (ctx.getResponseCode().value() < 400) {
+            request_counter_good.labels(serviceName, Integer.toString(node)).inc();
+        }
+        else {
+            request_counter_bad.labels(serviceName, Integer.toString(node)).inc();
+        }
+
+        if (failure != null) {
+            logger.error("Request failed " + ctx.getMethod() + " " + ctx.getRequestURL(), failure);
+            request_counter_err.labels(serviceName, Integer.toString(node)).inc();
+        }
+    }
+
+}
@@ -2,28 +2,40 @@ package nu.marginalia.service.server;

 import com.google.inject.Inject;
 import io.prometheus.client.exporter.MetricsServlet;
-import lombok.SneakyThrows;
 import nu.marginalia.service.module.ServiceConfiguration;
 import org.eclipse.jetty.server.Server;
 import org.eclipse.jetty.servlet.ServletContextHandler;
 import org.eclipse.jetty.servlet.ServletHolder;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.net.InetSocketAddress;

 public class MetricsServer {

-    @SneakyThrows
+    private static final Logger logger = LoggerFactory.getLogger(MetricsServer.class);
+
     @Inject
     public MetricsServer(ServiceConfiguration configuration) {
         // If less than zero, we forego setting up a metrics server
         if (configuration.metricsPort() < 0)
             return;

-        Server server = new Server(configuration.metricsPort());
-        ServletContextHandler context = new ServletContextHandler();
-        context.setContextPath("/");
-        server.setHandler(context);
+        try {
+            Server server = new Server(new InetSocketAddress(configuration.bindAddress(), configuration.metricsPort()));

-        context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");
+            ServletContextHandler context = new ServletContextHandler();
+            context.setContextPath("/");
+            server.setHandler(context);

-        server.start();
+            context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");
+
+            logger.info("MetricsServer listening on {}:{}", configuration.bindAddress(), configuration.metricsPort());
+
+            server.start();
+        }
+        catch (Exception|NoSuchMethodError ex) {
+            logger.error("Failed to set up metrics server", ex);
+        }
     }
 }
@@ -1,10 +1,10 @@
 package nu.marginalia.service.server;

-import com.google.inject.name.Named;
 import com.google.inject.Inject;
-import lombok.SneakyThrows;
+import com.google.inject.name.Named;
 import nu.marginalia.mq.persistence.MqPersistence;
 import nu.marginalia.nodecfg.NodeConfigurationService;
+import nu.marginalia.nodecfg.model.NodeProfile;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorageBaseType;
 import org.slf4j.Logger;
@@ -57,7 +57,9 @@ public class NodeStatusWatcher {

     private void setupNode() {
         try {
-            configurationService.create(nodeId, "Node " + nodeId, true, false);
+            NodeProfile profile = NodeProfile.MIXED;
+
+            configurationService.create(nodeId, "Node " + nodeId, true, false, profile);

             fileStorageService.createStorageBase("Index Data", Path.of("/idx"), nodeId, FileStorageBaseType.CURRENT);
             fileStorageService.createStorageBase("Index Backups", Path.of("/backup"), nodeId, FileStorageBaseType.BACKUP);
@@ -81,10 +83,14 @@ public class NodeStatusWatcher {
         }
     }

-    @SneakyThrows
     private boolean isConfigured() {
-        var configuration = configurationService.get(nodeId);
-        return configuration != null;
+        try {
+            var configuration = configurationService.get(nodeId);
+            return configuration != null;
+        }
+        catch (SQLException ex) {
+            throw new RuntimeException(ex);
+        }
     }

     /** Look for changes in the configuration and kill the service if the corresponding
@@ -35,21 +35,8 @@ public class RateLimiter {
     }


-    public static RateLimiter forExpensiveRequest() {
-        return new RateLimiter(5, 10);
-    }
-
     public static RateLimiter custom(int perMinute) {
-        return new RateLimiter(perMinute, 60);
-    }
-
-    public static RateLimiter forSpamBots() {
-        return new RateLimiter(120, 3600);
-    }
-
-
-    public static RateLimiter forLogin() {
-        return new RateLimiter(3, 15);
+        return new RateLimiter(4 * perMinute, perMinute);
     }

     private void cleanIdleBuckets() {
@@ -62,7 +49,7 @@ public class RateLimiter {
     }

     private Bucket createBucket() {
-        var refill = Refill.greedy(1, Duration.ofSeconds(refillRate));
+        var refill = Refill.greedy(refillRate, Duration.ofSeconds(60));
         var bw = Bandwidth.classic(capacity, refill);
         return Bucket.builder().addLimit(bw).build();
     }
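With the old factories gone, custom(perMinute) is the single entry point, and the bucket parameters follow from it directly. Worked through for perMinute = 60, reading capacity and refillRate as the two constructor arguments (a mapping the diff itself doesn't show, so treat it as an assumption):

    // RateLimiter.custom(60) -> new RateLimiter(4 * 60, 60)
    //   capacity   = 240 tokens                       -> burst ceiling of 240 requests
    //   refillRate = 60 tokens per 60 s (Refill.greedy above) -> 1 request/second steady state
    var limiter = RateLimiter.custom(60);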
@@ -1,7 +1,6 @@
 package nu.marginalia.service.server;

 import io.prometheus.client.Counter;
-import lombok.SneakyThrows;
 import nu.marginalia.mq.inbox.MqInboxIf;
 import nu.marginalia.service.client.ServiceNotAvailableException;
 import nu.marginalia.service.discovery.property.ServiceKey;
@@ -17,7 +16,7 @@ import spark.Spark;

 import java.util.List;

-public class Service {
+public class SparkService {
     private final Logger logger = LoggerFactory.getLogger(getClass());

     // Marker for filtering out sensitive content from the persistent logs
@@ -44,11 +43,10 @@
     private final int node;
     private GrpcServer grpcServer;

-    @SneakyThrows
-    public Service(BaseServiceParams params,
-                   Runnable configureStaticFiles,
-                   ServicePartition partition,
-                   List<DiscoverableService> grpcServices) {
+    public SparkService(BaseServiceParams params,
+                        Runnable configureStaticFiles,
+                        ServicePartition partition,
+                        List<DiscoverableService> grpcServices) throws Exception {

         this.initialization = params.initialization;
         var config = params.configuration;
@@ -128,18 +126,18 @@
         }
     }

-    public Service(BaseServiceParams params,
-                   ServicePartition partition,
-                   List<DiscoverableService> grpcServices) {
+    public SparkService(BaseServiceParams params,
+                        ServicePartition partition,
+                        List<DiscoverableService> grpcServices) throws Exception {
         this(params,
-             Service::defaultSparkConfig,
+             SparkService::defaultSparkConfig,
             partition,
             grpcServices);
     }

-    public Service(BaseServiceParams params) {
+    public SparkService(BaseServiceParams params) throws Exception {
         this(params,
-             Service::defaultSparkConfig,
+             SparkService::defaultSparkConfig,
             ServicePartition.any(),
             List.of());
     }
@@ -1,20 +1,18 @@
 package nu.marginalia.service.server;

-import lombok.SneakyThrows;
 import spark.Request;
 import spark.Response;
 import spark.Spark;
 import spark.resource.ClassPathResource;
 import spark.staticfiles.MimeType;

-import java.io.FileNotFoundException;
+import java.io.IOException;
 import java.time.LocalDateTime;
 import java.time.ZoneOffset;

 public class StaticResources {
     private final long startTime = LocalDateTime.now().toEpochSecond(ZoneOffset.UTC);

-    @SneakyThrows
     public void serveStatic(String domain, String path, Request req, Response rsp) {
         try {
             if (path.startsWith("..") || domain.startsWith("..")) {
@@ -28,7 +26,7 @@ public class StaticResources {

             resource.getInputStream().transferTo(rsp.raw().getOutputStream());
         }
-        catch (IllegalArgumentException | FileNotFoundException ex) {
+        catch (IllegalArgumentException | IOException ex) {
             Spark.halt(404);
         }
     }
@@ -57,7 +55,6 @@ public class StaticResources {
         return "application/octet-stream";
     }

-    @SneakyThrows
     private void handleEtagStatic(ClassPathResource resource, Request req, Response rsp) {
         rsp.header("Cache-Control", "public,max-age=3600");
         rsp.type(MimeType.fromResource(resource));
@@ -0,0 +1,61 @@
+package nu.marginalia.service.server.jte;
+
+import edu.umd.cs.findbugs.annotations.NonNull;
+import edu.umd.cs.findbugs.annotations.Nullable;
+import gg.jte.ContentType;
+import gg.jte.TemplateEngine;
+import gg.jte.resolve.DirectoryCodeResolver;
+import io.jooby.*;
+
+import java.io.File;
+import java.nio.file.Path;
+import java.util.List;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.stream.Stream;
+
+// Temporary workaround for a bug
+// APL-2.0 https://github.com/jooby-project/jooby
+public class JteModule implements Extension {
+    private Path sourceDirectory;
+    private Path classDirectory;
+    private TemplateEngine templateEngine;
+
+    public JteModule(@NonNull Path sourceDirectory, @NonNull Path classDirectory) {
+        this.sourceDirectory = (Path)Objects.requireNonNull(sourceDirectory, "Source directory is required.");
+        this.classDirectory = (Path)Objects.requireNonNull(classDirectory, "Class directory is required.");
+    }
+
+    public JteModule(@NonNull Path sourceDirectory) {
+        this.sourceDirectory = (Path)Objects.requireNonNull(sourceDirectory, "Source directory is required.");
+    }
+
+    public JteModule(@NonNull TemplateEngine templateEngine) {
+        this.templateEngine = (TemplateEngine)Objects.requireNonNull(templateEngine, "Template engine is required.");
+    }
+
+    public void install(@NonNull Jooby application) {
+        if (this.templateEngine == null) {
+            this.templateEngine = create(application.getEnvironment(), this.sourceDirectory, this.classDirectory);
+        }
+
+        ServiceRegistry services = application.getServices();
+        services.put(TemplateEngine.class, this.templateEngine);
+        application.encoder(MediaType.html, new JteTemplateEngine(this.templateEngine));
+    }
+
+    public static TemplateEngine create(@NonNull Environment environment, @NonNull Path sourceDirectory, @Nullable Path classDirectory) {
+        boolean dev = environment.isActive("dev", new String[]{"test"});
+        if (dev) {
+            Objects.requireNonNull(sourceDirectory, "Source directory is required.");
+            Path requiredClassDirectory = (Path)Optional.ofNullable(classDirectory).orElseGet(() -> sourceDirectory.resolve("jte-classes"));
+            TemplateEngine engine = TemplateEngine.create(new DirectoryCodeResolver(sourceDirectory), requiredClassDirectory, ContentType.Html, environment.getClassLoader());
+            Optional<List<String>> var10000 = Optional.ofNullable(System.getProperty("jooby.run.classpath")).map((it) -> it.split(File.pathSeparator)).map(Stream::of).map(Stream::toList);
+            Objects.requireNonNull(engine);
+            var10000.ifPresent(engine::setClassPath);
+            return engine;
+        } else {
+            return classDirectory == null ? TemplateEngine.createPrecompiled(ContentType.Html) : TemplateEngine.createPrecompiled(classDirectory, ContentType.Html);
+        }
+    }
+}
@@ -0,0 +1,48 @@
+package nu.marginalia.service.server.jte;
+
+import edu.umd.cs.findbugs.annotations.NonNull;
+import gg.jte.TemplateEngine;
+import io.jooby.Context;
+import io.jooby.MapModelAndView;
+import io.jooby.ModelAndView;
+import io.jooby.buffer.DataBuffer;
+import io.jooby.internal.jte.DataBufferOutput;
+
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.List;
+
+// Temporary workaround for a bug
+// APL-2.0 https://github.com/jooby-project/jooby
+class JteTemplateEngine implements io.jooby.TemplateEngine {
+    private final TemplateEngine jte;
+    private final List<String> extensions;
+
+    public JteTemplateEngine(TemplateEngine jte) {
+        this.jte = jte;
+        this.extensions = List.of(".jte", ".kte");
+    }
+
+
+    @NonNull @Override
+    public List<String> extensions() {
+        return extensions;
+    }
+
+    @Override
+    public DataBuffer render(Context ctx, ModelAndView modelAndView) {
+        var buffer = ctx.getBufferFactory().allocateBuffer();
+        var output = new DataBufferOutput(buffer, StandardCharsets.UTF_8);
+        var attributes = ctx.getAttributes();
+        if (modelAndView instanceof MapModelAndView mapModelAndView) {
+            var mapModel = new HashMap<String, Object>();
+            mapModel.putAll(attributes);
+            mapModel.putAll(mapModelAndView.getModel());
+            jte.render(modelAndView.getView(), mapModel, output);
+        } else {
+            jte.render(modelAndView.getView(), modelAndView.getModel(), output);
+        }
+
+        return buffer;
+    }
+}
@@ -3,7 +3,6 @@ package nu.marginalia.service.server.mq;
 import nu.marginalia.mq.MqMessage;
 import nu.marginalia.mq.inbox.MqInboxResponse;
 import nu.marginalia.mq.inbox.MqSubscription;
-import nu.marginalia.service.server.Service;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -15,10 +14,10 @@ import java.util.Map;
 public class ServiceMqSubscription implements MqSubscription {
     private static final Logger logger = LoggerFactory.getLogger(ServiceMqSubscription.class);
     private final Map<String, Method> requests = new HashMap<>();
-    private final Service service;
+    private final Object service;


-    public ServiceMqSubscription(Service service) {
+    public ServiceMqSubscription(Object service) {
         this.service = service;

         /* Wire up all methods annotated with @MqRequest and @MqNotification
@@ -24,7 +24,7 @@ public class NamedExecutorFactory {

     @Override
     public Thread newThread(@NotNull Runnable r) {
-        var thread = new Thread(r, STR."\{name}[\{threadNumber.getAndIncrement()}]");
+        var thread = new Thread(r, name + "[" + threadNumber.getAndIncrement() + "]");
         thread.setDaemon(true);
         return thread;
     }
@@ -3,8 +3,16 @@
         <Console name="Console" target="SYSTEM_OUT">
             <PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
+            <Filters>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
         </Console>
+        <Console name="ProcessConsole" target="SYSTEM_OUT">
+            <PatternLayout pattern="%style{P}{FG_Cyan} %msg%n"/>
+            <Filters>
+                <MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
+            </Filters>
+        </Console>
         <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
@@ -13,15 +21,29 @@
             <Filters>
+                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
                 <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
             </Filters>
             <SizeBasedTriggeringPolicy size="10MB" />
         </RollingFile>
+        <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
+                     ignoreExceptions="false">
+            <PatternLayout>
+                <Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
+            </PatternLayout>
+            <SizeBasedTriggeringPolicy size="100MB" />
+            <Filters>
+                <MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
+            </Filters>
+        </RollingFile>
     </Appenders>
     <Loggers>
         <Logger name="org.apache.zookeeper" level="WARN" />

+        <Logger name="org.apache.pdfbox" level="ERROR" />
+        <Logger name="org.apache.fontbox.ttf" level="ERROR" />
         <Root level="info">
             <AppenderRef ref="Console"/>
+            <AppenderRef ref="ProcessConsole"/>
             <AppenderRef ref="LogToFile"/>
         </Root>
     </Loggers>
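The new appender pair turns the CRAWLER marker into a routing key: the audit file ALLOWs only CRAWLER events, while every other appender DENYs them, so the two streams stay disjoint. The producing side looks like this (the logger name and message are illustrative):

    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;
    import org.slf4j.Marker;
    import org.slf4j.MarkerFactory;

    class CrawlerAuditLog {
        private static final Logger logger = LoggerFactory.getLogger(CrawlerAuditLog.class);
        private static final Marker CRAWLER = MarkerFactory.getMarker("CRAWLER");

        void fetched(String url) {
            // Lands only in crawler-audit-*.log; filtered from console and service log.
            logger.info(CRAWLER, "fetched {}", url);
        }
    }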
@@ -1,10 +1,49 @@
 <Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
     <Appenders>
-        <Console name="Console" target="SYSTEM_OUT">
-            <PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
+        <Console name="ConsoleInfo" target="SYSTEM_OUT">
+            <PatternLayout pattern="- %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="INFO" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
         </Console>
+        <Console name="ConsoleWarn" target="SYSTEM_OUT">
+            <PatternLayout pattern="⚠ %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="WARN" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleError" target="SYSTEM_OUT">
+            <PatternLayout pattern="🔥 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="ERROR" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleFatal" target="SYSTEM_OUT">
+            <PatternLayout pattern="💀 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="FATAL" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
         <Console name="ProcessConsole" target="SYSTEM_OUT">
             <PatternLayout pattern="%style{%msg%n}{FG_Cyan}"/>
             <Filters>
                 <MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
             </Filters>
         </Console>
         <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
@@ -17,14 +56,30 @@
             <Filters>
+                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
                 <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
             </Filters>
             <SizeBasedTriggeringPolicy size="10MB" />
         </RollingFile>
+        <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
+                     ignoreExceptions="false">
+            <PatternLayout>
+                <Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
+            </PatternLayout>
+            <SizeBasedTriggeringPolicy size="100MB" />
+            <Filters>
+                <MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
+            </Filters>
+        </RollingFile>
     </Appenders>
     <Loggers>
         <Logger name="org.apache.zookeeper" level="WARN" />

+        <Logger name="org.apache.pdfbox" level="ERROR" />
+        <Logger name="org.apache.fontbox.ttf" level="ERROR" />
         <Root level="info">
-            <AppenderRef ref="Console"/>
+            <AppenderRef ref="ConsoleInfo"/>
+            <AppenderRef ref="ConsoleWarn"/>
+            <AppenderRef ref="ConsoleError"/>
+            <AppenderRef ref="ConsoleFatal"/>
+            <AppenderRef ref="ProcessConsole"/>
             <AppenderRef ref="LogToFile"/>
         </Root>
     </Loggers>
@@ -1,15 +1,50 @@
 <Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
     <Appenders>
-        <Console name="Console" target="SYSTEM_OUT">
-            <PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
+        <Console name="ConsoleInfo" target="SYSTEM_OUT">
+            <PatternLayout pattern="- %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="INFO" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
         </Console>
+        <Console name="ConsoleWarn" target="SYSTEM_OUT">
+            <PatternLayout pattern="⚠ %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="WARN" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleError" target="SYSTEM_OUT">
+            <PatternLayout pattern="🔥 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="ERROR" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleFatal" target="SYSTEM_OUT">
+            <PatternLayout pattern="💀 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="FATAL" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
         <Console name="ProcessConsole" target="SYSTEM_OUT">
             <PatternLayout pattern="%style{%msg%n}{FG_Cyan}"/>
             <Filters>
                 <MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
             </Filters>
         </Console>
     </Appenders>
     <Loggers>
         <Logger name="org.apache.zookeeper" level="WARN" />

+        <Logger name="org.apache.pdfbox" level="ERROR" />
+        <Logger name="org.apache.fontbox.ttf" level="ERROR" />
         <Root level="info">
-            <AppenderRef ref="Console"/>
-            <AppenderRef ref="LogToFile"/>
+            <AppenderRef ref="ConsoleInfo"/>
+            <AppenderRef ref="ConsoleWarn"/>
+            <AppenderRef ref="ConsoleError"/>
+            <AppenderRef ref="ConsoleFatal"/>
+            <AppenderRef ref="ProcessConsole"/>
         </Root>
     </Loggers>
 </Configuration>
@@ -1,9 +1,9 @@
 package nu.marginalia.service.discovery;

+import nu.marginalia.service.ServiceId;
 import nu.marginalia.service.discovery.monitor.ServiceMonitorIf;
 import nu.marginalia.service.discovery.property.ServiceKey;
 import nu.marginalia.service.discovery.property.ServicePartition;
-import nu.marginalia.service.ServiceId;
 import nu.marginalia.test.TestApiGrpc;
 import org.apache.curator.framework.CuratorFrameworkFactory;
 import org.apache.curator.retry.ExponentialBackoffRetry;
@@ -25,7 +25,7 @@ import static org.mockito.Mockito.when;
 class ZkServiceRegistryTest {
     private static final int ZOOKEEPER_PORT = 2181;
     private static final GenericContainer<?> zookeeper =
-            new GenericContainer<>("zookeeper:3.8.0")
+            new GenericContainer<>("zookeeper:3.8")
                     .withExposedPorts(ZOOKEEPER_PORT);

     List<ZkServiceRegistry> registries = new ArrayList<>();
@@ -33,7 +33,7 @@ class ZkServiceRegistryTest {
     @BeforeEach
     public void setUp() {
         zookeeper.start();
-        connectString = STR."\{zookeeper.getHost()}:\{zookeeper.getMappedPort(ZOOKEEPER_PORT)}";
+        connectString = zookeeper.getHost() + ":" + zookeeper.getMappedPort(ZOOKEEPER_PORT);
     }

     @AfterEach
@@ -9,23 +9,25 @@ import nu.marginalia.executor.storage.FileStorageFile;
 import nu.marginalia.executor.upload.UploadDirContents;
 import nu.marginalia.executor.upload.UploadDirItem;
 import nu.marginalia.functions.execution.api.*;
+import nu.marginalia.service.ServiceId;
 import nu.marginalia.service.client.GrpcChannelPoolFactory;
 import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
 import nu.marginalia.service.discovery.ServiceRegistryIf;
 import nu.marginalia.service.discovery.property.ServiceKey;
 import nu.marginalia.service.discovery.property.ServicePartition;
-import nu.marginalia.service.ServiceId;
 import nu.marginalia.storage.model.FileStorage;
 import nu.marginalia.storage.model.FileStorageId;

 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import java.net.*;
+import java.net.MalformedURLException;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.net.URLEncoder;
 import java.nio.charset.StandardCharsets;
 import java.util.List;

-import static nu.marginalia.functions.execution.api.ExecutorApiGrpc.*;
+import static nu.marginalia.functions.execution.api.ExecutorApiGrpc.ExecutorApiBlockingStub;

 @Singleton
 public class ExecutorClient {
@@ -163,8 +165,8 @@ public class ExecutorClient {
      * The endpoint is compatible with range requests.
      * */
     public URL remoteFileURL(FileStorage fileStorage, String path) {
-        String uriPath = STR."/transfer/file/\{fileStorage.id()}";
-        String uriQuery = STR."path=\{URLEncoder.encode(path, StandardCharsets.UTF_8)}";
+        String uriPath = "/transfer/file/" + fileStorage.id();
+        String uriQuery = "path=" + URLEncoder.encode(path, StandardCharsets.UTF_8);

         var endpoints = registry.getEndpoints(ServiceKey.forRest(ServiceId.Executor, fileStorage.node()));
         if (endpoints.isEmpty()) {
@@ -180,4 +182,10 @@ public class ExecutorClient {
         }
     }

+    public void restartExecutorService(int node) {
+        channelPool.call(ExecutorApiBlockingStub::restartExecutorService)
+                .forNode(node)
+                .run(Empty.getDefaultInstance());
+    }
+
 }
@@ -3,6 +3,7 @@ package nu.marginalia.executor.client;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import nu.marginalia.functions.execution.api.*;
+import nu.marginalia.mq.persistence.MqPersistence;
 import nu.marginalia.service.client.GrpcChannelPoolFactory;
 import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
 import nu.marginalia.service.discovery.property.ServiceKey;
@@ -11,6 +12,8 @@ import nu.marginalia.storage.model.FileStorageId;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

+import java.time.Duration;
+
 import static nu.marginalia.functions.execution.api.ExecutorExportApiGrpc.ExecutorExportApiBlockingStub;

 @Singleton
@@ -18,47 +21,64 @@ public class ExecutorExportClient {
     private final GrpcMultiNodeChannelPool<ExecutorExportApiBlockingStub> channelPool;
     private static final Logger logger = LoggerFactory.getLogger(ExecutorExportClient.class);

+    private final MqPersistence persistence;
     @Inject
-    public ExecutorExportClient(GrpcChannelPoolFactory grpcChannelPoolFactory)
+    public ExecutorExportClient(GrpcChannelPoolFactory grpcChannelPoolFactory, MqPersistence persistence)
     {
         this.channelPool = grpcChannelPoolFactory
                 .createMulti(
                         ServiceKey.forGrpcApi(ExecutorExportApiGrpc.class, ServicePartition.multi()),
                         ExecutorExportApiGrpc::newBlockingStub);
+        this.persistence = persistence;
     }

+    long createTrackingTokenMsg(String task, int node, Duration ttl) throws Exception {
+        return persistence.sendNewMessage("task-tracking[" + node + "]", "export-client", null, task, "", ttl);
+    }
+
-    public void exportAtags(int node, FileStorageId fid) {
+    public long exportAtags(int node, FileStorageId fid) throws Exception {
+        long msgId = createTrackingTokenMsg("atags", node, Duration.ofHours(6));
         channelPool.call(ExecutorExportApiBlockingStub::exportAtags)
                 .forNode(node)
-                .run(RpcFileStorageId.newBuilder()
+                .run(RpcExportRequest.newBuilder()
                         .setFileStorageId(fid.id())
+                        .setMsgId(msgId)
                         .build());
+        return msgId;
     }

-    public void exportSampleData(int node, FileStorageId fid, int size, String name) {
+    public void exportSampleData(int node, FileStorageId fid, int size, String ctFilter, String name) {
         channelPool.call(ExecutorExportApiBlockingStub::exportSampleData)
                 .forNode(node)
                 .run(RpcExportSampleData.newBuilder()
                         .setFileStorageId(fid.id())
                         .setSize(size)
+                        .setCtFilter(ctFilter)
                         .setName(name)
                         .build());
     }

-    public void exportRssFeeds(int node, FileStorageId fid) {
+    public long exportRssFeeds(int node, FileStorageId fid) throws Exception {
+        long msgId = createTrackingTokenMsg("rss", node, Duration.ofHours(6));
         channelPool.call(ExecutorExportApiBlockingStub::exportRssFeeds)
                 .forNode(node)
-                .run(RpcFileStorageId.newBuilder()
+                .run(RpcExportRequest.newBuilder()
                         .setFileStorageId(fid.id())
+                        .setMsgId(msgId)
                         .build());
+        return msgId;
     }

-    public void exportTermFrequencies(int node, FileStorageId fid) {
+    public long exportTermFrequencies(int node, FileStorageId fid) throws Exception {
+        long msgId = createTrackingTokenMsg("tfreq", node, Duration.ofHours(6));
         channelPool.call(ExecutorExportApiBlockingStub::exportTermFrequencies)
                 .forNode(node)
-                .run(RpcFileStorageId.newBuilder()
+                .run(RpcExportRequest.newBuilder()
                         .setFileStorageId(fid.id())
+                        .setMsgId(msgId)
                         .build());
+        return msgId;
     }

     public void exportData(int node) {
@@ -77,4 +97,21 @@ public class ExecutorExportClient {
     }

+    public void exportAllAtags() {
+        channelPool.call(ExecutorExportApiBlockingStub::exportAllAtags)
+                .forNode(1)
+                .run(Empty.getDefaultInstance());
+    }
+
+    public void exportAllFeeds() {
+        channelPool.call(ExecutorExportApiBlockingStub::exportAllFeeds)
+                .forNode(1)
+                .run(Empty.getDefaultInstance());
+    }
+
+    public void exportAllTfreqs() {
+        channelPool.call(ExecutorExportApiBlockingStub::exportAllTfreqs)
+                .forNode(1)
+                .run(Empty.getDefaultInstance());
+    }
 }
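The export calls now return the id of a tracking message placed in the target node's MQ inbox, and callers can poll that message's state to follow the job. A minimal caller-side sketch, assuming the MqPersistence and MqMessageState APIs used elsewhere in this changeset:

import java.time.Duration;
import nu.marginalia.executor.client.ExecutorExportClient;
import nu.marginalia.mq.MqMessageState;
import nu.marginalia.mq.persistence.MqPersistence;
import nu.marginalia.storage.model.FileStorageId;

class ExportTrackingSketch {
    static void runAndAwait(ExecutorExportClient exportClient,
                            MqPersistence persistence,
                            int nodeId,
                            FileStorageId crawlData) throws Exception {
        long msgId = exportClient.exportRssFeeds(nodeId, crawlData);

        // Poll the tracking message until the export task flips it
        // to a terminal state (OK or an error state).
        for (;;) {
            var msg = persistence.getMessage(msgId);
            if (msg.state().isTerminal()) {
                if (msg.state() != MqMessageState.OK)
                    throw new IllegalStateException("Export failed: " + msg.state());
                return;
            }
            Thread.sleep(Duration.ofSeconds(30));
        }
    }
}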
@@ -17,6 +17,8 @@ service ExecutorApi {
   rpc downloadSampleData(RpcDownloadSampleData) returns (Empty) {}
   rpc calculateAdjacencies(Empty) returns (Empty) {}
   rpc restoreBackup(RpcFileStorageId) returns (Empty) {}
+
+  rpc restartExecutorService(Empty) returns (Empty) {}
 }

 service ExecutorCrawlApi {
@@ -37,15 +39,20 @@ service ExecutorSideloadApi {
 }

 service ExecutorExportApi {
-  rpc exportAtags(RpcFileStorageId) returns (Empty) {}
+  rpc exportAtags(RpcExportRequest) returns (Empty) {}
   rpc exportSegmentationModel(RpcExportSegmentationModel) returns (Empty) {}
   rpc exportSampleData(RpcExportSampleData) returns (Empty) {}
-  rpc exportRssFeeds(RpcFileStorageId) returns (Empty) {}
-  rpc exportTermFrequencies(RpcFileStorageId) returns (Empty) {}
+  rpc exportRssFeeds(RpcExportRequest) returns (Empty) {}
+  rpc exportTermFrequencies(RpcExportRequest) returns (Empty) {}
   rpc exportData(Empty) returns (Empty) {}
+
+  rpc exportAllAtags(Empty) returns (Empty) {}
+  rpc exportAllFeeds(Empty) returns (Empty) {}
+  rpc exportAllTfreqs(Empty) returns (Empty) {}
 }

 message Empty {}

 message RpcFsmName {
   string actorName = 1;
 }
@@ -55,6 +62,10 @@ message RpcProcessId {
 message RpcFileStorageId {
   int64 fileStorageId = 1;
 }
+message RpcExportRequest {
+  int64 fileStorageId = 1;
+  int64 msgId = 2;
+}
 message RpcFileStorageIdWithDomainName {
   int64 fileStorageId = 1;
   string targetDomainName = 2;
@@ -89,6 +100,7 @@ message RpcExportSampleData {
   int64 fileStorageId = 1;
   int32 size = 2;
   string name = 3;
+  string ctFilter = 4;
 }
 message RpcDownloadSampleData {
   string sampleSet = 1;
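Swapping RpcFileStorageId for RpcExportRequest is wire-compatible in proto3: both encode fileStorageId as field 1, and the new int64 msgId simply reads as zero when an older client omits it. A small illustrative check (hypothetical usage, not from the changeset):

// An RpcExportRequest built without a tracking token carries msgId == 0,
// proto3's default for an unset int64, so a server can treat 0 as "untracked".
RpcExportRequest req = RpcExportRequest.newBuilder()
        .setFileStorageId(123L)
        .build();
assert req.getMsgId() == 0L;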
@@ -15,15 +15,16 @@ dependencies {
     // These look weird but they're needed to be able to spawn the processes
     // from the executor service

-    implementation project(':code:processes:website-adjacencies-calculator')
+    implementation project(':code:processes:export-task-process')
     implementation project(':code:processes:crawling-process')
     implementation project(':code:processes:live-crawling-process')
     implementation project(':code:processes:loading-process')
     implementation project(':code:processes:ping-process')
     implementation project(':code:processes:converting-process')
     implementation project(':code:processes:index-constructor-process')

     implementation project(':code:common:config')
     implementation project(':code:common:model')
     implementation project(':code:common:process')
     implementation project(':code:common:db')
     implementation project(':code:common:linkdb')

@@ -35,13 +36,14 @@ dependencies {
     implementation project(':code:libraries:term-frequency-dict')

     implementation project(':code:functions:link-graph:api')
     implementation project(':code:functions:live-capture:api')
     implementation project(':code:functions:search-query')
     implementation project(':code:functions:nsfw-domain-filter')
     implementation project(':code:execution:api')

     implementation project(':code:processes:crawling-process:model')
     implementation project(':code:processes:crawling-process:ft-link-parser')
     implementation project(':code:execution:data-extractors')
     implementation project(':code:index:index-journal')
     implementation project(':code:index:api')
     implementation project(':code:processes:process-mq-api')
@@ -1,104 +0,0 @@
package nu.marginalia.extractor;

import com.google.inject.Inject;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.process.log.WorkLogEntry;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;
import nu.marginalia.storage.model.FileStorageId;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.nio.file.StandardOpenOption;
import java.nio.file.attribute.PosixFilePermissions;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.compress.utils.IOUtils;

public class SampleDataExporter {
    private final FileStorageService storageService;

    @Inject
    public SampleDataExporter(FileStorageService storageService) {
        this.storageService = storageService;
    }

    public void export(FileStorageId crawlId, FileStorageId destId, int size, String name) throws SQLException, IOException {
        FileStorage destStorage = storageService.getStorage(destId);
        Path inputDir = storageService.getStorage(crawlId).asPath();

        Path crawlerLogFile = inputDir.resolve("crawler.log");

        List<WorkLogEntry> entriesAll = new ArrayList<>(100_000);

        for (var item : WorkLog.iterable(crawlerLogFile)) {
            if (item.cnt() < 2) // this one's too small
                continue;
            if (item.cnt() > 5000) // this one's too big
                continue;
            if (item.relPath().length() > 90) // this one's too long
                continue; // TAR file name limit is 100, but we add some extra for good measure

            // this one's just right
            entriesAll.add(item);
        }

        if (entriesAll.size() > size) {
            Collections.shuffle(entriesAll);
            entriesAll = entriesAll.subList(0, size);
        }

        Path newCrawlerLogFile = Files.createTempFile(destStorage.asPath(), "crawler", ".log",
                PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
        try (var bw = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
            for (var item : entriesAll) {
                bw.write(STR."\{item.id()} \{item.ts()} \{item.relPath()} \{item.cnt()}\n");
            }
        }

        Path newManifestJsonFile = Files.createTempFile(destStorage.asPath(), "manifest", ".json",
                PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
        Files.writeString(newManifestJsonFile, STR."""
            { "description": "\{name.replace("[\"\\]", "_")}",
              "type": "CRAWL_DATA" }
            """);

        var tmpTarFile = Files.createTempFile(destStorage.asPath(), "data", ".tar",
                PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));

        try (var stream = new TarArchiveOutputStream(Files.newOutputStream(tmpTarFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))) {
            for (var item : entriesAll) {
                Path crawlDataPath = inputDir.resolve(item.relPath());
                if (!Files.exists(crawlDataPath)) continue;

                addFileToTar(stream, crawlDataPath, item.relPath());
            }

            addFileToTar(stream, newCrawlerLogFile, "crawler.log");
            addFileToTar(stream, newManifestJsonFile, "marginalia-manifest.json");
        }
        finally {
            Files.deleteIfExists(newCrawlerLogFile);
            Files.deleteIfExists(newManifestJsonFile);
        }

        Files.move(tmpTarFile, destStorage.asPath().resolve("crawl-data.tar"), StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
    }

    private void addFileToTar(TarArchiveOutputStream outputStream, Path file, String fileName) throws IOException {
        var entry = outputStream.createArchiveEntry(file.toFile(), fileName);
        entry.setSize(Files.size(file));
        outputStream.putArchiveEntry(entry);

        try (var fis = Files.newInputStream(file)) {
            IOUtils.copy(fis, outputStream);
        } finally {
            outputStream.closeArchiveEntry();
        }
    }
}
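The deleted exporter's sampling strategy is worth noting: it filters work-log entries by size and path length (classic ustar tar headers cap names at 100 bytes), then shuffles and truncates to get a uniform random sample. A condensed sketch of that selection logic, with a hypothetical record standing in for the WorkLog entry type:

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

class SampleSelectionSketch {
    record Entry(String relPath, int cnt) {}

    // Keep domains that are neither trivially small nor huge, whose paths fit
    // comfortably within the 100-byte tar name field, then shuffle-and-truncate
    // for an unbiased sample of at most `size` items.
    static List<Entry> select(List<Entry> all, int size) {
        List<Entry> eligible = new ArrayList<>();
        for (Entry e : all) {
            if (e.cnt() < 2 || e.cnt() > 5000) continue;
            if (e.relPath().length() > 90) continue;
            eligible.add(e);
        }
        if (eligible.size() > size) {
            Collections.shuffle(eligible);
            eligible = eligible.subList(0, size);
        }
        return eligible;
    }
}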
@@ -1,7 +0,0 @@
Contains converter-*like* extraction jobs that operate on crawled data to produce export files.

## Important classes

* [AtagExporter](java/nu/marginalia/extractor/AtagExporter.java) - extracts anchor texts from the crawled data.
* [FeedExporter](java/nu/marginalia/extractor/FeedExporter.java) - tries to find RSS/Atom feeds within the crawled data.
* [TermFrequencyExporter](java/nu/marginalia/extractor/TermFrequencyExporter.java) - exports the 'TF' part of TF-IDF.
@@ -1,28 +1,44 @@
 package nu.marginalia.actor;

+import nu.marginalia.nodecfg.model.NodeProfile;
+
+import java.util.Set;
+
 public enum ExecutorActor {
-    CRAWL,
-    RECRAWL,
-    RECRAWL_SINGLE_DOMAIN,
-    CONVERT_AND_LOAD,
-    PROC_CONVERTER_SPAWNER,
-    PROC_LOADER_SPAWNER,
-    PROC_CRAWLER_SPAWNER,
-    MONITOR_PROCESS_LIVENESS,
-    MONITOR_FILE_STORAGE,
-    ADJACENCY_CALCULATION,
-    CRAWL_JOB_EXTRACTOR,
-    EXPORT_DATA,
-    EXPORT_SEGMENTATION_MODEL,
-    EXPORT_ATAGS,
-    EXPORT_TERM_FREQUENCIES,
-    EXPORT_FEEDS,
-    PROC_INDEX_CONSTRUCTOR_SPAWNER,
-    CONVERT,
-    RESTORE_BACKUP,
-    EXPORT_SAMPLE_DATA,
-    DOWNLOAD_SAMPLE,
-    SCRAPE_FEEDS;
+    PREC_EXPORT_ALL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    SYNC_NSFW_LISTS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+
+    CRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    RECRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    RECRAWL_SINGLE_DOMAIN(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    PROC_CRAWLER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    PROC_PING_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD),
+    PROC_EXPORT_TASKS_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    ADJACENCY_CALCULATION(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    EXPORT_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    EXPORT_SEGMENTATION_MODEL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    EXPORT_ATAGS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    EXPORT_TERM_FREQUENCIES(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    EXPORT_FEEDS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    EXPORT_SAMPLE_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    DOWNLOAD_SAMPLE(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    MIGRATE_CRAWL_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+
+    PROC_CONVERTER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD),
+    PROC_LOADER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD),
+    RESTORE_BACKUP(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD),
+    CONVERT(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD),
+
+    CONVERT_AND_LOAD(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.REALTIME, NodeProfile.SIDELOAD),
+    MONITOR_PROCESS_LIVENESS(NodeProfile.BATCH_CRAWL, NodeProfile.REALTIME, NodeProfile.MIXED, NodeProfile.SIDELOAD),
+    MONITOR_FILE_STORAGE(NodeProfile.BATCH_CRAWL, NodeProfile.REALTIME, NodeProfile.MIXED, NodeProfile.SIDELOAD),
+    PROC_INDEX_CONSTRUCTOR_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.REALTIME, NodeProfile.MIXED, NodeProfile.SIDELOAD),
+
+    LIVE_CRAWL(NodeProfile.REALTIME),
+    PROC_LIVE_CRAWL_SPAWNER(NodeProfile.REALTIME),
+    SCRAPE_FEEDS(NodeProfile.REALTIME),
+    UPDATE_RSS(NodeProfile.REALTIME)
+    ;

     public String id() {
         return "fsm:" + name().toLowerCase();
@@ -32,4 +48,9 @@ public enum ExecutorActor {
         return "fsm:" + name().toLowerCase() + ":" + node;
     }

+    ExecutorActor(NodeProfile... profileSet) {
+        this.profileSet = Set.of(profileSet);
+    }
+
+    public Set<NodeProfile> profileSet;
 }
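Each constant now declares the node profiles it may run on, and registration (shown further down in this diff) consults that set. A hypothetical check illustrating the intended effect:

// A REALTIME node runs the live crawler but not the batch crawler.
NodeProfile p = NodeProfile.REALTIME;
boolean runsLiveCrawl = ExecutorActor.LIVE_CRAWL.profileSet.contains(p);  // true
boolean runsBatchCrawl = ExecutorActor.CRAWL.profileSet.contains(p);      // false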
@@ -2,8 +2,8 @@ package nu.marginalia.actor;

 import com.google.inject.Inject;
 import com.google.inject.Singleton;
-import lombok.SneakyThrows;
 import nu.marginalia.actor.monitor.FileStorageMonitorActor;
+import nu.marginalia.actor.precession.ExportAllPrecessionActor;
 import nu.marginalia.actor.proc.*;
 import nu.marginalia.actor.prototype.ActorPrototype;
 import nu.marginalia.actor.prototype.RecordActorPrototype;
@@ -11,9 +11,15 @@ import nu.marginalia.actor.state.ActorStateInstance;
 import nu.marginalia.actor.state.ActorStep;
 import nu.marginalia.actor.task.*;
 import nu.marginalia.mq.MessageQueueFactory;
+import nu.marginalia.nodecfg.NodeConfigurationService;
+import nu.marginalia.nodecfg.model.NodeConfiguration;
 import nu.marginalia.service.control.ServiceEventLog;
+import nu.marginalia.service.module.ServiceConfiguration;
 import nu.marginalia.service.server.BaseServiceParams;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;

+import java.sql.SQLException;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.UUID;
@@ -27,16 +33,25 @@ public class ExecutorActorControlService {
     public Map<ExecutorActor, ActorPrototype> actorDefinitions = new HashMap<>();
     private final int node;

+    private final NodeConfiguration nodeConfiguration;
+
+    private final Logger logger = LoggerFactory.getLogger(getClass());
+
     @Inject
     public ExecutorActorControlService(MessageQueueFactory messageQueueFactory,
+                                       ServiceConfiguration serviceConfiguration,
+                                       NodeConfigurationService configurationService,
                                        BaseServiceParams baseServiceParams,
                                        ConvertActor convertActor,
                                        ConvertAndLoadActor convertAndLoadActor,
                                        CrawlActor crawlActor,
+                                       LiveCrawlActor liveCrawlActor,
                                        RecrawlSingleDomainActor recrawlSingleDomainActor,
                                        RestoreBackupActor restoreBackupActor,
                                        ConverterMonitorActor converterMonitorFSM,
+                                       PingMonitorActor pingMonitorActor,
                                        CrawlerMonitorActor crawlerMonitorActor,
+                                       LiveCrawlerMonitorActor liveCrawlerMonitorActor,
                                        LoaderMonitorActor loaderMonitor,
                                        ProcessLivenessMonitorActor processMonitorFSM,
                                        FileStorageMonitorActor fileStorageMonitorActor,
@@ -48,15 +63,23 @@ public class ExecutorActorControlService {
                                        ExportSampleDataActor exportSampleDataActor,
                                        ExportTermFreqActor exportTermFrequenciesActor,
                                        ExportSegmentationModelActor exportSegmentationModelActor,
+                                       ExportTaskMonitorActor exportTasksMonitorActor,
                                        DownloadSampleActor downloadSampleActor,
                                        ScrapeFeedsActor scrapeFeedsActor,
-                                       ExecutorActorStateMachines stateMachines) {
+                                       ExecutorActorStateMachines stateMachines,
+                                       MigrateCrawlDataActor migrateCrawlDataActor,
+                                       ExportAllPrecessionActor exportAllPrecessionActor,
+                                       UpdateNsfwFiltersActor updateNsfwFiltersActor,
+                                       UpdateRssActor updateRssActor) throws SQLException {
         this.messageQueueFactory = messageQueueFactory;
         this.eventLog = baseServiceParams.eventLog;
         this.stateMachines = stateMachines;
         this.node = baseServiceParams.configuration.node();

+        this.nodeConfiguration = configurationService.get(node);
+
         register(ExecutorActor.CRAWL, crawlActor);
+        register(ExecutorActor.LIVE_CRAWL, liveCrawlActor);
         register(ExecutorActor.RECRAWL_SINGLE_DOMAIN, recrawlSingleDomainActor);

         register(ExecutorActor.CONVERT, convertActor);
@@ -67,6 +90,9 @@ public class ExecutorActorControlService {
         register(ExecutorActor.PROC_CONVERTER_SPAWNER, converterMonitorFSM);
         register(ExecutorActor.PROC_LOADER_SPAWNER, loaderMonitor);
         register(ExecutorActor.PROC_CRAWLER_SPAWNER, crawlerMonitorActor);
+        register(ExecutorActor.PROC_PING_SPAWNER, pingMonitorActor);
+        register(ExecutorActor.PROC_LIVE_CRAWL_SPAWNER, liveCrawlerMonitorActor);
+        register(ExecutorActor.PROC_EXPORT_TASKS_SPAWNER, exportTasksMonitorActor);

         register(ExecutorActor.MONITOR_PROCESS_LIVENESS, processMonitorFSM);
         register(ExecutorActor.MONITOR_FILE_STORAGE, fileStorageMonitorActor);
@@ -83,9 +109,22 @@ public class ExecutorActorControlService {
         register(ExecutorActor.DOWNLOAD_SAMPLE, downloadSampleActor);

         register(ExecutorActor.SCRAPE_FEEDS, scrapeFeedsActor);
+        register(ExecutorActor.UPDATE_RSS, updateRssActor);
+
+        register(ExecutorActor.MIGRATE_CRAWL_DATA, migrateCrawlDataActor);
+        register(ExecutorActor.SYNC_NSFW_LISTS, updateNsfwFiltersActor);
+
+        if (serviceConfiguration.node() == 1) {
+            register(ExecutorActor.PREC_EXPORT_ALL, exportAllPrecessionActor);
+        }
     }

     private void register(ExecutorActor process, RecordActorPrototype graph) {
+        if (!process.profileSet.contains(nodeConfiguration.profile())) {
+            return;
+        }
+
         var sm = new ActorStateMachine(messageQueueFactory, process.id(), node, UUID.randomUUID(), graph);
         sm.listen((function, param) -> logStateChange(process, function));

@@ -117,11 +156,15 @@ public class ExecutorActorControlService {
         stateMachines.startFromJSON(process, state, json);
     }

-    @SneakyThrows
     public void stop(ExecutorActor process) {
         eventLog.logEvent("FSM-STOP", process.id());

-        stateMachines.stop(process);
+        try {
+            stateMachines.stop(process);
+        }
+        catch (Exception e) {
+            logger.error("Failed to stop FSM", e);
+        }
     }

     public Map<ExecutorActor, ActorStateInstance> getActorStates() {
@@ -0,0 +1,116 @@
package nu.marginalia.actor.precession;

import com.google.gson.Gson;
import com.google.inject.Inject;
import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.executor.client.ExecutorExportClient;
import nu.marginalia.mq.MqMessageState;
import nu.marginalia.mq.persistence.MqPersistence;
import nu.marginalia.nodecfg.NodeConfigurationService;
import nu.marginalia.nodecfg.model.NodeConfiguration;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageType;

import java.time.Duration;
import java.util.Comparator;
import java.util.Optional;

public class ExportAllPrecessionActor extends RecordActorPrototype {

    private final NodeConfigurationService nodeConfigurationService;
    private final ExecutorExportClient exportClient;
    private final FileStorageService fileStorageService;
    private final MqPersistence persistence;

    @Inject
    public ExportAllPrecessionActor(Gson gson,
                                    NodeConfigurationService nodeConfigurationService,
                                    ExecutorExportClient exportClient,
                                    FileStorageService fileStorageService,
                                    MqPersistence persistence)
    {
        super(gson);
        this.nodeConfigurationService = nodeConfigurationService;
        this.exportClient = exportClient;
        this.fileStorageService = fileStorageService;
        this.persistence = persistence;
    }

    public enum ExportTask {
        FEEDS,
        ATAGS,
        TFREQ
    }

    public record Initial(ExportTask task) implements ActorStep {}
    public record Export(int nodeId, ExportTask task, long msgId) implements ActorStep {
        public Export(int nodeId, ExportTask task) {
            this(nodeId, task, -1);
        }
    }

    @Override
    public ActorStep transition(ActorStep self) throws Exception {
        return switch (self) {
            case Initial(ExportTask task) -> {
                var firstNode = nextNodeId(-1);
                if (firstNode.isEmpty())
                    yield new Error("No nodes included in precession");
                else
                    yield new Export(firstNode.get(), task);
            }

            case Export(int nodeId, ExportTask task, long msgId) when msgId < 0 -> {
                var activeStorages = fileStorageService.getActiveFileStorages(nodeId, FileStorageType.CRAWL_DATA);
                if (activeStorages.isEmpty()) {
                    yield new Error("Node " + nodeId + " has no active file storage");
                }
                var activeCrawlStorageId = activeStorages.getFirst();

                long trackingMsgId = switch(task) {
                    case ATAGS -> exportClient.exportAtags(nodeId, activeCrawlStorageId);
                    case TFREQ -> exportClient.exportTermFrequencies(nodeId, activeCrawlStorageId);
                    case FEEDS -> exportClient.exportRssFeeds(nodeId, activeCrawlStorageId);
                };

                yield new Export(nodeId, task, trackingMsgId);
            }

            case Export(int nodeId, ExportTask task, long msgId) -> {
                for (; ; ) {
                    var msg = persistence.getMessage(msgId);
                    if (!msg.state().isTerminal()) {
                        Thread.sleep(Duration.ofSeconds(30));
                        continue;
                    }
                    if (msg.state() == MqMessageState.OK) {
                        var nextNode = nextNodeId(nodeId);
                        if (nextNode.isEmpty()) {
                            yield new End();
                        } else {
                            yield new Export(nextNode.get(), task);
                        }
                    } else {
                        yield new Error("Export failed for node " + nodeId);
                    }
                }
            }
            default -> new Error("Unknown state");
        };
    }

    private Optional<Integer> nextNodeId(int currentNodeId) {
        return nodeConfigurationService.getAll()
                .stream().sorted(Comparator.comparing(NodeConfiguration::node))
                .filter(node -> node.node() > currentNodeId)
                .filter(NodeConfiguration::includeInPrecession)
                .map(NodeConfiguration::node)
                .findFirst();
    }

    @Override
    public String describe() {
        return "Runs an export job on each index node included in the precession";
    }
}
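The terminal-state wait above is hand-rolled with 30-second sleeps; UpdateRssActor further down expresses the same wait through an MqPersistence helper. An equivalent sketch using that helper, assuming the semantics shown there (poll interval, overall timeout, null on timeout):

// Equivalent wait using the helper seen in UpdateRssActor below.
MqMessage msg = persistence.waitForMessageTerminalState(
        msgId, Duration.ofSeconds(10), Duration.ofHours(12));

if (msg != null && msg.state() == MqMessageState.OK) {
    // export finished on this node; advance to the next node in the precession
}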
@@ -0,0 +1,29 @@
package nu.marginalia.actor.proc;

import com.google.gson.Gson;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
import nu.marginalia.mq.persistence.MqPersistence;
import nu.marginalia.mqapi.ProcessInboxNames;
import nu.marginalia.process.ProcessService;
import nu.marginalia.service.module.ServiceConfiguration;

@Singleton
public class ExportTaskMonitorActor extends AbstractProcessSpawnerActor {

    @Inject
    public ExportTaskMonitorActor(Gson gson,
                                  ServiceConfiguration configuration,
                                  MqPersistence persistence,
                                  ProcessService processService) {
        super(gson,
                configuration,
                persistence,
                processService,
                ProcessInboxNames.EXPORT_TASK_INBOX,
                ProcessService.ProcessId.EXPORT_TASKS);
    }

}
@@ -0,0 +1,29 @@
package nu.marginalia.actor.proc;

import com.google.gson.Gson;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
import nu.marginalia.mq.persistence.MqPersistence;
import nu.marginalia.mqapi.ProcessInboxNames;
import nu.marginalia.process.ProcessService;
import nu.marginalia.service.module.ServiceConfiguration;

@Singleton
public class LiveCrawlerMonitorActor extends AbstractProcessSpawnerActor {

    @Inject
    public LiveCrawlerMonitorActor(Gson gson,
                                   ServiceConfiguration configuration,
                                   MqPersistence persistence,
                                   ProcessService processService) {
        super(gson,
                configuration,
                persistence,
                processService,
                ProcessInboxNames.LIVE_CRAWLER_INBOX,
                ProcessService.ProcessId.LIVE_CRAWLER);
    }

}
@@ -0,0 +1,26 @@
package nu.marginalia.actor.proc;

import com.google.gson.Gson;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
import nu.marginalia.mq.persistence.MqPersistence;
import nu.marginalia.mqapi.ProcessInboxNames;
import nu.marginalia.process.ProcessService;
import nu.marginalia.service.module.ServiceConfiguration;

@Singleton
public class PingMonitorActor extends AbstractProcessSpawnerActor {

    @Inject
    public PingMonitorActor(Gson gson, ServiceConfiguration configuration, MqPersistence persistence, ProcessService processService) {
        super(gson,
                configuration,
                persistence,
                processService,
                ProcessInboxNames.PING_INBOX,
                ProcessService.ProcessId.PING);
    }

}
@@ -10,6 +10,8 @@ import nu.marginalia.actor.state.ActorResumeBehavior;
 import nu.marginalia.actor.state.ActorStep;
 import nu.marginalia.actor.state.Resume;
 import nu.marginalia.model.EdgeDomain;
+import nu.marginalia.nodecfg.NodeConfigurationService;
+import nu.marginalia.nodecfg.model.NodeProfile;
 import nu.marginalia.service.control.ServiceEventLog;
 import nu.marginalia.service.module.ServiceConfiguration;
 import org.jsoup.Jsoup;
@@ -39,6 +41,7 @@ public class ScrapeFeedsActor extends RecordActorPrototype {
     private final Duration pollInterval = Duration.ofHours(6);

     private final ServiceEventLog eventLog;
+    private final NodeConfigurationService nodeConfigurationService;
     private final HikariDataSource dataSource;
     private final int nodeId;

@@ -54,8 +57,8 @@ public class ScrapeFeedsActor extends RecordActorPrototype {
     public ActorStep transition(ActorStep self) throws Exception {
         return switch(self) {
             case Initial() -> {
-                if (nodeId > 1) {
-                    yield new End();
+                if (nodeConfigurationService.get(nodeId).profile() != NodeProfile.REALTIME) {
+                    yield new Error("Invalid node profile for RSS update");
                 }
                 else {
                     yield new Wait(LocalDateTime.now().toString());
@@ -177,10 +180,12 @@ public class ScrapeFeedsActor extends RecordActorPrototype {
     public ScrapeFeedsActor(Gson gson,
                             ServiceEventLog eventLog,
                             ServiceConfiguration configuration,
+                            NodeConfigurationService nodeConfigurationService,
                             HikariDataSource dataSource)
     {
         super(gson);
         this.eventLog = eventLog;
+        this.nodeConfigurationService = nodeConfigurationService;
         this.dataSource = dataSource;
         this.nodeId = configuration.node();
     }
code/execution/java/nu/marginalia/actor/proc/UpdateRssActor.java (new file, 144 lines)
@@ -0,0 +1,144 @@
package nu.marginalia.actor.proc;

import com.google.gson.Gson;
import com.google.inject.Inject;
import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorResumeBehavior;
import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.actor.state.Resume;
import nu.marginalia.api.feeds.FeedsClient;
import nu.marginalia.api.feeds.RpcFeedUpdateMode;
import nu.marginalia.mq.MqMessage;
import nu.marginalia.mq.MqMessageState;
import nu.marginalia.mq.persistence.MqPersistence;
import nu.marginalia.nodecfg.NodeConfigurationService;
import nu.marginalia.nodecfg.model.NodeProfile;
import nu.marginalia.service.module.ServiceConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.time.Duration;
import java.time.LocalDateTime;

public class UpdateRssActor extends RecordActorPrototype {

    private final FeedsClient feedsClient;
    private final int nodeId;

    private final Duration initialDelay = Duration.ofMinutes(5);
    private final Duration updateInterval = Duration.ofHours(24);
    private final int cleanInterval = 60;

    private final NodeConfigurationService nodeConfigurationService;
    private final MqPersistence persistence;
    private static final Logger logger = LoggerFactory.getLogger(UpdateRssActor.class);

    @Inject
    public UpdateRssActor(Gson gson,
                          FeedsClient feedsClient,
                          ServiceConfiguration serviceConfiguration,
                          NodeConfigurationService nodeConfigurationService,
                          MqPersistence persistence) {
        super(gson);
        this.feedsClient = feedsClient;
        this.nodeId = serviceConfiguration.node();
        this.nodeConfigurationService = nodeConfigurationService;
        this.persistence = persistence;
    }

    public record Initial() implements ActorStep {}
    @Resume(behavior = ActorResumeBehavior.RETRY)
    public record Wait(String ts, int refreshCount) implements ActorStep {}
    @Resume(behavior = ActorResumeBehavior.RETRY)
    public record UpdateRefresh(int refreshCount, long msgId) implements ActorStep {
        public UpdateRefresh(int refreshCount) {
            this(refreshCount, -1);
        }
    }
    @Resume(behavior = ActorResumeBehavior.RETRY)
    public record UpdateClean(long msgId) implements ActorStep {
        public UpdateClean() {
            this(-1);
        }
    }

    @Override
    public ActorStep transition(ActorStep self) throws Exception {
        return switch (self) {
            case Initial() -> {
                if (nodeConfigurationService.get(nodeId).profile() != NodeProfile.REALTIME) {
                    yield new Error("Invalid node profile for RSS update");
                }
                else {
                    // Wait for 5 minutes before starting the first update, to give the system time to start up properly
                    yield new Wait(LocalDateTime.now().plus(initialDelay).toString(), 0);
                }
            }
            case Wait(String untilTs, int count) -> {
                var until = LocalDateTime.parse(untilTs);
                var now = LocalDateTime.now();

                long remaining = Duration.between(now, until).toMillis();

                if (remaining > 0) {
                    Thread.sleep(remaining);
                    yield new Wait(untilTs, count);
                }
                else {
                    // Once every `cleanInterval` updates, do a clean update;
                    // otherwise do a refresh update
                    if (count > cleanInterval) {
                        yield new UpdateClean();
                    }
                    else {
                        yield new UpdateRefresh(count);
                    }
                }
            }
            case UpdateRefresh(int count, long msgId) when msgId < 0 -> {
                long messageId = feedsClient.updateFeeds(RpcFeedUpdateMode.REFRESH);
                yield new UpdateRefresh(count, messageId);
            }
            case UpdateRefresh(int count, long msgId) -> {
                MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
                if (msg == null) {
                    logger.warn("UpdateRefresh is taking a very long time");
                    yield new UpdateRefresh(count, msgId);
                } else if (msg.state() != MqMessageState.OK) {
                    // Retry the update
                    yield new Error("Failed to update feeds: " + msg.state());
                }
                else {
                    // Increment the refresh count
                    yield new Wait(LocalDateTime.now().plus(updateInterval).toString(), count + 1);
                }
            }
            case UpdateClean(long msgId) when msgId < 0 -> {
                long messageId = feedsClient.updateFeeds(RpcFeedUpdateMode.CLEAN);
                yield new UpdateClean(messageId);
            }
            case UpdateClean(long msgId) -> {
                MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
                if (msg == null) {
                    logger.warn("UpdateClean is taking a very long time");
                    yield new UpdateClean(msgId);
                } else if (msg.state() != MqMessageState.OK) {
                    // Retry the update
                    yield new Error("Failed to update feeds: " + msg.state());
                }
                else {
                    // Reset the refresh count after a successful update
                    yield new Wait(LocalDateTime.now().plus(updateInterval).toString(), 0);
                }
            }
            default -> new Error("Unknown actor step: " + self);
        };
    }

    @Override
    public String describe() {
        return "Periodically updates RSS and Atom feeds";
    }
}
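Note how the Wait step stores its deadline as an ISO-8601 string inside the serialized actor state; after a service restart the FSM resumes, re-parses the timestamp, and recomputes the remaining sleep. A minimal illustration of that round-trip:

import java.time.Duration;
import java.time.LocalDateTime;

class WaitDeadlineSketch {
    public static void main(String[] args) {
        // Serialize the deadline into the actor state as text...
        String untilTs = LocalDateTime.now().plus(Duration.ofHours(24)).toString();

        // ...and after a resume, recover it and recompute the remaining sleep.
        long remainingMs = Duration.between(LocalDateTime.now(), LocalDateTime.parse(untilTs)).toMillis();
        System.out.println("sleeping " + Math.max(0, remainingMs) + " ms");
    }
}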
@@ -8,6 +8,9 @@ import nu.marginalia.actor.state.ActorResumeBehavior;
 import nu.marginalia.actor.state.ActorStep;
 import nu.marginalia.actor.state.Resume;
 import nu.marginalia.encyclopedia.EncyclopediaConverter;
+import nu.marginalia.mq.MqMessageState;
+import nu.marginalia.mq.outbox.MqOutbox;
+import nu.marginalia.mqapi.converting.ConvertRequest;
 import nu.marginalia.process.ProcessOutboxes;
 import nu.marginalia.process.ProcessService;
 import nu.marginalia.sideload.RedditSideloadHelper;
@@ -17,9 +20,6 @@ import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorageId;
 import nu.marginalia.storage.model.FileStorageState;
 import nu.marginalia.storage.model.FileStorageType;
-import nu.marginalia.mq.MqMessageState;
-import nu.marginalia.mq.outbox.MqOutbox;
-import nu.marginalia.mqapi.converting.ConvertRequest;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -140,7 +140,7 @@ public class ConvertActor extends RecordActorPrototype {
                 // To avoid re-converting the same file, we'll assign the file a name based on its hash
                 // and the original filename. This way, if we're fed the same file again, we'll be able to just
                 // re-use the predigested database file.
-                yield new PredigestEncyclopedia(source, STR."\{source}.\{hash}.db", baseUrl);
+                yield new PredigestEncyclopedia(source, source + "." + hash + ".db", baseUrl);
             } else if (!source.endsWith(".db")) {
                 yield new Error("Source path must be a ZIM or pre-digested sqlite database file (.db)");
             }
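The comment in the hunk above describes content-addressed caching: the predigested database is named after the source file plus a hash, so feeding the same file twice reuses the first conversion. A minimal sketch of deriving such a name; the digest choice is illustrative, since this excerpt doesn't show how `hash` is computed:

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.MessageDigest;
import java.util.HexFormat;

class PredigestNameSketch {
    // Returns "<source>.<hash>.db"; identical inputs map to the same cached name.
    static String dbNameFor(Path source) throws Exception {
        MessageDigest md = MessageDigest.getInstance("SHA-256");
        try (InputStream in = Files.newInputStream(source)) {
            byte[] buf = new byte[8192];
            for (int n; (n = in.read(buf)) > 0; ) {
                md.update(buf, 0, n); // stream the file; ZIM dumps can be huge
            }
        }
        String hash = HexFormat.of().formatHex(md.digest(), 0, 8); // short prefix
        return source + "." + hash + ".db";
    }
}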
@@ -3,9 +3,6 @@ package nu.marginalia.actor.task;
 import com.google.gson.Gson;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
-import lombok.AllArgsConstructor;
-import lombok.NoArgsConstructor;
-import lombok.With;
 import nu.marginalia.IndexLocations;
 import nu.marginalia.actor.prototype.RecordActorPrototype;
 import nu.marginalia.actor.state.ActorResumeBehavior;
@@ -40,7 +37,6 @@ import java.util.List;
 public class ConvertAndLoadActor extends RecordActorPrototype {

     // STATES
-    public static final String RERANK = "RERANK";

     private final ActorProcessWatcher processWatcher;
     private final MqOutbox mqConverterOutbox;
@@ -54,15 +50,6 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
     private final int nodeId;
     private final Logger logger = LoggerFactory.getLogger(getClass());

-    @AllArgsConstructor @With @NoArgsConstructor
-    public static class Message {
-        public FileStorageId crawlStorageId = null;
-        public List<FileStorageId> processedStorageId = null;
-        public long converterMsgId = 0L;
-        public long loaderMsgId = 0L;
-    }
-
     public record Initial(FileStorageId fid) implements ActorStep {}

     @Resume(behavior = ActorResumeBehavior.RETRY)
@@ -8,6 +8,7 @@ import nu.marginalia.actor.state.ActorResumeBehavior;
 import nu.marginalia.actor.state.ActorStep;
 import nu.marginalia.actor.state.Resume;
 import nu.marginalia.service.control.ServiceEventLog;
+import nu.marginalia.service.control.ServiceHeartbeat;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorage;
 import nu.marginalia.storage.model.FileStorageId;
@@ -19,6 +20,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.io.*;
+import java.net.HttpURLConnection;
 import java.net.MalformedURLException;
 import java.net.URI;
 import java.net.URL;
@@ -32,6 +34,7 @@ public class DownloadSampleActor extends RecordActorPrototype {

     private final FileStorageService storageService;
     private final ServiceEventLog eventLog;
+    private final ServiceHeartbeat heartbeat;
     private final Logger logger = LoggerFactory.getLogger(getClass());

     @Resume(behavior = ActorResumeBehavior.ERROR)
@@ -66,15 +69,39 @@ public class DownloadSampleActor extends RecordActorPrototype {

                 Files.deleteIfExists(Path.of(tarFileName));

-                try (var is = new BufferedInputStream(new URI(downloadURI).toURL().openStream());
-                     var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
-                    is.transferTo(os);
+                HttpURLConnection urlConnection = (HttpURLConnection) new URI(downloadURI).toURL().openConnection();
+
+                try (var hb = heartbeat.createServiceAdHocTaskHeartbeat("Downloading sample")) {
+                    long size = urlConnection.getContentLengthLong();
+                    byte[] buffer = new byte[8192];
+
+                    try (var is = new BufferedInputStream(urlConnection.getInputStream());
+                         var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
+                        long copiedSize = 0;
+
+                        while (copiedSize < size) {
+                            int read = is.read(buffer);
+
+                            if (read < 0) // We've been promised a file of length 'size'
+                                throw new IOException("Unexpected end of stream");
+
+                            os.write(buffer, 0, read);
+                            copiedSize += read;
+
+                            // Update progress bar
+                            hb.progress(String.format("%d MB", copiedSize / 1024 / 1024), (int) (copiedSize / 1024), (int) (size / 1024));
+                        }
+                    }
                 }
                 catch (Exception ex) {
                     eventLog.logEvent(DownloadSampleActor.class, "Error downloading sample");
                     logger.error("Error downloading sample", ex);
                     yield new Error();
                 }
+                finally {
+                    urlConnection.disconnect();
+                }

                 eventLog.logEvent(DownloadSampleActor.class, "Download complete");
                 yield new Extract(fileStorageId, tarFileName);
@@ -170,11 +197,12 @@ public class DownloadSampleActor extends RecordActorPrototype {
     @Inject
     public DownloadSampleActor(Gson gson,
                                FileStorageService storageService,
-                               ServiceEventLog eventLog)
+                               ServiceEventLog eventLog, ServiceHeartbeat heartbeat)
     {
         super(gson);
         this.storageService = storageService;
         this.eventLog = eventLog;
+        this.heartbeat = heartbeat;
     }

 }
@@ -3,50 +3,78 @@ package nu.marginalia.actor.task;
 import com.google.gson.Gson;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
-import nu.marginalia.extractor.AtagExporter;
-import nu.marginalia.extractor.ExporterIf;
-import nu.marginalia.storage.model.*;
 import nu.marginalia.actor.prototype.RecordActorPrototype;
 import nu.marginalia.actor.state.ActorStep;
+import nu.marginalia.mq.MqMessageState;
+import nu.marginalia.mq.outbox.MqOutbox;
+import nu.marginalia.mq.persistence.MqPersistence;
+import nu.marginalia.mqapi.tasks.ExportTaskRequest;
+import nu.marginalia.process.ProcessOutboxes;
+import nu.marginalia.process.ProcessService;
 import nu.marginalia.storage.FileStorageService;
+import nu.marginalia.storage.model.FileStorageId;
+import nu.marginalia.storage.model.FileStorageState;
+import nu.marginalia.storage.model.FileStorageType;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.time.LocalDateTime;

 @Singleton
 public class ExportAtagsActor extends RecordActorPrototype {
     private final FileStorageService storageService;
-    private final ExporterIf atagExporter;
+    private final ActorProcessWatcher processWatcher;
+    private final MqOutbox exportTasksOutbox;
     private final Logger logger = LoggerFactory.getLogger(getClass());
+    private final MqPersistence persistence;

-    public record Export(FileStorageId crawlId) implements ActorStep {}
-    public record Run(FileStorageId crawlId, FileStorageId destId) implements ActorStep {}
+    public record Export(long responseMsgId, FileStorageId crawlId) implements ActorStep {}
+    public record Run(long responseMsgId, FileStorageId crawlId, FileStorageId destId, long msgId) implements ActorStep {
+        public Run(long responseMsgId, FileStorageId crawlId, FileStorageId destId) {
+            this(responseMsgId, crawlId, destId, -1);
+        }
+    }
+    public record Fail(long responseMsgId, String message) implements ActorStep {}

     @Override
     public ActorStep transition(ActorStep self) throws Exception {
         return switch(self) {
-            case Export(FileStorageId crawlId) -> {
-                var storage = storageService.allocateStorage(FileStorageType.EXPORT, "atag-export", "Anchor Tags " + LocalDateTime.now());
+            case Export(long responseMsgId, FileStorageId crawlId) -> {
+                persistence.updateMessageState(responseMsgId, MqMessageState.ACK);

-                if (storage == null) yield new Error("Bad storage id");
-                yield new Run(crawlId, storage.id());
+                var storage = storageService.allocateStorage(FileStorageType.EXPORT, "atags-export", "Atags " + LocalDateTime.now());
+
+                if (storage == null) yield new Fail(responseMsgId, "Bad storage id");
+
+                yield new Run(responseMsgId, crawlId, storage.id());
             }
-            case Run(FileStorageId crawlId, FileStorageId destId) -> {
+            case Run(long responseMsgId, FileStorageId crawlId, FileStorageId destId, long msgId) when msgId < 0 -> {
                 storageService.setFileStorageState(destId, FileStorageState.NEW);

-                try {
-                    atagExporter.export(crawlId, destId);
-                    storageService.setFileStorageState(destId, FileStorageState.UNSET);
-                }
-                catch (Exception ex) {
-                    storageService.setFileStorageState(destId, FileStorageState.DELETE);
-                    yield new Error("Failed to export data");
-                }
+                long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.atags(crawlId, destId));
+                yield new Run(responseMsgId, crawlId, destId, newMsgId);
+            }
+            case Run(long responseMsgId, FileStorageId crawlId, FileStorageId destId, long msgId) -> {
+                var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);

-                yield new End();
+                if (rsp.state() != MqMessageState.OK) {
+                    storageService.flagFileForDeletion(destId);
+                    yield new Fail(responseMsgId, "Exporter failed");
+                }
+                else {
+                    storageService.setFileStorageState(destId, FileStorageState.UNSET);
+                    persistence.updateMessageState(responseMsgId, MqMessageState.OK);
+                    yield new End();
+                }
+            }
+            case Fail(long responseMsgId, String message) -> {
+                persistence.updateMessageState(responseMsgId, MqMessageState.ERR);
+                yield new Error(message);
             }
             default -> new Error();
         };
     }

     @Override
     public String describe() {
         return "Export anchor tags from crawl data";
@@ -55,11 +83,15 @@ public class ExportAtagsActor extends RecordActorPrototype {
     @Inject
     public ExportAtagsActor(Gson gson,
                             FileStorageService storageService,
-                            AtagExporter atagExporter)
+                            ProcessOutboxes processOutboxes,
+                            MqPersistence persistence,
+                            ActorProcessWatcher processWatcher)
     {
         super(gson);
+        this.exportTasksOutbox = processOutboxes.getExportTasksOutbox();
         this.storageService = storageService;
-        this.atagExporter = atagExporter;
+        this.persistence = persistence;
+        this.processWatcher = processWatcher;
     }

 }