mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
425 Commits
deploy-002
...
deploy-027
Author | SHA1 | Date | |
---|---|---|---|
|
390f053406 | ||
|
b03c43224c | ||
|
9b4ce9e9eb | ||
|
81ac02a695 | ||
|
47f624fb3b | ||
|
c866f19cbb | ||
|
518278493b | ||
|
1ac0bab0b8 | ||
|
08b45ed10a | ||
|
f2cfb91973 | ||
|
2f79524eb3 | ||
|
3b00142c96 | ||
|
294ab19177 | ||
|
6f1659ecb2 | ||
|
982dcb28f0 | ||
|
fc686d8b2e | ||
|
69ef0f334a | ||
|
446746f3bd | ||
|
24ab8398bb | ||
|
d2ceeff4cf | ||
|
cf64214b1c | ||
|
e50d09cc01 | ||
|
bce3892ce0 | ||
|
36581b25c2 | ||
|
52ff7fb4dd | ||
|
a4e49e658a | ||
|
e2c56dc3ca | ||
|
470b866008 | ||
|
4895a2ac7a | ||
|
fd32ae9fa7 | ||
|
470651ea4c | ||
|
8d4829e783 | ||
|
1290bc15dc | ||
|
e7fa558954 | ||
|
720685bf3f | ||
|
cbec63c7da | ||
|
b03ca75785 | ||
|
184aedc071 | ||
|
0275bad281 | ||
|
fd83a9d0b8 | ||
|
d556f8ae3a | ||
|
e37559837b | ||
|
3564c4aaee | ||
|
92c54563ab | ||
|
d7a5d90b07 | ||
|
0a0e88fd6e | ||
|
b4fc0c4368 | ||
|
87ee8765b8 | ||
|
1adf4835fa | ||
|
b7b5d0bf46 | ||
|
416059adde | ||
|
db7930016a | ||
|
82456ad673 | ||
|
0882a6d9cd | ||
|
5020029c2d | ||
|
ac44d0b093 | ||
|
4b32b9b10e | ||
|
9f041d6631 | ||
|
13fb1efce4 | ||
|
c1225165b7 | ||
|
67ad7a3bbc | ||
|
ed62ec8a35 | ||
|
42b24cfa34 | ||
|
1ffaab2da6 | ||
|
5f93c7f767 | ||
|
4001c68c82 | ||
|
6b811489c5 | ||
|
e9d317c65d | ||
|
16b05a4737 | ||
|
021cd73cbb | ||
|
4253bd53b5 | ||
|
14c87461a5 | ||
|
9afed0a18e | ||
|
afad4deb94 | ||
|
f071c947e4 | ||
|
79996c9348 | ||
|
db907ab06a | ||
|
c49cd9dd95 | ||
|
eec9df3b0a | ||
|
e5f3288de6 | ||
|
d587544d3a | ||
|
1a9ae1bc40 | ||
|
e0c81e956a | ||
|
542fb12b38 | ||
|
65ec734566 | ||
|
10b6a25c63 | ||
|
6260f6bec7 | ||
|
d6d5467696 | ||
|
034560ca75 | ||
|
e994fddae4 | ||
|
345f01f306 | ||
|
5a8e286689 | ||
|
39a055aa94 | ||
|
37aaa90dc9 | ||
|
24022c5adc | ||
|
1de9ecc0b6 | ||
|
9b80245ea0 | ||
|
4e1595c1a6 | ||
|
0be8585fa5 | ||
|
a0fe070fe7 | ||
|
abe9da0fc6 | ||
|
56d0128b0a | ||
|
840b68ac55 | ||
|
c34ff6d6c3 | ||
|
32780967d8 | ||
|
7330bc489d | ||
|
ea23f33738 | ||
|
4a8a028118 | ||
|
a25bc647be | ||
|
a720dba3a2 | ||
|
284f382867 | ||
|
a80717f138 | ||
|
d6da715fa4 | ||
|
c1ec7aa491 | ||
|
3daf37e283 | ||
|
44a774d3a8 | ||
|
597aeaf496 | ||
|
06df7892c2 | ||
|
dc26854268 | ||
|
9f16326cba | ||
|
ed66d0b3a7 | ||
|
c3afc82dad | ||
|
08e25e539e | ||
|
4946044dd0 | ||
|
edf382e1c5 | ||
|
644cba32e4 | ||
|
34b76390b2 | ||
|
43cd507971 | ||
|
cc40e99fdc | ||
|
8a944cf4c6 | ||
|
1c128e6d82 | ||
|
be039d1a8c | ||
|
4edc0d3267 | ||
|
890f521d0d | ||
|
b1814a30f7 | ||
|
f59a9eb025 | ||
|
599534806b | ||
|
7e8253dac7 | ||
|
97a6780ea3 | ||
|
eb634beec8 | ||
|
269ebd1654 | ||
|
39ce40bfeb | ||
|
c187b2e1c1 | ||
|
42eaa4588b | ||
|
4f40a5fbeb | ||
|
3f3d42bc01 | ||
|
61c8d53e1b | ||
|
a7a3d85be9 | ||
|
306232fb54 | ||
|
5aef844f0d | ||
|
d56b5c828a | ||
|
ab58a4636f | ||
|
00be269238 | ||
|
879e6a9424 | ||
|
fba3455732 | ||
|
14283da7f5 | ||
|
93df4d1fc0 | ||
|
b12a0b998c | ||
|
3b6f4e321b | ||
|
8428111771 | ||
|
e9fd4415ef | ||
|
4c95c3dcad | ||
|
c5281536fb | ||
|
4431dae7ac | ||
|
4df4d0a7a8 | ||
|
9f05083b94 | ||
|
fc92e9b9c0 | ||
|
328fb5d927 | ||
|
36889950e8 | ||
|
c96a94878b | ||
|
1c57d7d73a | ||
|
a443d22356 | ||
|
aa59d4afa4 | ||
|
df0f18d0e7 | ||
|
0819d46f97 | ||
|
5e2b63473e | ||
|
f9590703f1 | ||
|
f12fc11337 | ||
|
c309030184 | ||
|
fd5af01629 | ||
|
d4c43c7a79 | ||
|
18700e1919 | ||
|
120b431998 | ||
|
71dad99326 | ||
|
c1e8afdf86 | ||
|
fa32dddc24 | ||
|
a266fcbf30 | ||
|
6e47e58e0e | ||
|
9dc43d8b4a | ||
|
83967e3305 | ||
|
4db980a291 | ||
|
089b177868 | ||
|
9c8e9a68d5 | ||
|
413d5cc788 | ||
|
58539b92ac | ||
|
fe72f16df1 | ||
|
b49a244a2e | ||
|
3f0b4c010f | ||
|
c6e0cd93f7 | ||
|
80a7ccb080 | ||
|
54dec347c4 | ||
|
d6ee3f0785 | ||
|
8be88afcf3 | ||
|
0e3c00d3e1 | ||
|
4279a7f1aa | ||
|
251006d4f9 | ||
|
c3e99dc12a | ||
|
aaaa2de022 | ||
|
fc1388422a | ||
|
b07080db16 | ||
|
e9d86dca4a | ||
|
1d693f0efa | ||
|
5874a163dc | ||
|
5ec7a1deab | ||
|
7fea2808ed | ||
|
8da74484f0 | ||
|
923d5a7234 | ||
|
58f88749b8 | ||
|
77f727a5ba | ||
|
667cfb53dc | ||
|
fe36d4ed20 | ||
|
acf4bef98d | ||
|
2a737c34bb | ||
|
90a577af82 | ||
|
f0c9b935d8 | ||
|
7b5493dd51 | ||
|
c246a59158 | ||
|
0b99781d24 | ||
|
39db9620c1 | ||
|
1781599363 | ||
|
6b2d18fb9b | ||
|
59b1d200ab | ||
|
897010a2cf | ||
|
602af7a77e | ||
|
a7d91c8527 | ||
|
7151602124 | ||
|
884e33bd4a | ||
|
e84d5c497a | ||
|
2d2d3e2466 | ||
|
647dd9b12f | ||
|
de4e2849ce | ||
|
3c43f1954e | ||
|
fa2462ec39 | ||
|
f4ad7145db | ||
|
068b450180 | ||
|
05b909a21f | ||
|
3d179cddce | ||
|
1a2aae496a | ||
|
353cdffb3f | ||
|
2e3f1313c7 | ||
|
58e6f141ce | ||
|
500f63e921 | ||
|
6dfbedda1e | ||
|
9715ddb105 | ||
|
1fc6313a77 | ||
|
b1249d5b8a | ||
|
ef95d59b07 | ||
|
acdd8664f5 | ||
|
6b12eac58a | ||
|
bb3f1f395a | ||
|
b661beef41 | ||
|
9888c47f19 | ||
|
dcef7e955b | ||
|
b3973a1dd7 | ||
|
8bd05d6d90 | ||
|
59df8e356e | ||
|
7161162a35 | ||
|
d7c4c5141f | ||
|
88e9b8fb05 | ||
|
b6265cee11 | ||
|
c91af247e9 | ||
|
7a31227de1 | ||
|
4f477604c5 | ||
|
2970f4395b | ||
|
d1ec909b36 | ||
|
c67c5bbf42 | ||
|
ecb0e57a1a | ||
|
8c61f61b46 | ||
|
662a18c933 | ||
|
1c2426a052 | ||
|
34df7441ac | ||
|
5387e2bd80 | ||
|
0f3b24d0f8 | ||
|
a732095d2a | ||
|
6607f0112f | ||
|
4913730de9 | ||
|
1db64f9d56 | ||
|
4dcff14498 | ||
|
426658f64e | ||
|
2181b22f05 | ||
|
42bd79a609 | ||
|
b91c1e528a | ||
|
b1130d7a04 | ||
|
8364bcdc97 | ||
|
626cab5fab | ||
|
cfd4712191 | ||
|
9f18ced73d | ||
|
18e91269ab | ||
|
e315ca5758 | ||
|
3ceea17c1d | ||
|
b34527c1a3 | ||
|
185bf28fca | ||
|
78cc25584a | ||
|
62ba30bacf | ||
|
3bb84eb206 | ||
|
be7d13ccce | ||
|
8c088a7c0b | ||
|
ea9a642b9b | ||
|
27f528af6a | ||
|
20ca41ec95 | ||
|
7671f0d9e4 | ||
|
44d6bc71b7 | ||
|
9d302e2973 | ||
|
f553701224 | ||
|
f076d05595 | ||
|
b513809710 | ||
|
7519b28e21 | ||
|
3eac4dd57f | ||
|
4c2810720a | ||
|
8480ba8daa | ||
|
fbba392491 | ||
|
530eb35949 | ||
|
c2dd2175a2 | ||
|
b8581b0f56 | ||
|
2ea34767d8 | ||
|
e9af838231 | ||
|
ae0cad47c4 | ||
|
5fbc8ef998 | ||
|
32c6dd9e6a | ||
|
6ece6a6cfb | ||
|
39cd1c18f8 | ||
|
eb65daaa88 | ||
|
0bebdb6e33 | ||
|
1e50e392c6 | ||
|
fb673de370 | ||
|
eee73ab16c | ||
|
5354e034bf | ||
|
72384ad6ca | ||
|
a2b076f9be | ||
|
c8b0a32c0f | ||
|
f0d74aa3bb | ||
|
74a1f100f4 | ||
|
eb049658e4 | ||
|
db138b2a6f | ||
|
1673fc284c | ||
|
503ea57d5b | ||
|
18ca926c7f | ||
|
db99242db2 | ||
|
2b9d2985ba | ||
|
eeb6ecd711 | ||
|
1f58aeadbf | ||
|
3d68be64da | ||
|
668f3b16ef | ||
|
98a340a0d1 | ||
|
8862100f7e | ||
|
274941f6de | ||
|
abec83582d | ||
|
569520c9b6 | ||
|
088310e998 | ||
|
270cab874b | ||
|
4c74e280d3 | ||
|
5b347e17ac | ||
|
55d6ab933f | ||
|
43b74e9706 | ||
|
579a115243 | ||
|
2c67f50a43 | ||
|
78a958e2b0 | ||
|
4e939389b2 | ||
|
e67a9bdb91 | ||
|
567e4e1237 | ||
|
4342e42722 | ||
|
bc818056e6 | ||
|
de2feac238 | ||
|
1e770205a5 | ||
|
e44ecd6d69 | ||
|
5b93a0e633 | ||
|
08fb0e5efe | ||
|
bcf67782ea | ||
|
ef3f175ede | ||
|
bbe4b5d9fd | ||
|
c67a635103 | ||
|
20b24133fb | ||
|
f2567677e8 | ||
|
bc2c2061f2 | ||
|
1c7f5a31a5 | ||
|
59a8ea60f7 | ||
|
aa9b1244ea | ||
|
2d17233366 | ||
|
b245cc9f38 | ||
|
6614d05bdf | ||
|
55aeb03c4a | ||
|
faa589962f | ||
|
c7edd6b39f | ||
|
79da622e3b | ||
|
3da8337ba6 | ||
|
a32d230f0a | ||
|
3772bfd387 | ||
|
02a7900d1a | ||
|
a1fb92468f | ||
|
b7f0a2a98e | ||
|
5fb76b2e79 | ||
|
ad8c97f342 | ||
|
dc1b6373eb | ||
|
983d6d067c | ||
|
a84a06975c | ||
|
d2864c13ec | ||
|
03ba53ce51 | ||
|
d4a6684931 | ||
|
6f0485287a | ||
|
59e2dd4c26 | ||
|
ca1807caae | ||
|
26c20e18ac | ||
|
7c90b6b414 | ||
|
b63c54c4ce | ||
|
fecd2f4ec3 | ||
|
39e420de88 | ||
|
dc83619861 | ||
|
87d1c89701 | ||
|
a42a7769e2 | ||
|
202bda884f | ||
|
2315fdc731 | ||
|
47e58a21c6 | ||
|
3714104976 | ||
|
f6f036b9b1 | ||
|
b510b7feb8 |
55
ROADMAP.md
55
ROADMAP.md
@@ -1,4 +1,4 @@
|
|||||||
# Roadmap 2024-2025
|
# Roadmap 2025
|
||||||
|
|
||||||
This is a roadmap with major features planned for Marginalia Search.
|
This is a roadmap with major features planned for Marginalia Search.
|
||||||
|
|
||||||
@@ -30,12 +30,6 @@ Retaining the ability to independently crawl the web is still strongly desirable
|
|||||||
The search engine has a bit of a problem showing spicy content mixed in with the results. It would be desirable to have a way to filter this out. It's likely something like a URL blacklist (e.g. [UT1](https://dsi.ut-capitole.fr/blacklists/index_en.php) )
|
The search engine has a bit of a problem showing spicy content mixed in with the results. It would be desirable to have a way to filter this out. It's likely something like a URL blacklist (e.g. [UT1](https://dsi.ut-capitole.fr/blacklists/index_en.php) )
|
||||||
combined with naive bayesian filter would go a long way, or something more sophisticated...?
|
combined with naive bayesian filter would go a long way, or something more sophisticated...?
|
||||||
|
|
||||||
## Web Design Overhaul
|
|
||||||
|
|
||||||
The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
|
|
||||||
|
|
||||||
In progress: PR [#127](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/127) -- demo available at https://test.marginalia.nu/
|
|
||||||
|
|
||||||
## Additional Language Support
|
## Additional Language Support
|
||||||
|
|
||||||
It would be desirable if the search engine supported more languages than English. This is partially about
|
It would be desirable if the search engine supported more languages than English. This is partially about
|
||||||
@@ -44,14 +38,6 @@ associated with each language added, at least a models file or two, as well as s
|
|||||||
|
|
||||||
It would be very helpful to find a speaker of a large language other than English to help in the fine tuning.
|
It would be very helpful to find a speaker of a large language other than English to help in the fine tuning.
|
||||||
|
|
||||||
## Support for binary formats like PDF
|
|
||||||
|
|
||||||
The crawler needs to be modified to retain them, and the conversion logic needs to parse them.
|
|
||||||
The documents database probably should have some sort of flag indicating it's a PDF as well.
|
|
||||||
|
|
||||||
PDF parsing is known to be a bit of a security liability so some thought needs to be put in
|
|
||||||
that direction as well.
|
|
||||||
|
|
||||||
## Custom ranking logic
|
## Custom ranking logic
|
||||||
|
|
||||||
Stract does an interesting thing where they have configurable search filters.
|
Stract does an interesting thing where they have configurable search filters.
|
||||||
@@ -62,8 +48,39 @@ filter for any API consumer.
|
|||||||
|
|
||||||
I've talked to the stract dev and he does not think it's a good idea to mimic their optics language, which is quite ad-hoc, but instead to work together to find some new common description language for this.
|
I've talked to the stract dev and he does not think it's a good idea to mimic their optics language, which is quite ad-hoc, but instead to work together to find some new common description language for this.
|
||||||
|
|
||||||
|
## Show favicons next to search results
|
||||||
|
|
||||||
|
This is expected from search engines. Basic proof of concept sketch of fetching this data has been done, but the feature is some way from being reality.
|
||||||
|
|
||||||
|
## Specialized crawler for github
|
||||||
|
|
||||||
|
One of the search engine's biggest limitations right now is that it does not index github at all. A specialized crawler that fetches at least the readme.md would go a long way toward providing search capabilities in this domain.
|
||||||
|
|
||||||
# Completed
|
# Completed
|
||||||
|
|
||||||
|
## Support for binary formats like PDF (COMPLETED 2025-05)
|
||||||
|
|
||||||
|
The crawler needs to be modified to retain them, and the conversion logic needs to parse them.
|
||||||
|
The documents database probably should have some sort of flag indicating it's a PDF as well.
|
||||||
|
|
||||||
|
PDF parsing is known to be a bit of a security liability so some thought needs to be put in
|
||||||
|
that direction as well.
|
||||||
|
|
||||||
|
## Web Design Overhaul (COMPLETED 2025-01)
|
||||||
|
|
||||||
|
The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
|
||||||
|
|
||||||
|
PR [#127](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/127)
|
||||||
|
|
||||||
|
## Finalize RSS support (COMPLETED 2024-11)
|
||||||
|
|
||||||
|
Marginalia has experimental RSS preview support for a few domains. This works well and
|
||||||
|
it should be extended to all domains. It would also be interesting to offer search of the
|
||||||
|
RSS data itself, or use the RSS set to feed a special live index that updates faster than the
|
||||||
|
main dataset.
|
||||||
|
|
||||||
|
Completed with PR [#122](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/122) and PR [#125](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/125)
|
||||||
|
|
||||||
## Proper Position Index (COMPLETED 2024-09)
|
## Proper Position Index (COMPLETED 2024-09)
|
||||||
|
|
||||||
The search engine uses a fixed width bit mask to indicate word positions. It has the benefit
|
The search engine uses a fixed width bit mask to indicate word positions. It has the benefit
|
||||||
@@ -76,11 +93,3 @@ list, as is the civilized way of doing this.
|
|||||||
|
|
||||||
Completed with PR [#99](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/99)
|
Completed with PR [#99](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/99)
|
||||||
|
|
||||||
## Finalize RSS support (COMPLETED 2024-11)
|
|
||||||
|
|
||||||
Marginalia has experimental RSS preview support for a few domains. This works well and
|
|
||||||
it should be extended to all domains. It would also be interesting to offer search of the
|
|
||||||
RSS data itself, or use the RSS set to feed a special live index that updates faster than the
|
|
||||||
main dataset.
|
|
||||||
|
|
||||||
Completed with PR [#122](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/122) and PR [#125](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/125)
|
|
||||||
|
@@ -5,7 +5,7 @@ plugins {
|
|||||||
|
|
||||||
// This is a workaround for a bug in the Jib plugin that causes it to stall randomly
|
// This is a workaround for a bug in the Jib plugin that causes it to stall randomly
|
||||||
// https://github.com/GoogleContainerTools/jib/issues/3347
|
// https://github.com/GoogleContainerTools/jib/issues/3347
|
||||||
id 'com.google.cloud.tools.jib' version '3.4.3' apply(false)
|
id 'com.google.cloud.tools.jib' version '3.4.5' apply(false)
|
||||||
}
|
}
|
||||||
|
|
||||||
group 'marginalia'
|
group 'marginalia'
|
||||||
@@ -43,12 +43,11 @@ subprojects.forEach {it ->
|
|||||||
}
|
}
|
||||||
|
|
||||||
ext {
|
ext {
|
||||||
jvmVersion=23
|
jvmVersion = 24
|
||||||
dockerImageBase='container-registry.oracle.com/graalvm/jdk:23'
|
dockerImageBase='container-registry.oracle.com/graalvm/jdk:24'
|
||||||
dockerImageTag='latest'
|
dockerImageTag='latest'
|
||||||
dockerImageRegistry='marginalia'
|
dockerImageRegistry='marginalia'
|
||||||
jibVersion = '3.4.3'
|
jibVersion = '3.4.5'
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
idea {
|
idea {
|
||||||
|
@@ -24,58 +24,4 @@ public class LanguageModels {
|
|||||||
this.fasttextLanguageModel = fasttextLanguageModel;
|
this.fasttextLanguageModel = fasttextLanguageModel;
|
||||||
this.segments = segments;
|
this.segments = segments;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static LanguageModelsBuilder builder() {
|
|
||||||
return new LanguageModelsBuilder();
|
|
||||||
}
|
|
||||||
|
|
||||||
public static class LanguageModelsBuilder {
|
|
||||||
private Path termFrequencies;
|
|
||||||
private Path openNLPSentenceDetectionData;
|
|
||||||
private Path posRules;
|
|
||||||
private Path posDict;
|
|
||||||
private Path fasttextLanguageModel;
|
|
||||||
private Path segments;
|
|
||||||
|
|
||||||
LanguageModelsBuilder() {
|
|
||||||
}
|
|
||||||
|
|
||||||
public LanguageModelsBuilder termFrequencies(Path termFrequencies) {
|
|
||||||
this.termFrequencies = termFrequencies;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public LanguageModelsBuilder openNLPSentenceDetectionData(Path openNLPSentenceDetectionData) {
|
|
||||||
this.openNLPSentenceDetectionData = openNLPSentenceDetectionData;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public LanguageModelsBuilder posRules(Path posRules) {
|
|
||||||
this.posRules = posRules;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public LanguageModelsBuilder posDict(Path posDict) {
|
|
||||||
this.posDict = posDict;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public LanguageModelsBuilder fasttextLanguageModel(Path fasttextLanguageModel) {
|
|
||||||
this.fasttextLanguageModel = fasttextLanguageModel;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public LanguageModelsBuilder segments(Path segments) {
|
|
||||||
this.segments = segments;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public LanguageModels build() {
|
|
||||||
return new LanguageModels(this.termFrequencies, this.openNLPSentenceDetectionData, this.posRules, this.posDict, this.fasttextLanguageModel, this.segments);
|
|
||||||
}
|
|
||||||
|
|
||||||
public String toString() {
|
|
||||||
return "LanguageModels.LanguageModelsBuilder(termFrequencies=" + this.termFrequencies + ", openNLPSentenceDetectionData=" + this.openNLPSentenceDetectionData + ", posRules=" + this.posRules + ", posDict=" + this.posDict + ", fasttextLanguageModel=" + this.fasttextLanguageModel + ", segments=" + this.segments + ")";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@@ -1,3 +1,8 @@
|
|||||||
package nu.marginalia;
|
package nu.marginalia;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A record representing a User Agent.
|
||||||
|
* @param uaString - the header value of the User Agent
|
||||||
|
* @param uaIdentifier - what we look for in robots.txt
|
||||||
|
*/
|
||||||
public record UserAgent(String uaString, String uaIdentifier) {}
|
public record UserAgent(String uaString, String uaIdentifier) {}
|
||||||
|
@@ -45,7 +45,7 @@ public class NodeConfigurationService {
|
|||||||
public List<NodeConfiguration> getAll() {
|
public List<NodeConfiguration> getAll() {
|
||||||
try (var conn = dataSource.getConnection();
|
try (var conn = dataSource.getConnection();
|
||||||
var qs = conn.prepareStatement("""
|
var qs = conn.prepareStatement("""
|
||||||
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, NODE_PROFILE, DISABLED
|
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, AUTO_ASSIGN_DOMAINS, KEEP_WARCS, NODE_PROFILE, DISABLED
|
||||||
FROM NODE_CONFIGURATION
|
FROM NODE_CONFIGURATION
|
||||||
""")) {
|
""")) {
|
||||||
var rs = qs.executeQuery();
|
var rs = qs.executeQuery();
|
||||||
@@ -59,6 +59,7 @@ public class NodeConfigurationService {
|
|||||||
rs.getBoolean("ACCEPT_QUERIES"),
|
rs.getBoolean("ACCEPT_QUERIES"),
|
||||||
rs.getBoolean("AUTO_CLEAN"),
|
rs.getBoolean("AUTO_CLEAN"),
|
||||||
rs.getBoolean("PRECESSION"),
|
rs.getBoolean("PRECESSION"),
|
||||||
|
rs.getBoolean("AUTO_ASSIGN_DOMAINS"),
|
||||||
rs.getBoolean("KEEP_WARCS"),
|
rs.getBoolean("KEEP_WARCS"),
|
||||||
NodeProfile.valueOf(rs.getString("NODE_PROFILE")),
|
NodeProfile.valueOf(rs.getString("NODE_PROFILE")),
|
||||||
rs.getBoolean("DISABLED")
|
rs.getBoolean("DISABLED")
|
||||||
@@ -75,7 +76,7 @@ public class NodeConfigurationService {
|
|||||||
public NodeConfiguration get(int nodeId) throws SQLException {
|
public NodeConfiguration get(int nodeId) throws SQLException {
|
||||||
try (var conn = dataSource.getConnection();
|
try (var conn = dataSource.getConnection();
|
||||||
var qs = conn.prepareStatement("""
|
var qs = conn.prepareStatement("""
|
||||||
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, NODE_PROFILE, DISABLED
|
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, AUTO_ASSIGN_DOMAINS, KEEP_WARCS, NODE_PROFILE, DISABLED
|
||||||
FROM NODE_CONFIGURATION
|
FROM NODE_CONFIGURATION
|
||||||
WHERE ID=?
|
WHERE ID=?
|
||||||
""")) {
|
""")) {
|
||||||
@@ -88,6 +89,7 @@ public class NodeConfigurationService {
|
|||||||
rs.getBoolean("ACCEPT_QUERIES"),
|
rs.getBoolean("ACCEPT_QUERIES"),
|
||||||
rs.getBoolean("AUTO_CLEAN"),
|
rs.getBoolean("AUTO_CLEAN"),
|
||||||
rs.getBoolean("PRECESSION"),
|
rs.getBoolean("PRECESSION"),
|
||||||
|
rs.getBoolean("AUTO_ASSIGN_DOMAINS"),
|
||||||
rs.getBoolean("KEEP_WARCS"),
|
rs.getBoolean("KEEP_WARCS"),
|
||||||
NodeProfile.valueOf(rs.getString("NODE_PROFILE")),
|
NodeProfile.valueOf(rs.getString("NODE_PROFILE")),
|
||||||
rs.getBoolean("DISABLED")
|
rs.getBoolean("DISABLED")
|
||||||
@@ -102,7 +104,7 @@ public class NodeConfigurationService {
|
|||||||
try (var conn = dataSource.getConnection();
|
try (var conn = dataSource.getConnection();
|
||||||
var us = conn.prepareStatement("""
|
var us = conn.prepareStatement("""
|
||||||
UPDATE NODE_CONFIGURATION
|
UPDATE NODE_CONFIGURATION
|
||||||
SET DESCRIPTION=?, ACCEPT_QUERIES=?, AUTO_CLEAN=?, PRECESSION=?, KEEP_WARCS=?, DISABLED=?, NODE_PROFILE=?
|
SET DESCRIPTION=?, ACCEPT_QUERIES=?, AUTO_CLEAN=?, PRECESSION=?, AUTO_ASSIGN_DOMAINS=?, KEEP_WARCS=?, DISABLED=?, NODE_PROFILE=?
|
||||||
WHERE ID=?
|
WHERE ID=?
|
||||||
"""))
|
"""))
|
||||||
{
|
{
|
||||||
@@ -110,10 +112,11 @@ public class NodeConfigurationService {
|
|||||||
us.setBoolean(2, config.acceptQueries());
|
us.setBoolean(2, config.acceptQueries());
|
||||||
us.setBoolean(3, config.autoClean());
|
us.setBoolean(3, config.autoClean());
|
||||||
us.setBoolean(4, config.includeInPrecession());
|
us.setBoolean(4, config.includeInPrecession());
|
||||||
us.setBoolean(5, config.keepWarcs());
|
us.setBoolean(5, config.autoAssignDomains());
|
||||||
us.setBoolean(6, config.disabled());
|
us.setBoolean(6, config.keepWarcs());
|
||||||
us.setString(7, config.profile().name());
|
us.setBoolean(7, config.disabled());
|
||||||
us.setInt(8, config.node());
|
us.setString(8, config.profile().name());
|
||||||
|
us.setInt(9, config.node());
|
||||||
|
|
||||||
if (us.executeUpdate() <= 0)
|
if (us.executeUpdate() <= 0)
|
||||||
throw new IllegalStateException("Failed to update configuration");
|
throw new IllegalStateException("Failed to update configuration");
|
||||||
|
@@ -5,6 +5,7 @@ public record NodeConfiguration(int node,
|
|||||||
boolean acceptQueries,
|
boolean acceptQueries,
|
||||||
boolean autoClean,
|
boolean autoClean,
|
||||||
boolean includeInPrecession,
|
boolean includeInPrecession,
|
||||||
|
boolean autoAssignDomains,
|
||||||
boolean keepWarcs,
|
boolean keepWarcs,
|
||||||
NodeProfile profile,
|
NodeProfile profile,
|
||||||
boolean disabled
|
boolean disabled
|
||||||
|
@@ -20,9 +20,7 @@ public enum NodeProfile {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public boolean permitBatchCrawl() {
|
public boolean permitBatchCrawl() {
|
||||||
return isBatchCrawl() ||isMixed();
|
return isBatchCrawl() || isMixed();
|
||||||
}
|
|
||||||
public boolean permitSideload() {
|
|
||||||
return isMixed() || isSideload();
|
|
||||||
}
|
}
|
||||||
|
public boolean permitSideload() { return isSideload() || isMixed(); }
|
||||||
}
|
}
|
||||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.nodecfg;
|
|||||||
|
|
||||||
import com.zaxxer.hikari.HikariConfig;
|
import com.zaxxer.hikari.HikariConfig;
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import nu.marginalia.nodecfg.model.NodeConfiguration;
|
||||||
import nu.marginalia.nodecfg.model.NodeProfile;
|
import nu.marginalia.nodecfg.model.NodeProfile;
|
||||||
import nu.marginalia.test.TestMigrationLoader;
|
import nu.marginalia.test.TestMigrationLoader;
|
||||||
import org.junit.jupiter.api.BeforeAll;
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
@@ -62,6 +63,63 @@ public class NodeConfigurationServiceTest {
|
|||||||
assertEquals(2, list.size());
|
assertEquals(2, list.size());
|
||||||
assertEquals(a, list.get(0));
|
assertEquals(a, list.get(0));
|
||||||
assertEquals(b, list.get(1));
|
assertEquals(b, list.get(1));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Test all the fields that are only exposed via save()
|
||||||
|
@Test
|
||||||
|
public void testSaveChanges() throws SQLException {
|
||||||
|
var original = nodeConfigurationService.create(1, "Test", false, false, NodeProfile.MIXED);
|
||||||
|
|
||||||
|
assertEquals(1, original.node());
|
||||||
|
assertEquals("Test", original.description());
|
||||||
|
assertFalse(original.acceptQueries());
|
||||||
|
|
||||||
|
var precession = new NodeConfiguration(
|
||||||
|
original.node(),
|
||||||
|
"Foo",
|
||||||
|
true,
|
||||||
|
original.autoClean(),
|
||||||
|
original.includeInPrecession(),
|
||||||
|
!original.autoAssignDomains(),
|
||||||
|
original.keepWarcs(),
|
||||||
|
original.profile(),
|
||||||
|
original.disabled()
|
||||||
|
);
|
||||||
|
|
||||||
|
nodeConfigurationService.save(precession);
|
||||||
|
precession = nodeConfigurationService.get(original.node());
|
||||||
|
assertNotEquals(original.autoAssignDomains(), precession.autoAssignDomains());
|
||||||
|
|
||||||
|
var autoClean = new NodeConfiguration(
|
||||||
|
original.node(),
|
||||||
|
"Foo",
|
||||||
|
true,
|
||||||
|
!original.autoClean(),
|
||||||
|
original.includeInPrecession(),
|
||||||
|
original.autoAssignDomains(),
|
||||||
|
original.keepWarcs(),
|
||||||
|
original.profile(),
|
||||||
|
original.disabled()
|
||||||
|
);
|
||||||
|
|
||||||
|
nodeConfigurationService.save(autoClean);
|
||||||
|
autoClean = nodeConfigurationService.get(original.node());
|
||||||
|
assertNotEquals(original.autoClean(), autoClean.autoClean());
|
||||||
|
|
||||||
|
var disabled = new NodeConfiguration(
|
||||||
|
original.node(),
|
||||||
|
"Foo",
|
||||||
|
true,
|
||||||
|
autoClean.autoClean(),
|
||||||
|
autoClean.includeInPrecession(),
|
||||||
|
autoClean.autoAssignDomains(),
|
||||||
|
autoClean.keepWarcs(),
|
||||||
|
autoClean.profile(),
|
||||||
|
!autoClean.disabled()
|
||||||
|
);
|
||||||
|
nodeConfigurationService.save(disabled);
|
||||||
|
disabled = nodeConfigurationService.get(original.node());
|
||||||
|
assertNotEquals(autoClean.disabled(), disabled.disabled());
|
||||||
}
|
}
|
||||||
}
|
}
|
@@ -8,18 +8,23 @@ import com.google.inject.Inject;
|
|||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
import java.util.NoSuchElementException;
|
import java.util.*;
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.OptionalInt;
|
|
||||||
import java.util.concurrent.ExecutionException;
|
import java.util.concurrent.ExecutionException;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
public class DbDomainQueries {
|
public class DbDomainQueries {
|
||||||
private final HikariDataSource dataSource;
|
private final HikariDataSource dataSource;
|
||||||
|
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(DbDomainQueries.class);
|
||||||
|
|
||||||
private final Cache<EdgeDomain, Integer> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
private final Cache<EdgeDomain, Integer> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||||
|
private final Cache<EdgeDomain, DomainIdWithNode> domainWithNodeCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||||
|
private final Cache<Integer, EdgeDomain> domainNameCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||||
|
private final Cache<String, List<DomainWithNode>> siblingsCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public DbDomainQueries(HikariDataSource dataSource)
|
public DbDomainQueries(HikariDataSource dataSource)
|
||||||
@@ -29,16 +34,21 @@ public class DbDomainQueries {
|
|||||||
|
|
||||||
|
|
||||||
public Integer getDomainId(EdgeDomain domain) throws NoSuchElementException {
|
public Integer getDomainId(EdgeDomain domain) throws NoSuchElementException {
|
||||||
try (var connection = dataSource.getConnection()) {
|
try {
|
||||||
|
|
||||||
return domainIdCache.get(domain, () -> {
|
return domainIdCache.get(domain, () -> {
|
||||||
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
try (var connection = dataSource.getConnection();
|
||||||
|
var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||||
|
|
||||||
stmt.setString(1, domain.toString());
|
stmt.setString(1, domain.toString());
|
||||||
var rsp = stmt.executeQuery();
|
var rsp = stmt.executeQuery();
|
||||||
if (rsp.next()) {
|
if (rsp.next()) {
|
||||||
return rsp.getInt(1);
|
return rsp.getInt(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
throw new RuntimeException(ex);
|
||||||
|
}
|
||||||
|
|
||||||
throw new NoSuchElementException();
|
throw new NoSuchElementException();
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@@ -48,8 +58,33 @@ public class DbDomainQueries {
|
|||||||
catch (ExecutionException ex) {
|
catch (ExecutionException ex) {
|
||||||
throw new RuntimeException(ex.getCause());
|
throw new RuntimeException(ex.getCause());
|
||||||
}
|
}
|
||||||
catch (SQLException ex) {
|
}
|
||||||
throw new RuntimeException(ex);
|
|
||||||
|
|
||||||
|
public DomainIdWithNode getDomainIdWithNode(EdgeDomain domain) throws NoSuchElementException {
|
||||||
|
try {
|
||||||
|
return domainWithNodeCache.get(domain, () -> {
|
||||||
|
try (var connection = dataSource.getConnection();
|
||||||
|
var stmt = connection.prepareStatement("SELECT ID, NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||||
|
|
||||||
|
stmt.setString(1, domain.toString());
|
||||||
|
var rsp = stmt.executeQuery();
|
||||||
|
if (rsp.next()) {
|
||||||
|
return new DomainIdWithNode(rsp.getInt(1), rsp.getInt(2));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
throw new RuntimeException(ex);
|
||||||
|
}
|
||||||
|
|
||||||
|
throw new NoSuchElementException();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
catch (UncheckedExecutionException ex) {
|
||||||
|
throw new NoSuchElementException();
|
||||||
|
}
|
||||||
|
catch (ExecutionException ex) {
|
||||||
|
throw new RuntimeException(ex.getCause());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -83,22 +118,62 @@ public class DbDomainQueries {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public Optional<EdgeDomain> getDomain(int id) {
|
public Optional<EdgeDomain> getDomain(int id) {
|
||||||
try (var connection = dataSource.getConnection()) {
|
|
||||||
|
|
||||||
|
EdgeDomain existing = domainNameCache.getIfPresent(id);
|
||||||
|
if (existing != null) {
|
||||||
|
return Optional.of(existing);
|
||||||
|
}
|
||||||
|
|
||||||
|
try (var connection = dataSource.getConnection()) {
|
||||||
try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) {
|
try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) {
|
||||||
stmt.setInt(1, id);
|
stmt.setInt(1, id);
|
||||||
var rsp = stmt.executeQuery();
|
var rsp = stmt.executeQuery();
|
||||||
if (rsp.next()) {
|
if (rsp.next()) {
|
||||||
return Optional.of(new EdgeDomain(rsp.getString(1)));
|
var val = new EdgeDomain(rsp.getString(1));
|
||||||
|
domainNameCache.put(id, val);
|
||||||
|
return Optional.of(val);
|
||||||
}
|
}
|
||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (UncheckedExecutionException ex) {
|
|
||||||
throw new RuntimeException(ex.getCause());
|
|
||||||
}
|
|
||||||
catch (SQLException ex) {
|
catch (SQLException ex) {
|
||||||
throw new RuntimeException(ex);
|
throw new RuntimeException(ex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public List<DomainWithNode> otherSubdomains(EdgeDomain domain, int cnt) throws ExecutionException {
|
||||||
|
String topDomain = domain.topDomain;
|
||||||
|
|
||||||
|
return siblingsCache.get(topDomain, () -> {
|
||||||
|
List<DomainWithNode> ret = new ArrayList<>();
|
||||||
|
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var stmt = conn.prepareStatement("SELECT DOMAIN_NAME, NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_TOP = ? LIMIT ?")) {
|
||||||
|
stmt.setString(1, topDomain);
|
||||||
|
stmt.setInt(2, cnt);
|
||||||
|
|
||||||
|
var rs = stmt.executeQuery();
|
||||||
|
while (rs.next()) {
|
||||||
|
var sibling = new EdgeDomain(rs.getString(1));
|
||||||
|
|
||||||
|
if (sibling.equals(domain))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
ret.add(new DomainWithNode(sibling, rs.getInt(2)));
|
||||||
|
}
|
||||||
|
} catch (SQLException e) {
|
||||||
|
logger.error("Failed to get domain neighbors");
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
});
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public record DomainWithNode (EdgeDomain domain, int nodeAffinity) {
|
||||||
|
public boolean isIndexed() {
|
||||||
|
return nodeAffinity > 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** A domain's numeric id paired with its node affinity value. */
public record DomainIdWithNode (int domainId, int nodeAffinity) { }
|
||||||
}
|
}
|
||||||
|
@@ -1,118 +0,0 @@
|
|||||||
package nu.marginalia.db;
|
|
||||||
|
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
|
||||||
|
|
||||||
import java.sql.Connection;
|
|
||||||
import java.sql.PreparedStatement;
|
|
||||||
import java.sql.SQLException;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.OptionalInt;
|
|
||||||
|
|
||||||
/** Class used in exporting data. This is intended to be used for a brief time
|
|
||||||
* and then discarded, not kept around as a service.
|
|
||||||
*/
|
|
||||||
public class DbDomainStatsExportMultitool implements AutoCloseable {
|
|
||||||
private final Connection connection;
|
|
||||||
private final int nodeId;
|
|
||||||
private final PreparedStatement knownUrlsQuery;
|
|
||||||
private final PreparedStatement visitedUrlsQuery;
|
|
||||||
private final PreparedStatement goodUrlsQuery;
|
|
||||||
private final PreparedStatement domainNameToId;
|
|
||||||
|
|
||||||
private final PreparedStatement allDomainsQuery;
|
|
||||||
private final PreparedStatement crawlQueueDomains;
|
|
||||||
private final PreparedStatement indexedDomainsQuery;
|
|
||||||
|
|
||||||
public DbDomainStatsExportMultitool(HikariDataSource dataSource, int nodeId) throws SQLException {
|
|
||||||
this.connection = dataSource.getConnection();
|
|
||||||
this.nodeId = nodeId;
|
|
||||||
|
|
||||||
knownUrlsQuery = connection.prepareStatement("""
|
|
||||||
SELECT KNOWN_URLS
|
|
||||||
FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
|
|
||||||
ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
|
||||||
WHERE DOMAIN_NAME=?
|
|
||||||
""");
|
|
||||||
visitedUrlsQuery = connection.prepareStatement("""
|
|
||||||
SELECT VISITED_URLS
|
|
||||||
FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
|
|
||||||
ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
|
||||||
WHERE DOMAIN_NAME=?
|
|
||||||
""");
|
|
||||||
goodUrlsQuery = connection.prepareStatement("""
|
|
||||||
SELECT GOOD_URLS
|
|
||||||
FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
|
|
||||||
ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
|
||||||
WHERE DOMAIN_NAME=?
|
|
||||||
""");
|
|
||||||
domainNameToId = connection.prepareStatement("""
|
|
||||||
SELECT ID
|
|
||||||
FROM EC_DOMAIN
|
|
||||||
WHERE DOMAIN_NAME=?
|
|
||||||
""");
|
|
||||||
allDomainsQuery = connection.prepareStatement("""
|
|
||||||
SELECT DOMAIN_NAME
|
|
||||||
FROM EC_DOMAIN
|
|
||||||
""");
|
|
||||||
crawlQueueDomains = connection.prepareStatement("""
|
|
||||||
SELECT DOMAIN_NAME
|
|
||||||
FROM CRAWL_QUEUE
|
|
||||||
""");
|
|
||||||
indexedDomainsQuery = connection.prepareStatement("""
|
|
||||||
SELECT DOMAIN_NAME
|
|
||||||
FROM EC_DOMAIN
|
|
||||||
WHERE INDEXED > 0
|
|
||||||
""");
|
|
||||||
}
|
|
||||||
|
|
||||||
public OptionalInt getVisitedUrls(String domainName) throws SQLException {
|
|
||||||
return executeNameToIntQuery(domainName, visitedUrlsQuery);
|
|
||||||
}
|
|
||||||
|
|
||||||
public OptionalInt getDomainId(String domainName) throws SQLException {
|
|
||||||
return executeNameToIntQuery(domainName, domainNameToId);
|
|
||||||
}
|
|
||||||
|
|
||||||
public List<String> getCrawlQueueDomains() throws SQLException {
|
|
||||||
return executeListQuery(crawlQueueDomains, 100);
|
|
||||||
}
|
|
||||||
public List<String> getAllIndexedDomains() throws SQLException {
|
|
||||||
return executeListQuery(indexedDomainsQuery, 100_000);
|
|
||||||
}
|
|
||||||
|
|
||||||
private OptionalInt executeNameToIntQuery(String domainName, PreparedStatement statement)
|
|
||||||
throws SQLException {
|
|
||||||
statement.setString(1, domainName);
|
|
||||||
var rs = statement.executeQuery();
|
|
||||||
|
|
||||||
if (rs.next()) {
|
|
||||||
return OptionalInt.of(rs.getInt(1));
|
|
||||||
}
|
|
||||||
|
|
||||||
return OptionalInt.empty();
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<String> executeListQuery(PreparedStatement statement, int sizeHint) throws SQLException {
|
|
||||||
List<String> ret = new ArrayList<>(sizeHint);
|
|
||||||
|
|
||||||
var rs = statement.executeQuery();
|
|
||||||
|
|
||||||
while (rs.next()) {
|
|
||||||
ret.add(rs.getString(1));
|
|
||||||
}
|
|
||||||
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void close() throws SQLException {
|
|
||||||
knownUrlsQuery.close();
|
|
||||||
goodUrlsQuery.close();
|
|
||||||
visitedUrlsQuery.close();
|
|
||||||
allDomainsQuery.close();
|
|
||||||
crawlQueueDomains.close();
|
|
||||||
domainNameToId.close();
|
|
||||||
connection.close();
|
|
||||||
}
|
|
||||||
}
|
|
@@ -0,0 +1,5 @@
|
|||||||
|
CREATE TABLE IF NOT EXISTS WMSA_prod.NSFW_DOMAINS (
|
||||||
|
ID INT NOT NULL AUTO_INCREMENT,
|
||||||
|
TIER INT NOT NULL,
|
||||||
|
PRIMARY KEY (ID)
|
||||||
|
);
|
@@ -0,0 +1,213 @@
|
|||||||
|
|
||||||
|
-- Create metadata tables for domain ping status and security information
|
||||||
|
|
||||||
|
-- These are not ICMP pings, but rather HTTP(S) pings to check the availability and security
|
||||||
|
-- of web servers associated with domains, to assess uptime and changes in security configurations
|
||||||
|
-- indicating ownership changes or security issues.
|
||||||
|
|
||||||
|
-- Note: DOMAIN_ID and NODE_ID are used to identify the domain and the node that performed the ping.
|
||||||
|
-- These are strictly speaking foreign keys to the EC_DOMAIN table, but as it
|
||||||
|
-- is strictly append-only, we do not need to enforce foreign key constraints.
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS DOMAIN_AVAILABILITY_INFORMATION (
|
||||||
|
DOMAIN_ID INT NOT NULL PRIMARY KEY,
|
||||||
|
NODE_ID INT NOT NULL,
|
||||||
|
|
||||||
|
SERVER_AVAILABLE BOOLEAN NOT NULL, -- Indicates if the server is available (true) or not (false)
|
||||||
|
SERVER_IP VARBINARY(16), -- IP address of the server (IPv4 or IPv6)
|
||||||
|
SERVER_IP_ASN INTEGER, -- Autonomous System number
|
||||||
|
|
||||||
|
DATA_HASH BIGINT, -- Hash of the data for integrity checks
|
||||||
|
SECURITY_CONFIG_HASH BIGINT, -- Hash of the security configuration for integrity checks
|
||||||
|
|
||||||
|
HTTP_SCHEMA ENUM('HTTP', 'HTTPS'), -- HTTP or HTTPS protocol used
|
||||||
|
HTTP_ETAG VARCHAR(255), -- ETag of the resource as per HTTP headers
|
||||||
|
HTTP_LAST_MODIFIED VARCHAR(255), -- Last modified date of the resource as per HTTP headers
|
||||||
|
HTTP_STATUS INT, -- HTTP status code (e.g., 200, 404, etc.)
|
||||||
|
HTTP_LOCATION VARCHAR(255), -- If the server redirects, this is the location of the redirect
|
||||||
|
HTTP_RESPONSE_TIME_MS SMALLINT UNSIGNED, -- Response time in milliseconds
|
||||||
|
|
||||||
|
ERROR_CLASSIFICATION ENUM('NONE', 'TIMEOUT', 'SSL_ERROR', 'DNS_ERROR', 'CONNECTION_ERROR', 'HTTP_CLIENT_ERROR', 'HTTP_SERVER_ERROR', 'UNKNOWN'), -- Classification of the error if the server is not available
|
||||||
|
ERROR_MESSAGE VARCHAR(255), -- Error message if the server is not available
|
||||||
|
|
||||||
|
TS_LAST_PING TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, -- Timestamp of the last ping
|
||||||
|
TS_LAST_AVAILABLE TIMESTAMP, -- Timestamp of the last time the server was available
|
||||||
|
TS_LAST_ERROR TIMESTAMP, -- Timestamp of the last error encountered
|
||||||
|
|
||||||
|
NEXT_SCHEDULED_UPDATE TIMESTAMP NOT NULL,
|
||||||
|
BACKOFF_CONSECUTIVE_FAILURES INT NOT NULL DEFAULT 0, -- Number of consecutive failures to ping the server
|
||||||
|
BACKOFF_FETCH_INTERVAL INT NOT NULL DEFAULT 60 -- Interval in seconds for the next scheduled ping
|
||||||
|
) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS DOMAIN_AVAILABILITY_INFORMATION__NODE_ID__DOMAIN_ID_IDX ON DOMAIN_AVAILABILITY_INFORMATION (NODE_ID, DOMAIN_ID);
|
||||||
|
CREATE INDEX IF NOT EXISTS DOMAIN_AVAILABILITY_INFORMATION__NEXT_SCHEDULED_UPDATE_IDX ON DOMAIN_AVAILABILITY_INFORMATION (NODE_ID, NEXT_SCHEDULED_UPDATE);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS DOMAIN_SECURITY_INFORMATION (
|
||||||
|
DOMAIN_ID INT NOT NULL PRIMARY KEY,
|
||||||
|
NODE_ID INT NOT NULL,
|
||||||
|
|
||||||
|
ASN INTEGER, -- Autonomous System Number (ASN) of the server
|
||||||
|
HTTP_SCHEMA ENUM('HTTP', 'HTTPS'), -- HTTP or HTTPS protocol used
|
||||||
|
HTTP_VERSION VARCHAR(10), -- HTTP version used (e.g., HTTP/1.1, HTTP/2)
|
||||||
|
HTTP_COMPRESSION VARCHAR(50), -- Compression method used (e.g., gzip, deflate, br)
|
||||||
|
HTTP_CACHE_CONTROL TEXT, -- Cache control directives from HTTP headers
|
||||||
|
|
||||||
|
SSL_CERT_NOT_BEFORE TIMESTAMP, -- Valid from date (usually same as issued)
|
||||||
|
SSL_CERT_NOT_AFTER TIMESTAMP, -- Valid until date (usually same as expires)
|
||||||
|
|
||||||
|
SSL_CERT_ISSUER VARCHAR(255), -- CA that issued the cert
|
||||||
|
SSL_CERT_SUBJECT VARCHAR(255), -- Certificate subject/CN
|
||||||
|
|
||||||
|
SSL_CERT_PUBLIC_KEY_HASH BINARY(32), -- SHA-256 hash of the public key
|
||||||
|
SSL_CERT_SERIAL_NUMBER VARCHAR(100), -- Unique cert serial number
|
||||||
|
SSL_CERT_FINGERPRINT_SHA256 BINARY(32), -- SHA-256 fingerprint for exact identification
|
||||||
|
SSL_CERT_SAN TEXT, -- Subject Alternative Names (JSON array)
|
||||||
|
SSL_CERT_WILDCARD BOOLEAN, -- Wildcard certificate (*.example.com)
|
||||||
|
|
||||||
|
SSL_PROTOCOL VARCHAR(20), -- TLS 1.2, TLS 1.3, etc.
|
||||||
|
SSL_CIPHER_SUITE VARCHAR(100), -- e.g., TLS_AES_256_GCM_SHA384
|
||||||
|
SSL_KEY_EXCHANGE VARCHAR(50), -- ECDHE, RSA, etc.
|
||||||
|
SSL_CERTIFICATE_CHAIN_LENGTH TINYINT, -- Number of certs in chain
|
||||||
|
|
||||||
|
SSL_CERTIFICATE_VALID BOOLEAN, -- Valid cert chain
|
||||||
|
|
||||||
|
HEADER_CORS_ALLOW_ORIGIN TEXT, -- Could be *, specific domains, or null
|
||||||
|
HEADER_CORS_ALLOW_CREDENTIALS BOOLEAN, -- Credential handling
|
||||||
|
HEADER_CONTENT_SECURITY_POLICY_HASH INT, -- CSP header, hash of the policy
|
||||||
|
HEADER_STRICT_TRANSPORT_SECURITY VARCHAR(255), -- HSTS header
|
||||||
|
HEADER_REFERRER_POLICY VARCHAR(50), -- Referrer handling
|
||||||
|
HEADER_X_FRAME_OPTIONS VARCHAR(50), -- Clickjacking protection
|
||||||
|
HEADER_X_CONTENT_TYPE_OPTIONS VARCHAR(50), -- MIME sniffing protection
|
||||||
|
HEADER_X_XSS_PROTECTION VARCHAR(50), -- XSS protection header
|
||||||
|
|
||||||
|
HEADER_SERVER VARCHAR(255), -- Server header (e.g., Apache, Nginx, etc.)
|
||||||
|
HEADER_X_POWERED_BY VARCHAR(255), -- X-Powered-By header (if present)
|
||||||
|
|
||||||
|
TS_LAST_UPDATE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP -- Timestamp of the last SSL check
|
||||||
|
) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
|
||||||
|
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS DOMAIN_SECURITY_INFORMATION__NODE_ID__DOMAIN_ID_IDX ON DOMAIN_SECURITY_INFORMATION (NODE_ID, DOMAIN_ID);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS DOMAIN_SECURITY_EVENTS (
|
||||||
|
CHANGE_ID BIGINT AUTO_INCREMENT PRIMARY KEY, -- Unique identifier for the change
|
||||||
|
DOMAIN_ID INT NOT NULL, -- Domain ID, used as a foreign key to EC_DOMAIN
|
||||||
|
NODE_ID INT NOT NULL,
|
||||||
|
|
||||||
|
TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, -- Timestamp of the change
|
||||||
|
|
||||||
|
CHANGE_ASN BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to ASN (Autonomous System Number)
|
||||||
|
CHANGE_CERTIFICATE_FINGERPRINT BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to SSL certificate fingerprint
|
||||||
|
CHANGE_CERTIFICATE_PROFILE BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to SSL certificate profile (e.g., algorithm, exchange)
|
||||||
|
CHANGE_CERTIFICATE_SAN BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to SSL certificate SAN (Subject Alternative Name)
|
||||||
|
CHANGE_CERTIFICATE_PUBLIC_KEY BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to SSL certificate public key
|
||||||
|
CHANGE_SECURITY_HEADERS BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to security headers
|
||||||
|
CHANGE_IP_ADDRESS BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to IP address
|
||||||
|
CHANGE_SOFTWARE BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to the generator (e.g., web server software)
|
||||||
|
OLD_CERT_TIME_TO_EXPIRY INT, -- Time to expiry of the old certificate in hours, if applicable
|
||||||
|
|
||||||
|
SECURITY_SIGNATURE_BEFORE BLOB NOT NULL, -- Security signature before the change, gzipped json record
|
||||||
|
SECURITY_SIGNATURE_AFTER BLOB NOT NULL -- Security signature after the change, gzipped json record
|
||||||
|
) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS DOMAIN_SECURITY_EVENTS__NODE_ID__DOMAIN_ID_IDX ON DOMAIN_SECURITY_EVENTS (NODE_ID, DOMAIN_ID);
|
||||||
|
CREATE INDEX IF NOT EXISTS DOMAIN_SECURITY_EVENTS__TS_CHANGE_IDX ON DOMAIN_SECURITY_EVENTS (TS_CHANGE);
|
||||||
|
|
||||||
|
-- Log of availability changes, range-partitioned on the month of TS_CHANGE.
--
-- MONTH() yields 1 (January) through 12 (December) and the upper bound of a
-- RANGE partition is exclusive, so the partition for month N is
-- VALUES LESS THAN (N+1).  With a highest bound of 12, rows written in
-- December would have no partition to go to.
--
-- NOTE(review): a database that already created this table with the old bounds
-- keeps them because of IF NOT EXISTS, and needs a separate ALTER TABLE to get
-- a partition that accepts the value 12.
CREATE TABLE IF NOT EXISTS DOMAIN_AVAILABILITY_EVENTS (
    DOMAIN_ID INT NOT NULL,
    NODE_ID INT NOT NULL,

    AVAILABLE BOOLEAN NOT NULL, -- True if the service is available, false if it is not
    OUTAGE_TYPE ENUM('NONE', 'TIMEOUT', 'SSL_ERROR', 'DNS_ERROR', 'CONNECTION_ERROR', 'HTTP_CLIENT_ERROR', 'HTTP_SERVER_ERROR', 'UNKNOWN') NOT NULL,
    HTTP_STATUS_CODE INT, -- HTTP status code if available (e.g., 200, 404, etc.)
    ERROR_MESSAGE VARCHAR(255), -- Specific error details

    TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, -- Timestamp of the change

    AVAILABILITY_RECORD_ID BIGINT AUTO_INCREMENT,
    P_KEY_MONTH TINYINT NOT NULL DEFAULT MONTH(TS_CHANGE), -- Month of the change for partitioning
    PRIMARY KEY (AVAILABILITY_RECORD_ID, P_KEY_MONTH)
)
CHARACTER SET utf8mb4 COLLATE utf8mb4_bin
PARTITION BY RANGE (P_KEY_MONTH) (
    PARTITION p0 VALUES LESS THAN (2), -- January
    PARTITION p1 VALUES LESS THAN (3), -- February
    PARTITION p2 VALUES LESS THAN (4), -- March
    PARTITION p3 VALUES LESS THAN (5), -- April
    PARTITION p4 VALUES LESS THAN (6), -- May
    PARTITION p5 VALUES LESS THAN (7), -- June
    PARTITION p6 VALUES LESS THAN (8), -- July
    PARTITION p7 VALUES LESS THAN (9), -- August
    PARTITION p8 VALUES LESS THAN (10), -- September
    PARTITION p9 VALUES LESS THAN (11), -- October
    PARTITION p10 VALUES LESS THAN (12), -- November
    PARTITION p11 VALUES LESS THAN (13) -- December
);

-- IF NOT EXISTS keeps the index creation re-runnable, like the table creation above
CREATE INDEX IF NOT EXISTS DOMAIN_AVAILABILITY_EVENTS__DOMAIN_ID_TS_IDX ON DOMAIN_AVAILABILITY_EVENTS (DOMAIN_ID, TS_CHANGE);
CREATE INDEX IF NOT EXISTS DOMAIN_AVAILABILITY_EVENTS__TS_CHANGE_IDX ON DOMAIN_AVAILABILITY_EVENTS (TS_CHANGE);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS DOMAIN_DNS_INFORMATION (
|
||||||
|
DNS_ROOT_DOMAIN_ID INT AUTO_INCREMENT PRIMARY KEY,
|
||||||
|
ROOT_DOMAIN_NAME VARCHAR(255) NOT NULL UNIQUE,
|
||||||
|
NODE_AFFINITY INT NOT NULL, -- Node ID that performs the DNS check, assign randomly across nodes
|
||||||
|
|
||||||
|
DNS_A_RECORDS TEXT, -- JSON array of IPv4 addresses
|
||||||
|
DNS_AAAA_RECORDS TEXT, -- JSON array of IPv6 addresses
|
||||||
|
DNS_CNAME_RECORD VARCHAR(255), -- Canonical name (if applicable)
|
||||||
|
DNS_MX_RECORDS TEXT, -- JSON array of mail exchange records
|
||||||
|
DNS_CAA_RECORDS TEXT, -- Certificate Authority Authorization
|
||||||
|
DNS_TXT_RECORDS TEXT, -- TXT records (SPF, DKIM, verification, etc.)
|
||||||
|
DNS_NS_RECORDS TEXT, -- Name servers (JSON array)
|
||||||
|
DNS_SOA_RECORD TEXT, -- Start of Authority (JSON object)
|
||||||
|
|
||||||
|
TS_LAST_DNS_CHECK TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
|
||||||
|
TS_NEXT_DNS_CHECK TIMESTAMP NOT NULL,
|
||||||
|
DNS_CHECK_PRIORITY TINYINT DEFAULT 0 -- Priority of the DNS check, in case we want to schedule a refresh sooner
|
||||||
|
) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
CREATE INDEX DOMAIN_DNS_INFORMATION__PRIORITY_NEXT_CHECK_IDX ON DOMAIN_DNS_INFORMATION (NODE_AFFINITY, DNS_CHECK_PRIORITY DESC, TS_NEXT_DNS_CHECK);
|
||||||
|
|
||||||
|
-- Log of DNS record changes, range-partitioned on the month of TS_CHANGE.
--
-- MONTH() yields 1 (January) through 12 (December) and the upper bound of a
-- RANGE partition is exclusive, so the partition for month N is
-- VALUES LESS THAN (N+1).  With a highest bound of 12, rows written in
-- December would have no partition to go to.
--
-- NOTE(review): a database that already created this table with the old bounds
-- keeps them because of IF NOT EXISTS, and needs a separate ALTER TABLE to get
-- a partition that accepts the value 12.
CREATE TABLE IF NOT EXISTS DOMAIN_DNS_EVENTS (
    DNS_ROOT_DOMAIN_ID INT NOT NULL,
    NODE_ID INT NOT NULL,

    TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,

    -- DNS change type flags
    CHANGE_A_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- IPv4 address changes
    CHANGE_AAAA_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- IPv6 address changes
    CHANGE_CNAME BOOLEAN NOT NULL DEFAULT FALSE, -- CNAME changes
    CHANGE_MX_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- Mail server changes
    CHANGE_CAA_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- Certificate authority changes
    CHANGE_TXT_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- TXT record changes (SPF, DKIM, etc.)
    CHANGE_NS_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- Name server changes (big red flag!)
    CHANGE_SOA_RECORD BOOLEAN NOT NULL DEFAULT FALSE, -- Start of Authority changes

    DNS_SIGNATURE_BEFORE BLOB NOT NULL, -- Compressed JSON snapshot of DNS records before change
    DNS_SIGNATURE_AFTER BLOB NOT NULL, -- Compressed JSON snapshot of DNS records after change

    DNS_EVENT_ID BIGINT AUTO_INCREMENT,
    P_KEY_MONTH TINYINT NOT NULL DEFAULT MONTH(TS_CHANGE), -- Month of the change for partitioning
    PRIMARY KEY (DNS_EVENT_ID, P_KEY_MONTH)
)
CHARACTER SET utf8mb4 COLLATE utf8mb4_bin
PARTITION BY RANGE (P_KEY_MONTH) (
    PARTITION p0 VALUES LESS THAN (2), -- January
    PARTITION p1 VALUES LESS THAN (3), -- February
    PARTITION p2 VALUES LESS THAN (4), -- March
    PARTITION p3 VALUES LESS THAN (5), -- April
    PARTITION p4 VALUES LESS THAN (6), -- May
    PARTITION p5 VALUES LESS THAN (7), -- June
    PARTITION p6 VALUES LESS THAN (8), -- July
    PARTITION p7 VALUES LESS THAN (9), -- August
    PARTITION p8 VALUES LESS THAN (10), -- September
    PARTITION p9 VALUES LESS THAN (11), -- October
    PARTITION p10 VALUES LESS THAN (12), -- November
    PARTITION p11 VALUES LESS THAN (13) -- December
);

-- IF NOT EXISTS keeps the index creation re-runnable, like the table creation above
CREATE INDEX IF NOT EXISTS DOMAIN_DNS_EVENTS__DNS_ROOT_DOMAIN_ID_TS_IDX ON DOMAIN_DNS_EVENTS (DNS_ROOT_DOMAIN_ID, TS_CHANGE);
CREATE INDEX IF NOT EXISTS DOMAIN_DNS_EVENTS__TS_CHANGE_IDX ON DOMAIN_DNS_EVENTS (TS_CHANGE);
|
@@ -0,0 +1,6 @@
|
|||||||
|
-- Add additional summary columns to DOMAIN_SECURITY_EVENTS table
|
||||||
|
-- to make it easier to make sense of certificate changes
|
||||||
|
|
||||||
|
ALTER TABLE DOMAIN_SECURITY_EVENTS ADD COLUMN CHANGE_CERTIFICATE_SERIAL_NUMBER BOOLEAN NOT NULL DEFAULT FALSE;
|
||||||
|
ALTER TABLE DOMAIN_SECURITY_EVENTS ADD COLUMN CHANGE_CERTIFICATE_ISSUER BOOLEAN NOT NULL DEFAULT FALSE;
|
||||||
|
OPTIMIZE TABLE DOMAIN_SECURITY_EVENTS;
|
@@ -0,0 +1,7 @@
|
|||||||
|
-- Add additional summary columns to DOMAIN_SECURITY_INFORMATION table
|
||||||
|
-- to make it easier to get more information about the SSL certificate's validity
|
||||||
|
|
||||||
|
ALTER TABLE DOMAIN_SECURITY_INFORMATION ADD COLUMN SSL_CHAIN_VALID BOOLEAN DEFAULT NULL;
|
||||||
|
ALTER TABLE DOMAIN_SECURITY_INFORMATION ADD COLUMN SSL_HOST_VALID BOOLEAN DEFAULT NULL;
|
||||||
|
ALTER TABLE DOMAIN_SECURITY_INFORMATION ADD COLUMN SSL_DATE_VALID BOOLEAN DEFAULT NULL;
|
||||||
|
OPTIMIZE TABLE DOMAIN_SECURITY_INFORMATION;
|
@@ -0,0 +1,5 @@
|
|||||||
|
-- Add a CHANGE_SCHEMA summary column to the DOMAIN_SECURITY_EVENTS table
|
||||||
|
-- to record whether a change switched the site between HTTP and HTTPS
|
||||||
|
|
||||||
|
ALTER TABLE DOMAIN_SECURITY_EVENTS ADD COLUMN CHANGE_SCHEMA ENUM('NONE', 'HTTP_TO_HTTPS', 'HTTPS_TO_HTTP', 'UNKNOWN') NOT NULL DEFAULT 'UNKNOWN';
|
||||||
|
OPTIMIZE TABLE DOMAIN_SECURITY_EVENTS;
|
@@ -0,0 +1,12 @@
|
|||||||
|
-- Table holding domains to be processed by the NDP in order to decide whether
|
||||||
|
-- they should be added to the crawl.
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS NDP_NEW_DOMAINS(
|
||||||
|
DOMAIN_ID INT NOT NULL PRIMARY KEY,
|
||||||
|
STATE ENUM ('NEW', 'ACCEPTED', 'REJECTED') NOT NULL DEFAULT 'NEW',
|
||||||
|
PRIORITY INT NOT NULL DEFAULT 0,
|
||||||
|
TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
|
||||||
|
CHECK_COUNT INT NOT NULL DEFAULT 0
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS NDP_NEW_DOMAINS__STATE_PRIORITY ON NDP_NEW_DOMAINS (STATE, PRIORITY DESC);
|
@@ -0,0 +1,3 @@
|
|||||||
|
-- Migration script to add AUTO_ASSIGN_DOMAINS column to NODE_CONFIGURATION table
|
||||||
|
|
||||||
|
ALTER TABLE NODE_CONFIGURATION ADD COLUMN AUTO_ASSIGN_DOMAINS BOOLEAN NOT NULL DEFAULT TRUE;
|
@@ -0,0 +1,24 @@
|
|||||||
|
package nu.marginalia.model;
|
||||||
|
|
||||||
|
/**
 * The document formats known to the model.  Every format carries two
 * quality-score tuning parameters and a short label; several formats
 * share the label {@code "html"}.
 */
public enum DocumentFormat {
    PLAIN   ( 0.0, 1.0,  "text"),
    PDF     ( 0.0, 1.0,  "pdf"),
    UNKNOWN ( 0.0, 1.0,  "???"),
    HTML123 ( 0.0, 1.0,  "html"),
    HTML4   (-0.1, 1.05, "html"),
    XHTML   (-0.1, 1.05, "html"),
    HTML5   ( 0.5, 1.1,  "html");

    /** Used to tune quality score */
    public final double offset;

    /** Used to tune quality score */
    public final double scale;

    /** Short label for the format */
    public final String shortFormat;

    DocumentFormat(double qualityOffset, double qualityScale, String label) {
        offset = qualityOffset;
        scale = qualityScale;
        shortFormat = label;
    }
}
|
@@ -14,7 +14,7 @@ public class EdgeDomain implements Serializable {
|
|||||||
@Nonnull
|
@Nonnull
|
||||||
public final String topDomain;
|
public final String topDomain;
|
||||||
|
|
||||||
public EdgeDomain(String host) {
|
public EdgeDomain(@Nonnull String host) {
|
||||||
Objects.requireNonNull(host, "domain name must not be null");
|
Objects.requireNonNull(host, "domain name must not be null");
|
||||||
|
|
||||||
host = host.toLowerCase();
|
host = host.toLowerCase();
|
||||||
@@ -61,6 +61,10 @@ public class EdgeDomain implements Serializable {
|
|||||||
this.topDomain = topDomain;
|
this.topDomain = topDomain;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static String getTopDomain(String host) {
|
||||||
|
return new EdgeDomain(host).topDomain;
|
||||||
|
}
|
||||||
|
|
||||||
private boolean looksLikeGovTld(String host) {
|
private boolean looksLikeGovTld(String host) {
|
||||||
if (host.length() < 8)
|
if (host.length() < 8)
|
||||||
return false;
|
return false;
|
||||||
@@ -108,32 +112,6 @@ public class EdgeDomain implements Serializable {
|
|||||||
return topDomain;
|
return topDomain;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getDomainKey() {
|
|
||||||
int cutPoint = topDomain.indexOf('.');
|
|
||||||
if (cutPoint < 0) {
|
|
||||||
return topDomain;
|
|
||||||
}
|
|
||||||
return topDomain.substring(0, cutPoint).toLowerCase();
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getLongDomainKey() {
|
|
||||||
StringBuilder ret = new StringBuilder();
|
|
||||||
|
|
||||||
int cutPoint = topDomain.indexOf('.');
|
|
||||||
if (cutPoint < 0) {
|
|
||||||
ret.append(topDomain);
|
|
||||||
} else {
|
|
||||||
ret.append(topDomain, 0, cutPoint);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!subDomain.isEmpty() && !"www".equals(subDomain)) {
|
|
||||||
ret.append(":");
|
|
||||||
ret.append(subDomain);
|
|
||||||
}
|
|
||||||
|
|
||||||
return ret.toString().toLowerCase();
|
|
||||||
}
|
|
||||||
|
|
||||||
/** If possible, try to provide an alias domain,
|
/** If possible, try to provide an alias domain,
|
||||||
* i.e. a domain name that is very likely to link to this one
|
* i.e. a domain name that is very likely to link to this one
|
||||||
* */
|
* */
|
||||||
|
@@ -1,16 +1,14 @@
|
|||||||
package nu.marginalia.model;
|
package nu.marginalia.model;
|
||||||
|
|
||||||
import nu.marginalia.util.QueryParams;
|
import nu.marginalia.util.QueryParams;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import javax.annotation.Nullable;
|
import javax.annotation.Nullable;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.net.MalformedURLException;
|
import java.net.*;
|
||||||
import java.net.URI;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.net.URISyntaxException;
|
|
||||||
import java.net.URL;
|
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.regex.Pattern;
|
|
||||||
|
|
||||||
public class EdgeUrl implements Serializable {
|
public class EdgeUrl implements Serializable {
|
||||||
public final String proto;
|
public final String proto;
|
||||||
@@ -33,7 +31,7 @@ public class EdgeUrl implements Serializable {
|
|||||||
|
|
||||||
private static URI parseURI(String url) throws URISyntaxException {
|
private static URI parseURI(String url) throws URISyntaxException {
|
||||||
try {
|
try {
|
||||||
return new URI(urlencodeFixer(url));
|
return EdgeUriFactory.parseURILenient(url);
|
||||||
} catch (URISyntaxException ex) {
|
} catch (URISyntaxException ex) {
|
||||||
throw new URISyntaxException("Failed to parse URI '" + url + "'", ex.getMessage());
|
throw new URISyntaxException("Failed to parse URI '" + url + "'", ex.getMessage());
|
||||||
}
|
}
|
||||||
@@ -51,58 +49,6 @@ public class EdgeUrl implements Serializable {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Pattern badCharPattern = Pattern.compile("[ \t\n\"<>\\[\\]()',|]");
|
|
||||||
|
|
||||||
/* Java's URI parser is a bit too strict in throwing exceptions when there's an error.
|
|
||||||
|
|
||||||
Here on the Internet, standards are like the picture on the box of the frozen pizza,
|
|
||||||
and what you get is more like what's on the inside, we try to patch things instead,
|
|
||||||
just give it a best-effort attempt at cleaning out broken or unnecessary constructions
|
|
||||||
like bad or missing URLEncoding
|
|
||||||
*/
|
|
||||||
public static String urlencodeFixer(String url) throws URISyntaxException {
|
|
||||||
var s = new StringBuilder();
|
|
||||||
String goodChars = "&.?:/-;+$#";
|
|
||||||
String hexChars = "0123456789abcdefABCDEF";
|
|
||||||
|
|
||||||
int pathIdx = findPathIdx(url);
|
|
||||||
if (pathIdx < 0) { // url looks like http://marginalia.nu
|
|
||||||
return url + "/";
|
|
||||||
}
|
|
||||||
s.append(url, 0, pathIdx);
|
|
||||||
|
|
||||||
// We don't want the fragment, and multiple fragments breaks the Java URIParser for some reason
|
|
||||||
int end = url.indexOf("#");
|
|
||||||
if (end < 0) end = url.length();
|
|
||||||
|
|
||||||
for (int i = pathIdx; i < end; i++) {
|
|
||||||
int c = url.charAt(i);
|
|
||||||
|
|
||||||
if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
|
|
||||||
s.appendCodePoint(c);
|
|
||||||
} else if (c == '%' && i + 2 < end) {
|
|
||||||
int cn = url.charAt(i + 1);
|
|
||||||
int cnn = url.charAt(i + 2);
|
|
||||||
if (hexChars.indexOf(cn) >= 0 && hexChars.indexOf(cnn) >= 0) {
|
|
||||||
s.appendCodePoint(c);
|
|
||||||
} else {
|
|
||||||
s.append("%25");
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
s.append(String.format("%%%02X", c));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return s.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static int findPathIdx(String url) throws URISyntaxException {
|
|
||||||
int colonIdx = url.indexOf(':');
|
|
||||||
if (colonIdx < 0 || colonIdx + 2 >= url.length()) {
|
|
||||||
throw new URISyntaxException(url, "Lacking protocol");
|
|
||||||
}
|
|
||||||
return url.indexOf('/', colonIdx + 2);
|
|
||||||
}
|
|
||||||
|
|
||||||
public EdgeUrl(URI URI) {
|
public EdgeUrl(URI URI) {
|
||||||
try {
|
try {
|
||||||
@@ -166,11 +112,32 @@ public class EdgeUrl implements Serializable {
|
|||||||
sb.append(port);
|
sb.append(port);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
EdgeUriFactory.urlencodePath(sb, path);
|
||||||
|
|
||||||
|
if (param != null) {
|
||||||
|
EdgeUriFactory.urlencodeQuery(sb, param);
|
||||||
|
}
|
||||||
|
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public String toDisplayString() {
|
||||||
|
StringBuilder sb = new StringBuilder(256);
|
||||||
|
|
||||||
|
sb.append(proto);
|
||||||
|
sb.append("://");
|
||||||
|
sb.append(domain);
|
||||||
|
|
||||||
|
if (port != null) {
|
||||||
|
sb.append(':');
|
||||||
|
sb.append(port);
|
||||||
|
}
|
||||||
|
|
||||||
sb.append(path);
|
sb.append(path);
|
||||||
|
|
||||||
if (param != null) {
|
if (param != null) {
|
||||||
sb.append('?');
|
sb.append('?').append(param);
|
||||||
sb.append(param);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return sb.toString();
|
return sb.toString();
|
||||||
@@ -247,3 +214,244 @@ public class EdgeUrl implements Serializable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Lenient construction of {@link URI} objects from URLs as they are found in the wild.
 * <p></p>
 * {@link URI} rejects URLs containing raw spaces, quotes, non-ASCII characters and similar,
 * so this class percent-encodes the path and query components that need it before handing
 * the result to the {@link URI} constructor.  The fragment is always discarded.
 */
class EdgeUriFactory {

    /** Parse a URL, repairing missing percent-encoding in its path and query.
     * <p></p>
     * The fragment (everything from the first '#') is dropped, a missing path is
     * replaced by "/", repeated slashes in the path are collapsed and empty query
     * parameters are removed.
     *
     * @param url the URL to parse, must not be null
     * @return the parsed URI, without fragment
     * @throws URISyntaxException if the URL lacks a scheme or is still malformed after repair
     */
    public static URI parseURILenient(String url) throws URISyntaxException {

        // We don't want the fragment, and multiple fragments breaks the Java URIParser.
        // Cutting it off up front also keeps a '?' or '/' inside the fragment from being
        // mistaken for the start of the query or the path.
        final int fragmentIdx = url.indexOf('#');
        final String target = (fragmentIdx < 0) ? url : url.substring(0, fragmentIdx);

        if (shouldOmitUrlencodeRepair(target)) {
            try {
                return new URI(target);
            }
            catch (URISyntaxException ignored) {
                // the heuristic was too optimistic; fall through to the repairing parser
            }
        }

        final int end = target.length();

        int queryIdx = target.indexOf('?');
        if (queryIdx < 0) queryIdx = end;

        final int pathIdx = findPathIdx(target);

        var s = new StringBuilder(end + 8);

        if (pathIdx < 0 || pathIdx > queryIdx) {
            // No path, the url looks like http://marginalia.nu or http://marginalia.nu?q=a/b
            // (in the latter case the slash that was found belongs to the query)
            s.append(target, 0, queryIdx).append('/');
        }
        else {
            s.append(target, 0, pathIdx);
            urlencodePath(s, target.substring(pathIdx, queryIdx));
        }

        if (queryIdx < end) {
            urlencodeQuery(s, target.substring(queryIdx + 1));
        }

        return new URI(s.toString());
    }

    /** Break apart the path element of an URI into its components, and then
     * urlencode any component that needs it, and recombine it into a single
     * path element again.
     * <p></p>
     * Each component is judged on its own, so a component that is already
     * percent-encoded is left alone even if a sibling component has to be encoded.
     * Empty components (repeated slashes) are dropped; a trailing slash is kept.
     *
     * @param sb   the builder the encoded path is appended to
     * @param path the raw path; null or empty appends nothing
     */
    public static void urlencodePath(StringBuilder sb, String path) {
        if (path == null || path.isEmpty()) {
            return;
        }

        boolean wroteSegment = false;

        for (String pathPart : path.split("/")) {
            if (pathPart.isEmpty()) continue;

            sb.append('/');
            if (needsUrlEncode(pathPart)) {
                // URLEncoder does form encoding, where space is '+'; in a path it must be %20
                sb.append(URLEncoder.encode(pathPart, StandardCharsets.UTF_8).replace("+", "%20"));
            } else {
                sb.append(pathPart);
            }
            wroteSegment = true;
        }

        // A path made up of nothing but slashes collapses into a single slash
        if (!wroteSegment || path.endsWith("/")) {
            sb.append('/');
        }
    }

    /** Break apart the query element of a URI into its components, and then
     * urlencode any component that needs it, and recombine it into a single
     * query element again.
     * <p></p>
     * Parameters are separated on '&amp;', and each parameter is split into key and value
     * on the first '='.  Keys and values are judged separately, so an already encoded
     * value such as {@code hello%20world} is not encoded a second time.  Empty parameters
     * are dropped.  The leading '?' is appended by this method.
     *
     * @param sb    the builder the encoded query is appended to
     * @param param the raw query without the leading '?'; null or empty appends nothing
     */
    public static void urlencodeQuery(StringBuilder sb, String param) {
        if (param == null || param.isEmpty()) {
            return;
        }

        char separator = '?';

        for (String queryPart : param.split("&")) {
            if (queryPart.isEmpty()) continue;

            sb.append(separator);
            separator = '&';

            int idx = queryPart.indexOf('=');
            if (idx < 0) {
                appendQueryComponent(sb, queryPart);
            } else {
                appendQueryComponent(sb, queryPart.substring(0, idx));
                sb.append('=');
                appendQueryComponent(sb, queryPart.substring(idx + 1));
            }
        }
    }

    /** Append a single query key or value, form-encoding it only if it needs it. */
    private static void appendQueryComponent(StringBuilder sb, String component) {
        if (needsUrlEncode(component)) {
            sb.append(URLEncoder.encode(component, StandardCharsets.UTF_8));
        } else {
            sb.append(component);
        }
    }

    /** Test if the url element needs URL encoding.
     * <p></p>
     * Note we may have been given an already encoded path element,
     * so we include + and well-formed %XX escapes in the list of good characters.
     * A '%' that is not followed by two hex digits does need encoding.
     */
    static boolean needsUrlEncode(String urlElement) {
        for (int i = 0; i < urlElement.length(); i++) {
            char c = urlElement.charAt(i);

            if (isUrlSafe(c)) continue;
            if (c == '+') continue;
            if (c == '%' && i + 2 < urlElement.length()) {
                char c1 = urlElement.charAt(i + 1);
                char c2 = urlElement.charAt(i + 2);
                if (isHexDigit(c1) && isHexDigit(c2)) {
                    i += 2; // skip the two hex digits of the escape
                    continue;
                }
            }

            return true;
        }

        return false;
    }

    /** Test if the character is an ASCII letter, digit, or one of "-_.~" */
    static boolean isUrlSafe(int c) {
        if (c >= 'a' && c <= 'z') return true;
        if (c >= 'A' && c <= 'Z') return true;
        if (c >= '0' && c <= '9') return true;
        if (c == '-' || c == '_' || c == '.' || c == '~') return true;

        return false;
    }

    /** Test if the URL is a valid URL that does not need to be
     * urlencoded or otherwise repaired.
     * <p></p>
     * This is a very simple heuristic test that does not guarantee
     * that the URL is valid, but it will identify cases where we
     * are fairly certain that the URL does not need encoding,
     * so we can skip a bunch of allocations and string operations
     * that would otherwise be needed to fix the URL.
     * <p></p>
     * The test only passes URLs that the repairing parser would hand back
     * unchanged (fragment aside): an alphabetic scheme followed by "://", an
     * authority of url-safe characters, ':' and '@', an explicit path without
     * repeated slashes, and optionally a query of non-empty parameters with at most
     * one '=' each.  Anything from the first '#' onward is ignored.
     */
    static boolean shouldOmitUrlencodeRepair(String url) {
        final int hashIdx = url.indexOf('#');
        final int len = (hashIdx < 0) ? url.length() : hashIdx;

        int idx = 0;

        // Validate the scheme
        while (idx < len && isAsciiAlphabetic(url.charAt(idx))) {
            idx++;
        }
        if (idx == 0) return false;
        if (!url.startsWith("://", idx)) return false;
        idx += 3;

        // Validate the authority
        boolean hasPath = false;
        while (idx < len) {
            char c = url.charAt(idx++);
            if (c == '/') {
                hasPath = true;
                break;
            }
            if (c == ':') continue;
            if (c == '@') continue;
            if (!isUrlSafe(c)) return false;
        }

        // The repairing parser adds a "/" to URLs with no path, so those are not unchanged
        if (!hasPath) return false;

        // Validate the path
        boolean hasQuery = false;
        char prev = '/';
        while (idx < len) {
            char c = url.charAt(idx++);
            if (c == '?') {
                hasQuery = true;
                break;
            }
            if (c == '/') {
                if (prev == '/') return false; // repeated slashes get collapsed
            }
            else if (!isUrlSafe(c)) return false;
            prev = c;
        }

        if (!hasQuery) return true;

        // Validate the query; prev starts out as a separator so that an empty query,
        // a leading '&', a doubled '&' and a trailing '&' are all turned down
        boolean seenEquals = false;
        prev = '&';
        while (idx < len) {
            char c = url.charAt(idx++);
            if (c == '&') {
                if (prev == '&') return false;
                seenEquals = false;
            }
            else if (c == '=') {
                if (seenEquals) return false; // a second '=' is part of the value and gets encoded
                seenEquals = true;
            }
            else if (!isUrlSafe(c)) return false;
            prev = c;
        }

        return prev != '&';
    }

    /** Test if the character is an ASCII letter */
    private static boolean isAsciiAlphabetic(int c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }

    /** Test if the character is a hexadecimal digit, in either case */
    private static boolean isHexDigit(int c) {
        return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
    }

    /** Find the index of the path element in a URL.
     * <p></p>
     * The search starts three characters after the first colon, that is just
     * past the "://" of a well-formed URL, and looks for the next slash.
     *
     * @return the index of that slash, or a negative value if there is none
     * @throws URISyntaxException if there is no colon, or nothing follows the "://"
     */
    private static int findPathIdx(String url) throws URISyntaxException {
        int colonIdx = url.indexOf(':');
        if (colonIdx < 0 || colonIdx + 3 >= url.length()) {
            throw new URISyntaxException(url, "Lacking scheme");
        }
        return url.indexOf('/', colonIdx + 3);
    }

}
|
@@ -28,6 +28,8 @@ public enum HtmlFeature {
|
|||||||
|
|
||||||
GA_SPAM("special:gaspam"),
|
GA_SPAM("special:gaspam"),
|
||||||
|
|
||||||
|
PDF("format:pdf"),
|
||||||
|
|
||||||
/** For fingerprinting and ranking */
|
/** For fingerprinting and ranking */
|
||||||
OPENGRAPH("special:opengraph"),
|
OPENGRAPH("special:opengraph"),
|
||||||
OPENGRAPH_IMAGE("special:opengraph:image"),
|
OPENGRAPH_IMAGE("special:opengraph:image"),
|
||||||
|
@@ -6,11 +6,20 @@ import nu.marginalia.model.EdgeDomain;
|
|||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
|
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
|
import java.time.Instant;
|
||||||
|
|
||||||
public class GsonFactory {
|
public class GsonFactory {
|
||||||
public static Gson get() {
|
public static Gson get() {
|
||||||
return new GsonBuilder()
|
return new GsonBuilder()
|
||||||
.registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
|
.registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
|
||||||
|
.registerTypeAdapter(Instant.class, (JsonSerializer<Instant>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toEpochMilli()))
|
||||||
|
.registerTypeAdapter(Instant.class, (JsonDeserializer<Instant>) (json, typeOfT, context) -> {
|
||||||
|
if (json.isJsonPrimitive() && json.getAsJsonPrimitive().isNumber()) {
|
||||||
|
return Instant.ofEpochMilli(json.getAsLong());
|
||||||
|
} else {
|
||||||
|
throw new JsonParseException("Expected a number for Instant");
|
||||||
|
}
|
||||||
|
})
|
||||||
.registerTypeAdapter(EdgeUrl.class, (JsonSerializer<EdgeUrl>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
|
.registerTypeAdapter(EdgeUrl.class, (JsonSerializer<EdgeUrl>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
|
||||||
.registerTypeAdapter(EdgeDomain.class, (JsonSerializer<EdgeDomain>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
|
.registerTypeAdapter(EdgeDomain.class, (JsonSerializer<EdgeDomain>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
|
||||||
.registerTypeAdapter(EdgeUrl.class, (JsonDeserializer<EdgeUrl>) (json, typeOfT, context) -> {
|
.registerTypeAdapter(EdgeUrl.class, (JsonDeserializer<EdgeUrl>) (json, typeOfT, context) -> {
|
||||||
|
@@ -1,22 +0,0 @@
|
|||||||
package nu.marginalia.model.html;
|
|
||||||
|
|
||||||
// This class really doesn't belong anywhere, but will squat here for now
|
|
||||||
public enum HtmlStandard {
|
|
||||||
PLAIN(0, 1),
|
|
||||||
UNKNOWN(0, 1),
|
|
||||||
HTML123(0, 1),
|
|
||||||
HTML4(-0.1, 1.05),
|
|
||||||
XHTML(-0.1, 1.05),
|
|
||||||
HTML5(0.5, 1.1);
|
|
||||||
|
|
||||||
/** Used to tune quality score */
|
|
||||||
public final double offset;
|
|
||||||
/** Used to tune quality score */
|
|
||||||
public final double scale;
|
|
||||||
|
|
||||||
HtmlStandard(double offset, double scale) {
|
|
||||||
this.offset = offset;
|
|
||||||
this.scale = scale;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@@ -9,7 +9,7 @@ public enum DocumentFlags {
|
|||||||
GeneratorForum,
|
GeneratorForum,
|
||||||
GeneratorWiki,
|
GeneratorWiki,
|
||||||
Sideloaded,
|
Sideloaded,
|
||||||
Unused7,
|
PdfFile,
|
||||||
Unused8,
|
Unused8,
|
||||||
;
|
;
|
||||||
|
|
||||||
|
@@ -83,6 +83,11 @@ public class QueryParams {
|
|||||||
if (path.endsWith("StoryView.py")) { // folklore.org is neat
|
if (path.endsWith("StoryView.py")) { // folklore.org is neat
|
||||||
return param.startsWith("project=") || param.startsWith("story=");
|
return param.startsWith("project=") || param.startsWith("story=");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// www.perseus.tufts.edu:
|
||||||
|
if (param.startsWith("collection=")) return true;
|
||||||
|
if (param.startsWith("doc=")) return true;
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -8,14 +8,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
|
|||||||
|
|
||||||
class EdgeDomainTest {
|
class EdgeDomainTest {
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testSkepdic() throws URISyntaxException {
|
|
||||||
var domain = new EdgeUrl("http://www.skepdic.com/astrology.html");
|
|
||||||
assertEquals("skepdic", domain.getDomain().getDomainKey());
|
|
||||||
var domain2 = new EdgeUrl("http://skepdic.com/astrology.html");
|
|
||||||
assertEquals("skepdic", domain2.getDomain().getDomainKey());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testHkDomain() throws URISyntaxException {
|
public void testHkDomain() throws URISyntaxException {
|
||||||
var domain = new EdgeUrl("http://l7072i3.l7c.net");
|
var domain = new EdgeUrl("http://l7072i3.l7c.net");
|
||||||
|
@@ -1,6 +1,6 @@
|
|||||||
package nu.marginalia.model;
|
package nu.marginalia.model;
|
||||||
|
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import org.junit.jupiter.api.Assertions;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
@@ -21,25 +21,70 @@ class EdgeUrlTest {
|
|||||||
new EdgeUrl("https://memex.marginalia.nu/#here")
|
new EdgeUrl("https://memex.marginalia.nu/#here")
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testParam() throws URISyntaxException {
|
void testUriFromString() throws URISyntaxException {
|
||||||
System.out.println(new EdgeUrl("https://memex.marginalia.nu/index.php?id=1").toString());
|
// We test these URLs several times as we perform URLEncode-fixing both when parsing the URL and when
|
||||||
System.out.println(new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
|
// converting it back to a string, we want to ensure there is no changes along the way.
|
||||||
}
|
|
||||||
@Test
|
Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/").getPath());
|
||||||
void urlencodeFixer() throws URISyntaxException {
|
Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/").toString());
|
||||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/#heredoc"));
|
Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/").toString());
|
||||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign"));
|
|
||||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%22-sign"));
|
Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").getPath());
|
||||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/\n \"huh\""));
|
Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").toString());
|
||||||
|
Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/#heredoc").toString());
|
||||||
|
|
||||||
|
Assertions.assertEquals("/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").getPath());
|
||||||
|
Assertions.assertEquals("https://www.example.com/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").toString());
|
||||||
|
Assertions.assertEquals("https://www.example.com/trailingslash/", new EdgeUrl("https://www.example.com/trailingslash/").toString());
|
||||||
|
|
||||||
|
Assertions.assertEquals("/%-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").getPath());
|
||||||
|
Assertions.assertEquals("https://www.example.com/%25-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").toString());
|
||||||
|
Assertions.assertEquals("https://www.example.com/%25-sign", new EdgeUrl("https://www.example.com/%-sign").toString());
|
||||||
|
|
||||||
|
Assertions.assertEquals("/%-sign/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com//%-sign/\"-sign").getPath());
|
||||||
|
Assertions.assertEquals("https://www.example.com/%25-sign/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com//%-sign/\"-sign").toString());
|
||||||
|
Assertions.assertEquals("https://www.example.com/%25-sign/%22-sign", new EdgeUrl("https://www.example.com//%-sign/\"-sign").toString());
|
||||||
|
|
||||||
|
Assertions.assertEquals("/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").getPath());
|
||||||
|
Assertions.assertEquals("https://www.example.com/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").toString());
|
||||||
|
Assertions.assertEquals("https://www.example.com/%22-sign", new EdgeUrl("https://www.example.com/%22-sign").toString());
|
||||||
|
|
||||||
|
Assertions.assertEquals("/\n \"huh\"", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").getPath());
|
||||||
|
Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").toString());
|
||||||
|
Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", new EdgeUrl("https://www.example.com/\n \"huh\"").toString());
|
||||||
|
|
||||||
|
Assertions.assertEquals("/wiki/Sámi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").getPath());
|
||||||
|
Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").toString());
|
||||||
|
Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", new EdgeUrl("https://en.wikipedia.org/wiki/Sámi").toString());
|
||||||
|
|
||||||
|
Assertions.assertEquals("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k", new EdgeUrl("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k").toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void testParms() throws URISyntaxException {
|
void testParms() throws URISyntaxException {
|
||||||
System.out.println(new EdgeUrl("https://search.marginalia.nu/?id=123"));
|
Assertions.assertEquals("id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").param);
|
||||||
System.out.println(new EdgeUrl("https://search.marginalia.nu/?t=123"));
|
Assertions.assertEquals("https://search.marginalia.nu/?id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").toString());
|
||||||
System.out.println(new EdgeUrl("https://search.marginalia.nu/?v=123"));
|
|
||||||
System.out.println(new EdgeUrl("https://search.marginalia.nu/?m=123"));
|
Assertions.assertEquals("t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").param);
|
||||||
System.out.println(new EdgeUrl("https://search.marginalia.nu/?follow=123"));
|
Assertions.assertEquals("https://search.marginalia.nu/?t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").toString());
|
||||||
|
|
||||||
|
Assertions.assertEquals("v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").param);
|
||||||
|
Assertions.assertEquals("https://search.marginalia.nu/?v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").toString());
|
||||||
|
|
||||||
|
Assertions.assertEquals("id=1", new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").param);
|
||||||
|
Assertions.assertEquals("https://memex.marginalia.nu/showthread.php?id=1",
|
||||||
|
new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
|
||||||
|
|
||||||
|
|
||||||
|
Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").param);
|
||||||
|
Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").toString());
|
||||||
|
|
||||||
|
Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").param);
|
||||||
|
Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").toString());
|
||||||
|
|
||||||
|
Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?m=123").param);
|
||||||
|
Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?follow=123").param);
|
||||||
}
|
}
|
||||||
}
|
}
|
@@ -59,16 +59,13 @@ public class ProcessAdHocTaskHeartbeatImpl implements AutoCloseable, ProcessAdHo
|
|||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public void progress(String step, int stepProgress, int stepCount) {
|
public void progress(String step, int stepProgress, int stepCount) {
|
||||||
|
int lastProgress = this.progress;
|
||||||
this.step = step;
|
this.step = step;
|
||||||
|
|
||||||
|
|
||||||
// off by one since we calculate the progress based on the number of steps,
|
|
||||||
// and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
|
|
||||||
// final progress being 80% and not 100%)
|
|
||||||
|
|
||||||
this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);
|
this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);
|
||||||
|
|
||||||
logger.info("ProcessTask {} progress: {}%", taskBase, progress);
|
if (this.progress / 10 != lastProgress / 10) {
|
||||||
|
logger.info("ProcessTask {} progress: {}%", taskBase, progress);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Wrap a collection to provide heartbeat progress updates as it's iterated through */
|
/** Wrap a collection to provide heartbeat progress updates as it's iterated through */
|
||||||
|
@@ -0,0 +1,59 @@
|
|||||||
|
package nu.marginalia.process.control;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import nu.marginalia.process.ProcessConfiguration;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.sql.SQLException;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.UUID;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
|
public class ProcessEventLog {
|
||||||
|
private final HikariDataSource dataSource;
|
||||||
|
|
||||||
|
private final Logger logger = LoggerFactory.getLogger(ProcessEventLog.class);
|
||||||
|
|
||||||
|
private final String serviceName;
|
||||||
|
private final UUID instanceUuid;
|
||||||
|
private final String serviceBase;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public ProcessEventLog(HikariDataSource dataSource, ProcessConfiguration configuration) {
|
||||||
|
this.dataSource = dataSource;
|
||||||
|
|
||||||
|
this.serviceName = configuration.processName() + ":" + configuration.node();
|
||||||
|
this.instanceUuid = configuration.instanceUuid();
|
||||||
|
this.serviceBase = configuration.processName();
|
||||||
|
|
||||||
|
logger.info("Starting service {} instance {}", serviceName, instanceUuid);
|
||||||
|
|
||||||
|
logEvent("PCS-START", serviceName);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void logEvent(Class<?> type, String message) {
|
||||||
|
logEvent(type.getSimpleName(), message);
|
||||||
|
}
|
||||||
|
public void logEvent(String type, String message) {
|
||||||
|
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var stmt = conn.prepareStatement("""
|
||||||
|
INSERT INTO SERVICE_EVENTLOG(SERVICE_NAME, SERVICE_BASE, INSTANCE, EVENT_TYPE, EVENT_MESSAGE)
|
||||||
|
VALUES (?, ?, ?, ?, ?)
|
||||||
|
""")) {
|
||||||
|
stmt.setString(1, serviceName);
|
||||||
|
stmt.setString(2, serviceBase);
|
||||||
|
stmt.setString(3, instanceUuid.toString());
|
||||||
|
stmt.setString(4, type);
|
||||||
|
stmt.setString(5, Objects.requireNonNull(message, ""));
|
||||||
|
|
||||||
|
stmt.executeUpdate();
|
||||||
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
logger.error("Failed to log event {}:{}", type, message);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@@ -10,7 +10,9 @@ import java.nio.charset.StandardCharsets;
|
|||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.time.LocalDateTime;
|
import java.time.LocalDateTime;
|
||||||
import java.util.*;
|
import java.util.HashSet;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.Set;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
|
|
||||||
/** WorkLog is a journal of work done by a process,
|
/** WorkLog is a journal of work done by a process,
|
||||||
@@ -61,6 +63,12 @@ public class WorkLog implements AutoCloseable, Closeable {
|
|||||||
return new WorkLoadIterable<>(logFile, mapper);
|
return new WorkLoadIterable<>(logFile, mapper);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static int countEntries(Path crawlerLog) throws IOException{
|
||||||
|
try (var linesStream = Files.lines(crawlerLog)) {
|
||||||
|
return (int) linesStream.filter(WorkLogEntry::isJobId).count();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Use synchro over concurrent set to avoid competing writes
|
// Use synchro over concurrent set to avoid competing writes
|
||||||
// - correct is better than fast here, it's sketchy enough to use
|
// - correct is better than fast here, it's sketchy enough to use
|
||||||
// a PrintWriter
|
// a PrintWriter
|
||||||
|
@@ -57,16 +57,13 @@ public class ServiceAdHocTaskHeartbeatImpl implements AutoCloseable, ServiceAdHo
|
|||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public void progress(String step, int stepProgress, int stepCount) {
|
public void progress(String step, int stepProgress, int stepCount) {
|
||||||
|
int lastProgress = this.progress;
|
||||||
this.step = step;
|
this.step = step;
|
||||||
|
|
||||||
|
|
||||||
// off by one since we calculate the progress based on the number of steps,
|
|
||||||
// and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
|
|
||||||
// final progress being 80% and not 100%)
|
|
||||||
|
|
||||||
this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);
|
this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);
|
||||||
|
|
||||||
logger.info("ServiceTask {} progress: {}%", taskBase, progress);
|
if (this.progress / 10 != lastProgress / 10) {
|
||||||
|
logger.info("ProcessTask {} progress: {}%", taskBase, progress);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void shutDown() {
|
public void shutDown() {
|
||||||
|
@@ -1,17 +1,21 @@
|
|||||||
package nu.marginalia.service.discovery;
|
package nu.marginalia.service.discovery;
|
||||||
|
|
||||||
import nu.marginalia.service.discovery.monitor.*;
|
import com.google.inject.ImplementedBy;
|
||||||
|
import nu.marginalia.service.discovery.monitor.ServiceChangeMonitor;
|
||||||
|
import nu.marginalia.service.discovery.monitor.ServiceMonitorIf;
|
||||||
import nu.marginalia.service.discovery.property.ServiceEndpoint;
|
import nu.marginalia.service.discovery.property.ServiceEndpoint;
|
||||||
import static nu.marginalia.service.discovery.property.ServiceEndpoint.*;
|
|
||||||
|
|
||||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||||
|
import org.apache.curator.framework.recipes.locks.InterProcessSemaphoreV2;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
|
|
||||||
|
import static nu.marginalia.service.discovery.property.ServiceEndpoint.InstanceAddress;
|
||||||
|
|
||||||
/** A service registry that allows services to register themselves and
|
/** A service registry that allows services to register themselves and
|
||||||
* be discovered by other services on the network.
|
* be discovered by other services on the network.
|
||||||
*/
|
*/
|
||||||
|
@ImplementedBy(ZkServiceRegistry.class)
|
||||||
public interface ServiceRegistryIf {
|
public interface ServiceRegistryIf {
|
||||||
/**
|
/**
|
||||||
* Register a service with the registry.
|
* Register a service with the registry.
|
||||||
@@ -57,4 +61,9 @@ public interface ServiceRegistryIf {
|
|||||||
* </ul>
|
* </ul>
|
||||||
* */
|
* */
|
||||||
void registerMonitor(ServiceMonitorIf monitor) throws Exception;
|
void registerMonitor(ServiceMonitorIf monitor) throws Exception;
|
||||||
|
|
||||||
|
void registerProcess(String processName, int nodeId);
|
||||||
|
void deregisterProcess(String processName, int nodeId);
|
||||||
|
|
||||||
|
InterProcessSemaphoreV2 getSemaphore(String name, int permits) throws Exception;
|
||||||
}
|
}
|
||||||
|
@@ -6,6 +6,7 @@ import nu.marginalia.service.discovery.monitor.ServiceMonitorIf;
|
|||||||
import nu.marginalia.service.discovery.property.ServiceEndpoint;
|
import nu.marginalia.service.discovery.property.ServiceEndpoint;
|
||||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||||
import org.apache.curator.framework.CuratorFramework;
|
import org.apache.curator.framework.CuratorFramework;
|
||||||
|
import org.apache.curator.framework.recipes.locks.InterProcessSemaphoreV2;
|
||||||
import org.apache.curator.utils.ZKPaths;
|
import org.apache.curator.utils.ZKPaths;
|
||||||
import org.apache.zookeeper.CreateMode;
|
import org.apache.zookeeper.CreateMode;
|
||||||
import org.apache.zookeeper.Watcher;
|
import org.apache.zookeeper.Watcher;
|
||||||
@@ -256,6 +257,42 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
|
|||||||
.forPath("/running-instances");
|
.forPath("/running-instances");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void registerProcess(String processName, int nodeId) {
|
||||||
|
String path = "/process-locks/" + processName + "/" + nodeId;
|
||||||
|
try {
|
||||||
|
curatorFramework.create()
|
||||||
|
.creatingParentsIfNeeded()
|
||||||
|
.withMode(CreateMode.EPHEMERAL)
|
||||||
|
.forPath(path);
|
||||||
|
livenessPaths.add(path);
|
||||||
|
}
|
||||||
|
catch (Exception ex) {
|
||||||
|
logger.error("Failed to register process {} on node {}", processName, nodeId, ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void deregisterProcess(String processName, int nodeId) {
|
||||||
|
String path = "/process-locks/" + processName + "/" + nodeId;
|
||||||
|
try {
|
||||||
|
curatorFramework.delete().forPath(path);
|
||||||
|
livenessPaths.remove(path);
|
||||||
|
}
|
||||||
|
catch (Exception ex) {
|
||||||
|
logger.error("Failed to deregister process {} on node {}", processName, nodeId, ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public InterProcessSemaphoreV2 getSemaphore(String name, int permits) {
|
||||||
|
if (stopped)
|
||||||
|
throw new IllegalStateException("Service registry is stopped, cannot get semaphore " + name);
|
||||||
|
|
||||||
|
String path = "/semaphores/" + name;
|
||||||
|
return new InterProcessSemaphoreV2(curatorFramework, path, permits);
|
||||||
|
}
|
||||||
|
|
||||||
/* Exposed for tests */
|
/* Exposed for tests */
|
||||||
public synchronized void shutDown() {
|
public synchronized void shutDown() {
|
||||||
if (stopped)
|
if (stopped)
|
||||||
|
@@ -89,7 +89,7 @@ public class DatabaseModule extends AbstractModule {
|
|||||||
config.addDataSourceProperty("prepStmtCacheSize", "250");
|
config.addDataSourceProperty("prepStmtCacheSize", "250");
|
||||||
config.addDataSourceProperty("prepStmtCacheSqlLimit", "2048");
|
config.addDataSourceProperty("prepStmtCacheSqlLimit", "2048");
|
||||||
|
|
||||||
config.setMaximumPoolSize(5);
|
config.setMaximumPoolSize(Integer.getInteger("db.poolSize", 5));
|
||||||
config.setMinimumIdle(2);
|
config.setMinimumIdle(2);
|
||||||
|
|
||||||
config.setMaxLifetime(Duration.ofMinutes(9).toMillis());
|
config.setMaxLifetime(Duration.ofMinutes(9).toMillis());
|
||||||
|
@@ -6,6 +6,7 @@ import nu.marginalia.service.ServiceId;
|
|||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
import java.net.InetAddress;
|
import java.net.InetAddress;
|
||||||
import java.net.NetworkInterface;
|
import java.net.NetworkInterface;
|
||||||
import java.util.Enumeration;
|
import java.util.Enumeration;
|
||||||
@@ -115,11 +116,12 @@ public class ServiceConfigurationModule extends AbstractModule {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String getLocalNetworkIP() throws Exception {
|
public static String getLocalNetworkIP() throws IOException {
|
||||||
Enumeration<NetworkInterface> nets = NetworkInterface.getNetworkInterfaces();
|
Enumeration<NetworkInterface> nets = NetworkInterface.getNetworkInterfaces();
|
||||||
|
|
||||||
while (nets.hasMoreElements()) {
|
while (nets.hasMoreElements()) {
|
||||||
NetworkInterface netif = nets.nextElement();
|
NetworkInterface netif = nets.nextElement();
|
||||||
|
logger.info("Considering network interface {}: Up? {}, Loopback? {}", netif.getDisplayName(), netif.isUp(), netif.isLoopback());
|
||||||
if (!netif.isUp() || netif.isLoopback()) {
|
if (!netif.isUp() || netif.isLoopback()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -127,6 +129,7 @@ public class ServiceConfigurationModule extends AbstractModule {
|
|||||||
Enumeration<InetAddress> inetAddresses = netif.getInetAddresses();
|
Enumeration<InetAddress> inetAddresses = netif.getInetAddresses();
|
||||||
while (inetAddresses.hasMoreElements()) {
|
while (inetAddresses.hasMoreElements()) {
|
||||||
InetAddress addr = inetAddresses.nextElement();
|
InetAddress addr = inetAddresses.nextElement();
|
||||||
|
logger.info("Considering address {}: SiteLocal? {}, Loopback? {}", addr.getHostAddress(), addr.isSiteLocalAddress(), addr.isLoopbackAddress());
|
||||||
if (addr.isSiteLocalAddress() && !addr.isLoopbackAddress()) {
|
if (addr.isSiteLocalAddress() && !addr.isLoopbackAddress()) {
|
||||||
return addr.getHostAddress();
|
return addr.getHostAddress();
|
||||||
}
|
}
|
||||||
|
@@ -15,6 +15,7 @@ import org.slf4j.LoggerFactory;
|
|||||||
import org.slf4j.Marker;
|
import org.slf4j.Marker;
|
||||||
import org.slf4j.MarkerFactory;
|
import org.slf4j.MarkerFactory;
|
||||||
|
|
||||||
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.nio.file.Paths;
|
import java.nio.file.Paths;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@@ -106,9 +107,12 @@ public class JoobyService {
|
|||||||
config.externalAddress());
|
config.externalAddress());
|
||||||
|
|
||||||
// FIXME: This won't work outside of docker, may need to submit a PR to jooby to allow classpaths here
|
// FIXME: This won't work outside of docker, may need to submit a PR to jooby to allow classpaths here
|
||||||
jooby.install(new JteModule(Path.of("/app/resources/jte"), Path.of("/app/classes/jte-precompiled")));
|
if (Files.exists(Path.of("/app/resources/jte")) || Files.exists(Path.of("/app/classes/jte-precompiled"))) {
|
||||||
jooby.assets("/*", Paths.get("/app/resources/static"));
|
jooby.install(new JteModule(Path.of("/app/resources/jte"), Path.of("/app/classes/jte-precompiled")));
|
||||||
|
}
|
||||||
|
if (Files.exists(Path.of("/app/resources/static"))) {
|
||||||
|
jooby.assets("/*", Paths.get("/app/resources/static"));
|
||||||
|
}
|
||||||
var options = new ServerOptions();
|
var options = new ServerOptions();
|
||||||
options.setHost(config.bindAddress());
|
options.setHost(config.bindAddress());
|
||||||
options.setPort(restEndpoint.port());
|
options.setPort(restEndpoint.port());
|
||||||
@@ -118,6 +122,11 @@ public class JoobyService {
|
|||||||
// single digit percentage difference since HTML already compresses very well with level = 1.
|
// single digit percentage difference since HTML already compresses very well with level = 1.
|
||||||
options.setCompressionLevel(1);
|
options.setCompressionLevel(1);
|
||||||
|
|
||||||
|
// Set a cap on the number of worker threads, as Jooby's default value does not seem to consider
|
||||||
|
// multi-tenant servers with high thread counts, and spins up an exorbitant number of threads in that
|
||||||
|
// scenario
|
||||||
|
options.setWorkerThreads(Math.min(128, options.getWorkerThreads()));
|
||||||
|
|
||||||
|
|
||||||
jooby.setServerOptions(options);
|
jooby.setServerOptions(options);
|
||||||
|
|
||||||
|
@@ -6,25 +6,36 @@ import nu.marginalia.service.module.ServiceConfiguration;
|
|||||||
import org.eclipse.jetty.server.Server;
|
import org.eclipse.jetty.server.Server;
|
||||||
import org.eclipse.jetty.servlet.ServletContextHandler;
|
import org.eclipse.jetty.servlet.ServletContextHandler;
|
||||||
import org.eclipse.jetty.servlet.ServletHolder;
|
import org.eclipse.jetty.servlet.ServletHolder;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.net.InetSocketAddress;
|
import java.net.InetSocketAddress;
|
||||||
|
|
||||||
public class MetricsServer {
|
public class MetricsServer {
|
||||||
|
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(MetricsServer.class);
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public MetricsServer(ServiceConfiguration configuration) throws Exception {
|
public MetricsServer(ServiceConfiguration configuration) {
|
||||||
// If less than zero, we forego setting up a metrics server
|
// If less than zero, we forego setting up a metrics server
|
||||||
if (configuration.metricsPort() < 0)
|
if (configuration.metricsPort() < 0)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
Server server = new Server(new InetSocketAddress(configuration.bindAddress(), configuration.metricsPort()));
|
try {
|
||||||
|
Server server = new Server(new InetSocketAddress(configuration.bindAddress(), configuration.metricsPort()));
|
||||||
|
|
||||||
ServletContextHandler context = new ServletContextHandler();
|
ServletContextHandler context = new ServletContextHandler();
|
||||||
context.setContextPath("/");
|
context.setContextPath("/");
|
||||||
server.setHandler(context);
|
server.setHandler(context);
|
||||||
|
|
||||||
context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");
|
context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");
|
||||||
|
|
||||||
server.start();
|
logger.info("MetricsServer listening on {}:{}", configuration.bindAddress(), configuration.metricsPort());
|
||||||
|
|
||||||
|
server.start();
|
||||||
|
}
|
||||||
|
catch (Exception|NoSuchMethodError ex) {
|
||||||
|
logger.error("Failed to set up metrics server", ex);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -35,21 +35,8 @@ public class RateLimiter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static RateLimiter forExpensiveRequest() {
|
|
||||||
return new RateLimiter(5, 10);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static RateLimiter custom(int perMinute) {
|
public static RateLimiter custom(int perMinute) {
|
||||||
return new RateLimiter(perMinute, 60);
|
return new RateLimiter(4 * perMinute, perMinute);
|
||||||
}
|
|
||||||
|
|
||||||
public static RateLimiter forSpamBots() {
|
|
||||||
return new RateLimiter(120, 3600);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public static RateLimiter forLogin() {
|
|
||||||
return new RateLimiter(3, 15);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void cleanIdleBuckets() {
|
private void cleanIdleBuckets() {
|
||||||
@@ -62,7 +49,7 @@ public class RateLimiter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private Bucket createBucket() {
|
private Bucket createBucket() {
|
||||||
var refill = Refill.greedy(1, Duration.ofSeconds(refillRate));
|
var refill = Refill.greedy(refillRate, Duration.ofSeconds(60));
|
||||||
var bw = Bandwidth.classic(capacity, refill);
|
var bw = Bandwidth.classic(capacity, refill);
|
||||||
return Bucket.builder().addLimit(bw).build();
|
return Bucket.builder().addLimit(bw).build();
|
||||||
}
|
}
|
||||||
|
@@ -3,8 +3,17 @@
|
|||||||
<Console name="Console" target="SYSTEM_OUT">
|
<Console name="Console" target="SYSTEM_OUT">
|
||||||
<PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
|
<PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
|
||||||
<Filters>
|
<Filters>
|
||||||
|
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
</Filters>
|
||||||
|
</Console>
|
||||||
|
<Console name="ProcessConsole" target="SYSTEM_OUT">
|
||||||
|
<PatternLayout pattern="%style{P}{FG_Cyan} %msg%n"/>
|
||||||
|
<Filters>
|
||||||
|
<MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
|
||||||
</Filters>
|
</Filters>
|
||||||
</Console>
|
</Console>
|
||||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||||
@@ -13,15 +22,40 @@
|
|||||||
<Filters>
|
<Filters>
|
||||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
</Filters>
|
</Filters>
|
||||||
<SizeBasedTriggeringPolicy size="10MB" />
|
<SizeBasedTriggeringPolicy size="10MB" />
|
||||||
</RollingFile>
|
</RollingFile>
|
||||||
|
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||||
|
ignoreExceptions="false">
|
||||||
|
<PatternLayout>
|
||||||
|
<Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
|
||||||
|
</PatternLayout>
|
||||||
|
<SizeBasedTriggeringPolicy size="100MB" />
|
||||||
|
<Filters>
|
||||||
|
<MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
|
||||||
|
</Filters>
|
||||||
|
</RollingFile>
|
||||||
|
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/converter-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/converter-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||||
|
ignoreExceptions="false">
|
||||||
|
<PatternLayout>
|
||||||
|
<Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
|
||||||
|
</PatternLayout>
|
||||||
|
<SizeBasedTriggeringPolicy size="100MB" />
|
||||||
|
<Filters>
|
||||||
|
<MarkerFilter marker="CONVERTER" onMatch="ALLOW" onMismatch="DENY" />
|
||||||
|
</Filters>
|
||||||
|
</RollingFile>
|
||||||
</Appenders>
|
</Appenders>
|
||||||
<Loggers>
|
<Loggers>
|
||||||
<Logger name="org.apache.zookeeper" level="WARN" />
|
<Logger name="org.apache.zookeeper" level="WARN" />
|
||||||
|
<Logger name="org.apache.pdfbox" level="ERROR" />
|
||||||
|
<Logger name="org.apache.fontbox.ttf" level="ERROR" />
|
||||||
<Root level="info">
|
<Root level="info">
|
||||||
<AppenderRef ref="Console"/>
|
<AppenderRef ref="Console"/>
|
||||||
|
<AppenderRef ref="ProcessConsole"/>
|
||||||
<AppenderRef ref="LogToFile"/>
|
<AppenderRef ref="LogToFile"/>
|
||||||
</Root>
|
</Root>
|
||||||
</Loggers>
|
</Loggers>
|
||||||
|
@@ -1,10 +1,53 @@
|
|||||||
<Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
|
<Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
|
||||||
<Appenders>
|
<Appenders>
|
||||||
<Console name="Console" target="SYSTEM_OUT">
|
<Console name="ConsoleInfo" target="SYSTEM_OUT">
|
||||||
<PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
|
<PatternLayout pattern="- %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||||
<Filters>
|
<Filters>
|
||||||
|
<LevelMatchFilter level="INFO" onMatch="ALLOW" onMismatch="DENY"/>
|
||||||
|
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
</Filters>
|
||||||
|
</Console>
|
||||||
|
<Console name="ConsoleWarn" target="SYSTEM_OUT">
|
||||||
|
<PatternLayout pattern="⚠ %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||||
|
<Filters>
|
||||||
|
<LevelMatchFilter level="WARN" onMatch="ALLOW" onMismatch="DENY"/>
|
||||||
|
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
</Filters>
|
||||||
|
</Console>
|
||||||
|
<Console name="ConsoleError" target="SYSTEM_OUT">
|
||||||
|
<PatternLayout pattern="🔥 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||||
|
<Filters>
|
||||||
|
<LevelMatchFilter level="ERROR" onMatch="ALLOW" onMismatch="DENY"/>
|
||||||
|
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
</Filters>
|
||||||
|
</Console>
|
||||||
|
<Console name="ConsoleFatal" target="SYSTEM_OUT">
|
||||||
|
<PatternLayout pattern="💀 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||||
|
<Filters>
|
||||||
|
<LevelMatchFilter level="FATAL" onMatch="ALLOW" onMismatch="DENY"/>
|
||||||
|
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
</Filters>
|
||||||
|
</Console>
|
||||||
|
<Console name="ProcessConsole" target="SYSTEM_OUT">
|
||||||
|
<PatternLayout pattern="%style{%msg%n}{FG_Cyan}"/>
|
||||||
|
<Filters>
|
||||||
|
<MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
|
||||||
</Filters>
|
</Filters>
|
||||||
</Console>
|
</Console>
|
||||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||||
@@ -17,14 +60,41 @@
|
|||||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
</Filters>
|
||||||
|
</RollingFile>
|
||||||
|
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||||
|
ignoreExceptions="false">
|
||||||
|
<PatternLayout>
|
||||||
|
<Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
|
||||||
|
</PatternLayout>
|
||||||
|
<SizeBasedTriggeringPolicy size="100MB" />
|
||||||
|
<Filters>
|
||||||
|
<MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
|
||||||
|
</Filters>
|
||||||
|
</RollingFile>
|
||||||
|
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/converter-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/converter-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||||
|
ignoreExceptions="false">
|
||||||
|
<PatternLayout>
|
||||||
|
<Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
|
||||||
|
</PatternLayout>
|
||||||
|
<SizeBasedTriggeringPolicy size="100MB" />
|
||||||
|
<Filters>
|
||||||
|
<MarkerFilter marker="CONVERTER" onMatch="ALLOW" onMismatch="DENY" />
|
||||||
</Filters>
|
</Filters>
|
||||||
</RollingFile>
|
</RollingFile>
|
||||||
</Appenders>
|
</Appenders>
|
||||||
<Loggers>
|
<Loggers>
|
||||||
<Logger name="org.apache.zookeeper" level="WARN" />
|
<Logger name="org.apache.zookeeper" level="WARN" />
|
||||||
|
<Logger name="org.apache.pdfbox" level="ERROR" />
|
||||||
|
<Logger name="org.apache.fontbox.ttf" level="ERROR" />
|
||||||
<Root level="info">
|
<Root level="info">
|
||||||
<AppenderRef ref="Console"/>
|
<AppenderRef ref="ConsoleInfo"/>
|
||||||
|
<AppenderRef ref="ConsoleWarn"/>
|
||||||
|
<AppenderRef ref="ConsoleError"/>
|
||||||
|
<AppenderRef ref="ConsoleFatal"/>
|
||||||
|
<AppenderRef ref="ProcessConsole"/>
|
||||||
<AppenderRef ref="LogToFile"/>
|
<AppenderRef ref="LogToFile"/>
|
||||||
</Root>
|
</Root>
|
||||||
</Loggers>
|
</Loggers>
|
||||||
|
@@ -1,15 +1,50 @@
|
|||||||
<Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
|
<Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
|
||||||
<Appenders>
|
<Appenders>
|
||||||
<Console name="Console" target="SYSTEM_OUT">
|
<Console name="ConsoleInfo" target="SYSTEM_OUT">
|
||||||
<PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
|
<PatternLayout pattern="- %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||||
|
<Filters>
|
||||||
|
<LevelMatchFilter level="INFO" onMatch="ALLOW" onMismatch="DENY"/>
|
||||||
|
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
</Filters>
|
||||||
|
</Console>
|
||||||
|
<Console name="ConsoleWarn" target="SYSTEM_OUT">
|
||||||
|
<PatternLayout pattern="⚠ %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||||
|
<Filters>
|
||||||
|
<LevelMatchFilter level="WARN" onMatch="ALLOW" onMismatch="DENY"/>
|
||||||
|
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
</Filters>
|
||||||
|
</Console>
|
||||||
|
<Console name="ConsoleError" target="SYSTEM_OUT">
|
||||||
|
<PatternLayout pattern="🔥 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||||
|
<Filters>
|
||||||
|
<LevelMatchFilter level="ERROR" onMatch="ALLOW" onMismatch="DENY"/>
|
||||||
|
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
</Filters>
|
||||||
|
</Console>
|
||||||
|
<Console name="ConsoleFatal" target="SYSTEM_OUT">
|
||||||
|
<PatternLayout pattern="💀 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||||
|
<Filters>
|
||||||
|
<LevelMatchFilter level="FATAL" onMatch="ALLOW" onMismatch="DENY"/>
|
||||||
|
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
</Filters>
|
||||||
|
</Console>
|
||||||
|
<Console name="ProcessConsole" target="SYSTEM_OUT">
|
||||||
|
<PatternLayout pattern="%style{%msg%n}{FG_Cyan}"/>
|
||||||
|
<Filters>
|
||||||
|
<MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
|
||||||
|
</Filters>
|
||||||
</Console>
|
</Console>
|
||||||
</Appenders>
|
</Appenders>
|
||||||
<Loggers>
|
<Loggers>
|
||||||
<Logger name="org.apache.zookeeper" level="WARN" />
|
<Logger name="org.apache.zookeeper" level="WARN" />
|
||||||
|
<Logger name="org.apache.pdfbox" level="ERROR" />
|
||||||
|
<Logger name="org.apache.fontbox.ttf" level="ERROR" />
|
||||||
<Root level="info">
|
<Root level="info">
|
||||||
<AppenderRef ref="Console"/>
|
<AppenderRef ref="ConsoleInfo"/>
|
||||||
<AppenderRef ref="LogToFile"/>
|
<AppenderRef ref="ConsoleWarn"/>
|
||||||
|
<AppenderRef ref="ConsoleError"/>
|
||||||
|
<AppenderRef ref="ConsoleFatal"/>
|
||||||
|
<AppenderRef ref="ProcessConsole"/>
|
||||||
</Root>
|
</Root>
|
||||||
</Loggers>
|
</Loggers>
|
||||||
</Configuration>
|
</Configuration>
|
@@ -25,7 +25,7 @@ import static org.mockito.Mockito.when;
|
|||||||
class ZkServiceRegistryTest {
|
class ZkServiceRegistryTest {
|
||||||
private static final int ZOOKEEPER_PORT = 2181;
|
private static final int ZOOKEEPER_PORT = 2181;
|
||||||
private static final GenericContainer<?> zookeeper =
|
private static final GenericContainer<?> zookeeper =
|
||||||
new GenericContainer<>("zookeeper:3.8.0")
|
new GenericContainer<>("zookeeper:3.8")
|
||||||
.withExposedPorts(ZOOKEEPER_PORT);
|
.withExposedPorts(ZOOKEEPER_PORT);
|
||||||
|
|
||||||
List<ZkServiceRegistry> registries = new ArrayList<>();
|
List<ZkServiceRegistry> registries = new ArrayList<>();
|
||||||
|
@@ -9,6 +9,7 @@ import nu.marginalia.executor.storage.FileStorageFile;
|
|||||||
import nu.marginalia.executor.upload.UploadDirContents;
|
import nu.marginalia.executor.upload.UploadDirContents;
|
||||||
import nu.marginalia.executor.upload.UploadDirItem;
|
import nu.marginalia.executor.upload.UploadDirItem;
|
||||||
import nu.marginalia.functions.execution.api.*;
|
import nu.marginalia.functions.execution.api.*;
|
||||||
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
import nu.marginalia.service.ServiceId;
|
import nu.marginalia.service.ServiceId;
|
||||||
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||||
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
|
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
|
||||||
@@ -25,27 +26,37 @@ import java.net.URISyntaxException;
|
|||||||
import java.net.URL;
|
import java.net.URL;
|
||||||
import java.net.URLEncoder;
|
import java.net.URLEncoder;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.time.Duration;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import static nu.marginalia.functions.execution.api.ExecutorApiGrpc.ExecutorApiBlockingStub;
|
import static nu.marginalia.functions.execution.api.ExecutorApiGrpc.ExecutorApiBlockingStub;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
public class ExecutorClient {
|
public class ExecutorClient {
|
||||||
|
private final MqPersistence persistence;
|
||||||
private final GrpcMultiNodeChannelPool<ExecutorApiBlockingStub> channelPool;
|
private final GrpcMultiNodeChannelPool<ExecutorApiBlockingStub> channelPool;
|
||||||
private static final Logger logger = LoggerFactory.getLogger(ExecutorClient.class);
|
private static final Logger logger = LoggerFactory.getLogger(ExecutorClient.class);
|
||||||
private final ServiceRegistryIf registry;
|
private final ServiceRegistryIf registry;
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public ExecutorClient(ServiceRegistryIf registry,
|
public ExecutorClient(ServiceRegistryIf registry,
|
||||||
|
MqPersistence persistence,
|
||||||
GrpcChannelPoolFactory grpcChannelPoolFactory)
|
GrpcChannelPoolFactory grpcChannelPoolFactory)
|
||||||
{
|
{
|
||||||
this.registry = registry;
|
this.registry = registry;
|
||||||
|
this.persistence = persistence;
|
||||||
this.channelPool = grpcChannelPoolFactory
|
this.channelPool = grpcChannelPoolFactory
|
||||||
.createMulti(
|
.createMulti(
|
||||||
ServiceKey.forGrpcApi(ExecutorApiGrpc.class, ServicePartition.multi()),
|
ServiceKey.forGrpcApi(ExecutorApiGrpc.class, ServicePartition.multi()),
|
||||||
ExecutorApiGrpc::newBlockingStub);
|
ExecutorApiGrpc::newBlockingStub);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private long createTrackingTokenMsg(String task, int node, Duration ttl) throws Exception {
|
||||||
|
return persistence.sendNewMessage("task-tracking[" + node + "]", "export-client", null, task, "", ttl);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
public void startFsm(int node, String actorName) {
|
public void startFsm(int node, String actorName) {
|
||||||
channelPool.call(ExecutorApiBlockingStub::startFsm)
|
channelPool.call(ExecutorApiBlockingStub::startFsm)
|
||||||
.forNode(node)
|
.forNode(node)
|
||||||
@@ -96,6 +107,16 @@ public class ExecutorClient {
|
|||||||
.build());
|
.build());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public long updateNsfwFilters() throws Exception {
|
||||||
|
long msgId = createTrackingTokenMsg("nsfw-filters", 1, Duration.ofHours(6));
|
||||||
|
|
||||||
|
channelPool.call(ExecutorApiBlockingStub::updateNsfwFilters)
|
||||||
|
.forNode(1)
|
||||||
|
.run(RpcUpdateNsfwFilters.newBuilder().setMsgId(msgId).build());
|
||||||
|
|
||||||
|
return msgId;
|
||||||
|
}
|
||||||
|
|
||||||
public ActorRunStates getActorStates(int node) {
|
public ActorRunStates getActorStates(int node) {
|
||||||
try {
|
try {
|
||||||
var rs = channelPool.call(ExecutorApiBlockingStub::getActorStates)
|
var rs = channelPool.call(ExecutorApiBlockingStub::getActorStates)
|
||||||
|
@@ -48,12 +48,13 @@ public class ExecutorExportClient {
|
|||||||
return msgId;
|
return msgId;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void exportSampleData(int node, FileStorageId fid, int size, String name) {
|
public void exportSampleData(int node, FileStorageId fid, int size, String ctFilter, String name) {
|
||||||
channelPool.call(ExecutorExportApiBlockingStub::exportSampleData)
|
channelPool.call(ExecutorExportApiBlockingStub::exportSampleData)
|
||||||
.forNode(node)
|
.forNode(node)
|
||||||
.run(RpcExportSampleData.newBuilder()
|
.run(RpcExportSampleData.newBuilder()
|
||||||
.setFileStorageId(fid.id())
|
.setFileStorageId(fid.id())
|
||||||
.setSize(size)
|
.setSize(size)
|
||||||
|
.setCtFilter(ctFilter)
|
||||||
.setName(name)
|
.setName(name)
|
||||||
.build());
|
.build());
|
||||||
}
|
}
|
||||||
|
@@ -18,6 +18,8 @@ service ExecutorApi {
|
|||||||
rpc calculateAdjacencies(Empty) returns (Empty) {}
|
rpc calculateAdjacencies(Empty) returns (Empty) {}
|
||||||
rpc restoreBackup(RpcFileStorageId) returns (Empty) {}
|
rpc restoreBackup(RpcFileStorageId) returns (Empty) {}
|
||||||
|
|
||||||
|
rpc updateNsfwFilters(RpcUpdateNsfwFilters) returns (Empty) {}
|
||||||
|
|
||||||
rpc restartExecutorService(Empty) returns (Empty) {}
|
rpc restartExecutorService(Empty) returns (Empty) {}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -66,6 +68,9 @@ message RpcExportRequest {
|
|||||||
int64 fileStorageId = 1;
|
int64 fileStorageId = 1;
|
||||||
int64 msgId = 2;
|
int64 msgId = 2;
|
||||||
}
|
}
|
||||||
|
message RpcUpdateNsfwFilters {
|
||||||
|
int64 msgId = 1;
|
||||||
|
}
|
||||||
message RpcFileStorageIdWithDomainName {
|
message RpcFileStorageIdWithDomainName {
|
||||||
int64 fileStorageId = 1;
|
int64 fileStorageId = 1;
|
||||||
string targetDomainName = 2;
|
string targetDomainName = 2;
|
||||||
@@ -100,6 +105,7 @@ message RpcExportSampleData {
|
|||||||
int64 fileStorageId = 1;
|
int64 fileStorageId = 1;
|
||||||
int32 size = 2;
|
int32 size = 2;
|
||||||
string name = 3;
|
string name = 3;
|
||||||
|
string ctFilter = 4;
|
||||||
}
|
}
|
||||||
message RpcDownloadSampleData {
|
message RpcDownloadSampleData {
|
||||||
string sampleSet = 1;
|
string sampleSet = 1;
|
||||||
|
@@ -19,6 +19,8 @@ dependencies {
|
|||||||
implementation project(':code:processes:crawling-process')
|
implementation project(':code:processes:crawling-process')
|
||||||
implementation project(':code:processes:live-crawling-process')
|
implementation project(':code:processes:live-crawling-process')
|
||||||
implementation project(':code:processes:loading-process')
|
implementation project(':code:processes:loading-process')
|
||||||
|
implementation project(':code:processes:ping-process')
|
||||||
|
implementation project(':code:processes:new-domain-process')
|
||||||
implementation project(':code:processes:converting-process')
|
implementation project(':code:processes:converting-process')
|
||||||
implementation project(':code:processes:index-constructor-process')
|
implementation project(':code:processes:index-constructor-process')
|
||||||
|
|
||||||
@@ -37,9 +39,9 @@ dependencies {
|
|||||||
implementation project(':code:functions:link-graph:api')
|
implementation project(':code:functions:link-graph:api')
|
||||||
implementation project(':code:functions:live-capture:api')
|
implementation project(':code:functions:live-capture:api')
|
||||||
implementation project(':code:functions:search-query')
|
implementation project(':code:functions:search-query')
|
||||||
|
implementation project(':code:functions:nsfw-domain-filter')
|
||||||
implementation project(':code:execution:api')
|
implementation project(':code:execution:api')
|
||||||
|
|
||||||
implementation project(':code:processes:crawling-process:model')
|
|
||||||
implementation project(':code:processes:crawling-process:model')
|
implementation project(':code:processes:crawling-process:model')
|
||||||
implementation project(':code:processes:crawling-process:ft-link-parser')
|
implementation project(':code:processes:crawling-process:ft-link-parser')
|
||||||
implementation project(':code:index:index-journal')
|
implementation project(':code:index:index-journal')
|
||||||
|
@@ -2,10 +2,11 @@ package nu.marginalia.actor;
|
|||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.functions.execution.api.*;
|
import nu.marginalia.functions.execution.api.RpcFsmName;
|
||||||
|
import nu.marginalia.functions.execution.api.RpcProcessId;
|
||||||
import nu.marginalia.mq.MqMessageState;
|
import nu.marginalia.mq.MqMessageState;
|
||||||
import nu.marginalia.mq.persistence.MqPersistence;
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
@@ -14,18 +15,18 @@ import spark.Spark;
|
|||||||
@Singleton
|
@Singleton
|
||||||
public class ActorApi {
|
public class ActorApi {
|
||||||
private final ExecutorActorControlService actors;
|
private final ExecutorActorControlService actors;
|
||||||
private final ProcessService processService;
|
private final ProcessSpawnerService processSpawnerService;
|
||||||
private final MqPersistence mqPersistence;
|
private final MqPersistence mqPersistence;
|
||||||
private final ServiceConfiguration serviceConfiguration;
|
private final ServiceConfiguration serviceConfiguration;
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
@Inject
|
@Inject
|
||||||
public ActorApi(ExecutorActorControlService actors,
|
public ActorApi(ExecutorActorControlService actors,
|
||||||
ProcessService processService,
|
ProcessSpawnerService processSpawnerService,
|
||||||
MqPersistence mqPersistence,
|
MqPersistence mqPersistence,
|
||||||
ServiceConfiguration serviceConfiguration)
|
ServiceConfiguration serviceConfiguration)
|
||||||
{
|
{
|
||||||
this.actors = actors;
|
this.actors = actors;
|
||||||
this.processService = processService;
|
this.processSpawnerService = processSpawnerService;
|
||||||
this.mqPersistence = mqPersistence;
|
this.mqPersistence = mqPersistence;
|
||||||
this.serviceConfiguration = serviceConfiguration;
|
this.serviceConfiguration = serviceConfiguration;
|
||||||
}
|
}
|
||||||
@@ -43,7 +44,7 @@ public class ActorApi {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public Object stopProcess(RpcProcessId processId) {
|
public Object stopProcess(RpcProcessId processId) {
|
||||||
ProcessService.ProcessId id = ProcessService.translateExternalIdBase(processId.getProcessId());
|
ProcessSpawnerService.ProcessId id = ProcessSpawnerService.translateExternalIdBase(processId.getProcessId());
|
||||||
|
|
||||||
try {
|
try {
|
||||||
String inbox = id.name().toLowerCase() + ":" + serviceConfiguration.node();
|
String inbox = id.name().toLowerCase() + ":" + serviceConfiguration.node();
|
||||||
@@ -60,7 +61,7 @@ public class ActorApi {
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
processService.kill(id);
|
processSpawnerService.kill(id);
|
||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
logger.error("Failed to stop process {}", id, ex);
|
logger.error("Failed to stop process {}", id, ex);
|
||||||
|
@@ -6,12 +6,15 @@ import java.util.Set;
|
|||||||
|
|
||||||
public enum ExecutorActor {
|
public enum ExecutorActor {
|
||||||
PREC_EXPORT_ALL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
PREC_EXPORT_ALL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||||
|
UPDATE_NSFW_LISTS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD, NodeProfile.REALTIME),
|
||||||
|
|
||||||
CRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
CRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||||
RECRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
RECRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||||
RECRAWL_SINGLE_DOMAIN(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
RECRAWL_SINGLE_DOMAIN(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||||
PROC_CRAWLER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
PROC_CRAWLER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||||
|
PROC_PING_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.REALTIME),
|
||||||
PROC_EXPORT_TASKS_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
PROC_EXPORT_TASKS_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||||
|
PROC_NDP_SPAWNER(NodeProfile.MIXED, NodeProfile.REALTIME),
|
||||||
ADJACENCY_CALCULATION(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
ADJACENCY_CALCULATION(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||||
EXPORT_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
EXPORT_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||||
EXPORT_SEGMENTATION_MODEL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
EXPORT_SEGMENTATION_MODEL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||||
@@ -20,6 +23,7 @@ public enum ExecutorActor {
|
|||||||
EXPORT_FEEDS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
EXPORT_FEEDS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||||
EXPORT_SAMPLE_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
EXPORT_SAMPLE_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||||
DOWNLOAD_SAMPLE(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
DOWNLOAD_SAMPLE(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||||
|
MIGRATE_CRAWL_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||||
|
|
||||||
PROC_CONVERTER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD),
|
PROC_CONVERTER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD),
|
||||||
PROC_LOADER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD),
|
PROC_LOADER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD),
|
||||||
@@ -34,7 +38,8 @@ public enum ExecutorActor {
|
|||||||
LIVE_CRAWL(NodeProfile.REALTIME),
|
LIVE_CRAWL(NodeProfile.REALTIME),
|
||||||
PROC_LIVE_CRAWL_SPAWNER(NodeProfile.REALTIME),
|
PROC_LIVE_CRAWL_SPAWNER(NodeProfile.REALTIME),
|
||||||
SCRAPE_FEEDS(NodeProfile.REALTIME),
|
SCRAPE_FEEDS(NodeProfile.REALTIME),
|
||||||
UPDATE_RSS(NodeProfile.REALTIME);
|
UPDATE_RSS(NodeProfile.REALTIME)
|
||||||
|
;
|
||||||
|
|
||||||
public String id() {
|
public String id() {
|
||||||
return "fsm:" + name().toLowerCase();
|
return "fsm:" + name().toLowerCase();
|
||||||
|
@@ -49,6 +49,8 @@ public class ExecutorActorControlService {
|
|||||||
RecrawlSingleDomainActor recrawlSingleDomainActor,
|
RecrawlSingleDomainActor recrawlSingleDomainActor,
|
||||||
RestoreBackupActor restoreBackupActor,
|
RestoreBackupActor restoreBackupActor,
|
||||||
ConverterMonitorActor converterMonitorFSM,
|
ConverterMonitorActor converterMonitorFSM,
|
||||||
|
NdpMonitorActor ndpMonitorActor,
|
||||||
|
PingMonitorActor pingMonitorActor,
|
||||||
CrawlerMonitorActor crawlerMonitorActor,
|
CrawlerMonitorActor crawlerMonitorActor,
|
||||||
LiveCrawlerMonitorActor liveCrawlerMonitorActor,
|
LiveCrawlerMonitorActor liveCrawlerMonitorActor,
|
||||||
LoaderMonitorActor loaderMonitor,
|
LoaderMonitorActor loaderMonitor,
|
||||||
@@ -66,7 +68,9 @@ public class ExecutorActorControlService {
|
|||||||
DownloadSampleActor downloadSampleActor,
|
DownloadSampleActor downloadSampleActor,
|
||||||
ScrapeFeedsActor scrapeFeedsActor,
|
ScrapeFeedsActor scrapeFeedsActor,
|
||||||
ExecutorActorStateMachines stateMachines,
|
ExecutorActorStateMachines stateMachines,
|
||||||
|
MigrateCrawlDataActor migrateCrawlDataActor,
|
||||||
ExportAllPrecessionActor exportAllPrecessionActor,
|
ExportAllPrecessionActor exportAllPrecessionActor,
|
||||||
|
UpdateNsfwFiltersActor updateNsfwFiltersActor,
|
||||||
UpdateRssActor updateRssActor) throws SQLException {
|
UpdateRssActor updateRssActor) throws SQLException {
|
||||||
this.messageQueueFactory = messageQueueFactory;
|
this.messageQueueFactory = messageQueueFactory;
|
||||||
this.eventLog = baseServiceParams.eventLog;
|
this.eventLog = baseServiceParams.eventLog;
|
||||||
@@ -87,9 +91,10 @@ public class ExecutorActorControlService {
|
|||||||
register(ExecutorActor.PROC_CONVERTER_SPAWNER, converterMonitorFSM);
|
register(ExecutorActor.PROC_CONVERTER_SPAWNER, converterMonitorFSM);
|
||||||
register(ExecutorActor.PROC_LOADER_SPAWNER, loaderMonitor);
|
register(ExecutorActor.PROC_LOADER_SPAWNER, loaderMonitor);
|
||||||
register(ExecutorActor.PROC_CRAWLER_SPAWNER, crawlerMonitorActor);
|
register(ExecutorActor.PROC_CRAWLER_SPAWNER, crawlerMonitorActor);
|
||||||
|
register(ExecutorActor.PROC_PING_SPAWNER, pingMonitorActor);
|
||||||
register(ExecutorActor.PROC_LIVE_CRAWL_SPAWNER, liveCrawlerMonitorActor);
|
register(ExecutorActor.PROC_LIVE_CRAWL_SPAWNER, liveCrawlerMonitorActor);
|
||||||
register(ExecutorActor.PROC_EXPORT_TASKS_SPAWNER, exportTasksMonitorActor);
|
register(ExecutorActor.PROC_EXPORT_TASKS_SPAWNER, exportTasksMonitorActor);
|
||||||
|
register(ExecutorActor.PROC_NDP_SPAWNER, ndpMonitorActor);
|
||||||
register(ExecutorActor.MONITOR_PROCESS_LIVENESS, processMonitorFSM);
|
register(ExecutorActor.MONITOR_PROCESS_LIVENESS, processMonitorFSM);
|
||||||
register(ExecutorActor.MONITOR_FILE_STORAGE, fileStorageMonitorActor);
|
register(ExecutorActor.MONITOR_FILE_STORAGE, fileStorageMonitorActor);
|
||||||
|
|
||||||
@@ -107,6 +112,9 @@ public class ExecutorActorControlService {
|
|||||||
register(ExecutorActor.SCRAPE_FEEDS, scrapeFeedsActor);
|
register(ExecutorActor.SCRAPE_FEEDS, scrapeFeedsActor);
|
||||||
register(ExecutorActor.UPDATE_RSS, updateRssActor);
|
register(ExecutorActor.UPDATE_RSS, updateRssActor);
|
||||||
|
|
||||||
|
register(ExecutorActor.MIGRATE_CRAWL_DATA, migrateCrawlDataActor);
|
||||||
|
register(ExecutorActor.UPDATE_NSFW_LISTS, updateNsfwFiltersActor);
|
||||||
|
|
||||||
if (serviceConfiguration.node() == 1) {
|
if (serviceConfiguration.node() == 1) {
|
||||||
register(ExecutorActor.PREC_EXPORT_ALL, exportAllPrecessionActor);
|
register(ExecutorActor.PREC_EXPORT_ALL, exportAllPrecessionActor);
|
||||||
}
|
}
|
||||||
|
@@ -4,11 +4,14 @@ import com.google.gson.Gson;
|
|||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||||
import nu.marginalia.actor.state.*;
|
import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||||
import nu.marginalia.mq.persistence.MqMessageHandlerRegistry;
|
import nu.marginalia.actor.state.ActorStep;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.actor.state.Resume;
|
||||||
|
import nu.marginalia.actor.state.Terminal;
|
||||||
import nu.marginalia.mq.MqMessageState;
|
import nu.marginalia.mq.MqMessageState;
|
||||||
|
import nu.marginalia.mq.persistence.MqMessageHandlerRegistry;
|
||||||
import nu.marginalia.mq.persistence.MqPersistence;
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
@@ -24,13 +27,13 @@ import java.util.concurrent.atomic.AtomicBoolean;
|
|||||||
public class AbstractProcessSpawnerActor extends RecordActorPrototype {
|
public class AbstractProcessSpawnerActor extends RecordActorPrototype {
|
||||||
|
|
||||||
private final MqPersistence persistence;
|
private final MqPersistence persistence;
|
||||||
private final ProcessService processService;
|
private final ProcessSpawnerService processSpawnerService;
|
||||||
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
public static final int MAX_ATTEMPTS = 3;
|
public static final int MAX_ATTEMPTS = 3;
|
||||||
private final String inboxName;
|
private final String inboxName;
|
||||||
private final ProcessService.ProcessId processId;
|
private final ProcessSpawnerService.ProcessId processId;
|
||||||
private final ExecutorService executorService = Executors.newSingleThreadExecutor();
|
private final ExecutorService executorService = Executors.newSingleThreadExecutor();
|
||||||
private final int node;
|
private final int node;
|
||||||
|
|
||||||
@@ -50,7 +53,7 @@ public class AbstractProcessSpawnerActor extends RecordActorPrototype {
|
|||||||
for (;;) {
|
for (;;) {
|
||||||
var messages = persistence.eavesdrop(inboxName, 1);
|
var messages = persistence.eavesdrop(inboxName, 1);
|
||||||
|
|
||||||
if (messages.isEmpty() && !processService.isRunning(processId)) {
|
if (messages.isEmpty() && !processSpawnerService.isRunning(processId)) {
|
||||||
synchronized (processId) {
|
synchronized (processId) {
|
||||||
processId.wait(5000);
|
processId.wait(5000);
|
||||||
}
|
}
|
||||||
@@ -92,7 +95,7 @@ public class AbstractProcessSpawnerActor extends RecordActorPrototype {
|
|||||||
catch (InterruptedException ex) {
|
catch (InterruptedException ex) {
|
||||||
// We get this exception when the process is cancelled by the user
|
// We get this exception when the process is cancelled by the user
|
||||||
|
|
||||||
processService.kill(processId);
|
processSpawnerService.kill(processId);
|
||||||
setCurrentMessageToDead();
|
setCurrentMessageToDead();
|
||||||
|
|
||||||
yield new Aborted();
|
yield new Aborted();
|
||||||
@@ -112,13 +115,13 @@ public class AbstractProcessSpawnerActor extends RecordActorPrototype {
|
|||||||
public AbstractProcessSpawnerActor(Gson gson,
|
public AbstractProcessSpawnerActor(Gson gson,
|
||||||
ServiceConfiguration configuration,
|
ServiceConfiguration configuration,
|
||||||
MqPersistence persistence,
|
MqPersistence persistence,
|
||||||
ProcessService processService,
|
ProcessSpawnerService processSpawnerService,
|
||||||
String inboxName,
|
String inboxName,
|
||||||
ProcessService.ProcessId processId) {
|
ProcessSpawnerService.ProcessId processId) {
|
||||||
super(gson);
|
super(gson);
|
||||||
this.node = configuration.node();
|
this.node = configuration.node();
|
||||||
this.persistence = persistence;
|
this.persistence = persistence;
|
||||||
this.processService = processService;
|
this.processSpawnerService = processSpawnerService;
|
||||||
this.inboxName = inboxName + ":" + node;
|
this.inboxName = inboxName + ":" + node;
|
||||||
this.processId = processId;
|
this.processId = processId;
|
||||||
}
|
}
|
||||||
@@ -149,7 +152,7 @@ public class AbstractProcessSpawnerActor extends RecordActorPrototype {
|
|||||||
// Run this call in a separate thread so that this thread can be interrupted waiting for it
|
// Run this call in a separate thread so that this thread can be interrupted waiting for it
|
||||||
executorService.submit(() -> {
|
executorService.submit(() -> {
|
||||||
try {
|
try {
|
||||||
processService.trigger(processId);
|
processSpawnerService.trigger(processId);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
logger.warn("Error in triggering process", e);
|
logger.warn("Error in triggering process", e);
|
||||||
error.set(true);
|
error.set(true);
|
||||||
|
@@ -4,9 +4,9 @@ import com.google.gson.Gson;
|
|||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||||
import nu.marginalia.process.ProcessService;
|
|
||||||
import nu.marginalia.mq.persistence.MqPersistence;
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||||
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
@@ -17,13 +17,13 @@ public class ConverterMonitorActor extends AbstractProcessSpawnerActor {
|
|||||||
public ConverterMonitorActor(Gson gson,
|
public ConverterMonitorActor(Gson gson,
|
||||||
ServiceConfiguration configuration,
|
ServiceConfiguration configuration,
|
||||||
MqPersistence persistence,
|
MqPersistence persistence,
|
||||||
ProcessService processService) {
|
ProcessSpawnerService processSpawnerService) {
|
||||||
super(gson,
|
super(gson,
|
||||||
configuration,
|
configuration,
|
||||||
persistence,
|
persistence,
|
||||||
processService,
|
processSpawnerService,
|
||||||
ProcessInboxNames.CONVERTER_INBOX,
|
ProcessInboxNames.CONVERTER_INBOX,
|
||||||
ProcessService.ProcessId.CONVERTER);
|
ProcessSpawnerService.ProcessId.CONVERTER);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@@ -4,9 +4,9 @@ import com.google.gson.Gson;
|
|||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||||
import nu.marginalia.process.ProcessService;
|
|
||||||
import nu.marginalia.mq.persistence.MqPersistence;
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||||
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
@@ -16,13 +16,13 @@ public class CrawlerMonitorActor extends AbstractProcessSpawnerActor {
|
|||||||
public CrawlerMonitorActor(Gson gson,
|
public CrawlerMonitorActor(Gson gson,
|
||||||
ServiceConfiguration configuration,
|
ServiceConfiguration configuration,
|
||||||
MqPersistence persistence,
|
MqPersistence persistence,
|
||||||
ProcessService processService) {
|
ProcessSpawnerService processSpawnerService) {
|
||||||
super(gson,
|
super(gson,
|
||||||
configuration,
|
configuration,
|
||||||
persistence,
|
persistence,
|
||||||
processService,
|
processSpawnerService,
|
||||||
ProcessInboxNames.CRAWLER_INBOX,
|
ProcessInboxNames.CRAWLER_INBOX,
|
||||||
ProcessService.ProcessId.CRAWLER);
|
ProcessSpawnerService.ProcessId.CRAWLER);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@@ -6,7 +6,7 @@ import com.google.inject.Singleton;
|
|||||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||||
import nu.marginalia.mq.persistence.MqPersistence;
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
@@ -16,13 +16,13 @@ public class ExportTaskMonitorActor extends AbstractProcessSpawnerActor {
|
|||||||
public ExportTaskMonitorActor(Gson gson,
|
public ExportTaskMonitorActor(Gson gson,
|
||||||
ServiceConfiguration configuration,
|
ServiceConfiguration configuration,
|
||||||
MqPersistence persistence,
|
MqPersistence persistence,
|
||||||
ProcessService processService) {
|
ProcessSpawnerService processSpawnerService) {
|
||||||
super(gson,
|
super(gson,
|
||||||
configuration,
|
configuration,
|
||||||
persistence,
|
persistence,
|
||||||
processService,
|
processSpawnerService,
|
||||||
ProcessInboxNames.EXPORT_TASK_INBOX,
|
ProcessInboxNames.EXPORT_TASK_INBOX,
|
||||||
ProcessService.ProcessId.EXPORT_TASKS);
|
ProcessSpawnerService.ProcessId.EXPORT_TASKS);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@@ -4,9 +4,9 @@ import com.google.gson.Gson;
|
|||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||||
import nu.marginalia.process.ProcessService;
|
|
||||||
import nu.marginalia.mq.persistence.MqPersistence;
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||||
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
@@ -17,13 +17,13 @@ public class IndexConstructorMonitorActor extends AbstractProcessSpawnerActor {
|
|||||||
public IndexConstructorMonitorActor(Gson gson,
|
public IndexConstructorMonitorActor(Gson gson,
|
||||||
ServiceConfiguration configuration,
|
ServiceConfiguration configuration,
|
||||||
MqPersistence persistence,
|
MqPersistence persistence,
|
||||||
ProcessService processService) {
|
ProcessSpawnerService processSpawnerService) {
|
||||||
super(gson,
|
super(gson,
|
||||||
configuration,
|
configuration,
|
||||||
persistence,
|
persistence,
|
||||||
processService,
|
processSpawnerService,
|
||||||
ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX,
|
ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX,
|
||||||
ProcessService.ProcessId.INDEX_CONSTRUCTOR);
|
ProcessSpawnerService.ProcessId.INDEX_CONSTRUCTOR);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@@ -6,7 +6,7 @@ import com.google.inject.Singleton;
|
|||||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||||
import nu.marginalia.mq.persistence.MqPersistence;
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
@@ -16,13 +16,13 @@ public class LiveCrawlerMonitorActor extends AbstractProcessSpawnerActor {
|
|||||||
public LiveCrawlerMonitorActor(Gson gson,
|
public LiveCrawlerMonitorActor(Gson gson,
|
||||||
ServiceConfiguration configuration,
|
ServiceConfiguration configuration,
|
||||||
MqPersistence persistence,
|
MqPersistence persistence,
|
||||||
ProcessService processService) {
|
ProcessSpawnerService processSpawnerService) {
|
||||||
super(gson,
|
super(gson,
|
||||||
configuration,
|
configuration,
|
||||||
persistence,
|
persistence,
|
||||||
processService,
|
processSpawnerService,
|
||||||
ProcessInboxNames.LIVE_CRAWLER_INBOX,
|
ProcessInboxNames.LIVE_CRAWLER_INBOX,
|
||||||
ProcessService.ProcessId.LIVE_CRAWLER);
|
ProcessSpawnerService.ProcessId.LIVE_CRAWLER);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@@ -4,9 +4,9 @@ import com.google.gson.Gson;
|
|||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||||
import nu.marginalia.process.ProcessService;
|
|
||||||
import nu.marginalia.mq.persistence.MqPersistence;
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||||
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
@@ -17,13 +17,13 @@ public class LoaderMonitorActor extends AbstractProcessSpawnerActor {
|
|||||||
public LoaderMonitorActor(Gson gson,
|
public LoaderMonitorActor(Gson gson,
|
||||||
ServiceConfiguration configuration,
|
ServiceConfiguration configuration,
|
||||||
MqPersistence persistence,
|
MqPersistence persistence,
|
||||||
ProcessService processService) {
|
ProcessSpawnerService processSpawnerService) {
|
||||||
|
|
||||||
super(gson,
|
super(gson,
|
||||||
configuration,
|
configuration,
|
||||||
persistence, processService,
|
persistence, processSpawnerService,
|
||||||
ProcessInboxNames.LOADER_INBOX,
|
ProcessInboxNames.LOADER_INBOX,
|
||||||
ProcessService.ProcessId.LOADER);
|
ProcessSpawnerService.ProcessId.LOADER);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@@ -0,0 +1,29 @@
|
|||||||
|
package nu.marginalia.actor.proc;
|
||||||
|
|
||||||
|
import com.google.gson.Gson;
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||||
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
|
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||||
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
|
public class NdpMonitorActor extends AbstractProcessSpawnerActor {
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public NdpMonitorActor(Gson gson,
|
||||||
|
ServiceConfiguration configuration,
|
||||||
|
MqPersistence persistence,
|
||||||
|
ProcessSpawnerService processSpawnerService) {
|
||||||
|
super(gson,
|
||||||
|
configuration,
|
||||||
|
persistence,
|
||||||
|
processSpawnerService,
|
||||||
|
ProcessInboxNames.NDP_INBOX,
|
||||||
|
ProcessSpawnerService.ProcessId.NDP);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
@@ -0,0 +1,181 @@
|
|||||||
|
package nu.marginalia.actor.proc;
|
||||||
|
|
||||||
|
import com.google.gson.Gson;
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||||
|
import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||||
|
import nu.marginalia.actor.state.ActorStep;
|
||||||
|
import nu.marginalia.actor.state.Resume;
|
||||||
|
import nu.marginalia.actor.state.Terminal;
|
||||||
|
import nu.marginalia.mq.MqMessageState;
|
||||||
|
import nu.marginalia.mq.persistence.MqMessageHandlerRegistry;
|
||||||
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
|
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||||
|
import nu.marginalia.mqapi.ping.PingRequest;
|
||||||
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.sql.SQLException;
|
||||||
|
import java.util.concurrent.ExecutionException;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
|
|
||||||
|
|
||||||
|
// Unlike other monitor actors, the ping monitor will not merely wait for a request
|
||||||
|
// to be sent, but send one itself, hence we can't extend AbstractProcessSpawnerActor
|
||||||
|
// but have to reimplement a lot of the same logic ourselves.
|
||||||
|
@Singleton
|
||||||
|
public class PingMonitorActor extends RecordActorPrototype {
|
||||||
|
|
||||||
|
private final MqPersistence persistence;
|
||||||
|
private final ProcessSpawnerService processSpawnerService;
|
||||||
|
|
||||||
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
|
public static final int MAX_ATTEMPTS = 3;
|
||||||
|
private final String inboxName;
|
||||||
|
private final ProcessSpawnerService.ProcessId processId;
|
||||||
|
private final ExecutorService executorService = Executors.newSingleThreadExecutor();
|
||||||
|
private final int node;
|
||||||
|
private final Gson gson;
|
||||||
|
|
||||||
|
public record Initial() implements ActorStep {}
|
||||||
|
@Resume(behavior = ActorResumeBehavior.RETRY)
|
||||||
|
public record Monitor(int errorAttempts) implements ActorStep {}
|
||||||
|
@Resume(behavior = ActorResumeBehavior.RESTART)
|
||||||
|
public record Run(int attempts) implements ActorStep {}
|
||||||
|
@Terminal
|
||||||
|
public record Aborted() implements ActorStep {}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ActorStep transition(ActorStep self) throws Exception {
|
||||||
|
return switch (self) {
|
||||||
|
case Initial i -> {
|
||||||
|
PingRequest request = new PingRequest();
|
||||||
|
persistence.sendNewMessage(inboxName, null, null,
|
||||||
|
"PingRequest",
|
||||||
|
gson.toJson(request),
|
||||||
|
null);
|
||||||
|
|
||||||
|
yield new Monitor(0);
|
||||||
|
}
|
||||||
|
case Monitor(int errorAttempts) -> {
|
||||||
|
for (;;) {
|
||||||
|
var messages = persistence.eavesdrop(inboxName, 1);
|
||||||
|
|
||||||
|
if (messages.isEmpty() && !processSpawnerService.isRunning(processId)) {
|
||||||
|
synchronized (processId) {
|
||||||
|
processId.wait(5000);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (errorAttempts > 0) { // Reset the error counter if there is silence in the inbox
|
||||||
|
yield new Monitor(0);
|
||||||
|
}
|
||||||
|
// else continue
|
||||||
|
} else {
|
||||||
|
// Special: Associate this thread with the message so that we can get tracking
|
||||||
|
MqMessageHandlerRegistry.register(messages.getFirst().msgId());
|
||||||
|
|
||||||
|
yield new Run(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case Run(int attempts) -> {
|
||||||
|
try {
|
||||||
|
long startTime = System.currentTimeMillis();
|
||||||
|
var exec = new TaskExecution();
|
||||||
|
long endTime = System.currentTimeMillis();
|
||||||
|
|
||||||
|
if (exec.isError()) {
|
||||||
|
if (attempts < MAX_ATTEMPTS)
|
||||||
|
yield new Run(attempts + 1);
|
||||||
|
else
|
||||||
|
yield new Error();
|
||||||
|
}
|
||||||
|
else if (endTime - startTime < TimeUnit.SECONDS.toMillis(1)) {
|
||||||
|
// To avoid boot loops, we transition to error if the process
|
||||||
|
// didn't run for longer than 1 seconds. This might happen if
|
||||||
|
// the process crashes before it can reach the heartbeat and inbox
|
||||||
|
// stages of execution. In this case it would not report having acted
|
||||||
|
// on its message, and the process would be restarted forever without
|
||||||
|
// the attempts counter incrementing.
|
||||||
|
yield new Error("Process terminated within 1 seconds of starting");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (InterruptedException ex) {
|
||||||
|
// We get this exception when the process is cancelled by the user
|
||||||
|
|
||||||
|
processSpawnerService.kill(processId);
|
||||||
|
setCurrentMessageToDead();
|
||||||
|
|
||||||
|
yield new Aborted();
|
||||||
|
}
|
||||||
|
|
||||||
|
yield new Monitor(attempts);
|
||||||
|
}
|
||||||
|
default -> new Error();
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
public String describe() {
|
||||||
|
return "Spawns a(n) " + processId + " process and monitors its inbox for messages";
|
||||||
|
}
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public PingMonitorActor(Gson gson,
|
||||||
|
ServiceConfiguration configuration,
|
||||||
|
MqPersistence persistence,
|
||||||
|
ProcessSpawnerService processSpawnerService) throws SQLException {
|
||||||
|
super(gson);
|
||||||
|
this.gson = gson;
|
||||||
|
this.node = configuration.node();
|
||||||
|
this.persistence = persistence;
|
||||||
|
this.processSpawnerService = processSpawnerService;
|
||||||
|
this.inboxName = ProcessInboxNames.PING_INBOX + ":" + node;
|
||||||
|
this.processId = ProcessSpawnerService.ProcessId.PING;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Sets the message to dead in the database to avoid
|
||||||
|
* the service respawning on the same task when we
|
||||||
|
* re-enable this actor */
|
||||||
|
private void setCurrentMessageToDead() {
|
||||||
|
try {
|
||||||
|
var messages = persistence.eavesdrop(inboxName, 1);
|
||||||
|
|
||||||
|
if (messages.isEmpty()) // Possibly a race condition where the task is already finished
|
||||||
|
return;
|
||||||
|
|
||||||
|
var theMessage = messages.iterator().next();
|
||||||
|
persistence.updateMessageState(theMessage.msgId(), MqMessageState.DEAD);
|
||||||
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
logger.error("Tried but failed to set the message for " + processId + " to dead", ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Encapsulates the execution of the process in a separate thread so that
|
||||||
|
* we can interrupt the thread if the process is cancelled */
|
||||||
|
private class TaskExecution {
|
||||||
|
private final AtomicBoolean error = new AtomicBoolean(false);
|
||||||
|
public TaskExecution() throws ExecutionException, InterruptedException {
|
||||||
|
// Run this call in a separate thread so that this thread can be interrupted waiting for it
|
||||||
|
executorService.submit(() -> {
|
||||||
|
try {
|
||||||
|
processSpawnerService.trigger(processId);
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.warn("Error in triggering process", e);
|
||||||
|
error.set(true);
|
||||||
|
}
|
||||||
|
}).get(); // Wait for the process to start
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isError() {
|
||||||
|
return error.get();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@@ -8,7 +8,7 @@ import nu.marginalia.actor.prototype.RecordActorPrototype;
|
|||||||
import nu.marginalia.actor.state.ActorResumeBehavior;
|
import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||||
import nu.marginalia.actor.state.ActorStep;
|
import nu.marginalia.actor.state.ActorStep;
|
||||||
import nu.marginalia.actor.state.Resume;
|
import nu.marginalia.actor.state.Resume;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.service.control.ServiceEventLog;
|
import nu.marginalia.service.control.ServiceEventLog;
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
|
|
||||||
@@ -21,7 +21,7 @@ import java.util.concurrent.TimeUnit;
|
|||||||
public class ProcessLivenessMonitorActor extends RecordActorPrototype {
|
public class ProcessLivenessMonitorActor extends RecordActorPrototype {
|
||||||
|
|
||||||
private final ServiceEventLog eventLogService;
|
private final ServiceEventLog eventLogService;
|
||||||
private final ProcessService processService;
|
private final ProcessSpawnerService processSpawnerService;
|
||||||
private final HikariDataSource dataSource;
|
private final HikariDataSource dataSource;
|
||||||
|
|
||||||
private final int node;
|
private final int node;
|
||||||
@@ -49,7 +49,7 @@ public class ProcessLivenessMonitorActor extends RecordActorPrototype {
|
|||||||
var processId = heartbeat.getProcessId();
|
var processId = heartbeat.getProcessId();
|
||||||
if (null == processId) continue;
|
if (null == processId) continue;
|
||||||
|
|
||||||
if (processService.isRunning(processId) && heartbeat.lastSeenMillis() < 10_000)
|
if (processSpawnerService.isRunning(processId) && heartbeat.lastSeenMillis() < 10_000)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
flagProcessAsStopped(heartbeat);
|
flagProcessAsStopped(heartbeat);
|
||||||
@@ -72,12 +72,12 @@ public class ProcessLivenessMonitorActor extends RecordActorPrototype {
|
|||||||
public ProcessLivenessMonitorActor(Gson gson,
|
public ProcessLivenessMonitorActor(Gson gson,
|
||||||
ServiceEventLog eventLogService,
|
ServiceEventLog eventLogService,
|
||||||
ServiceConfiguration configuration,
|
ServiceConfiguration configuration,
|
||||||
ProcessService processService,
|
ProcessSpawnerService processSpawnerService,
|
||||||
HikariDataSource dataSource) {
|
HikariDataSource dataSource) {
|
||||||
super(gson);
|
super(gson);
|
||||||
this.node = configuration.node();
|
this.node = configuration.node();
|
||||||
this.eventLogService = eventLogService;
|
this.eventLogService = eventLogService;
|
||||||
this.processService = processService;
|
this.processSpawnerService = processSpawnerService;
|
||||||
this.dataSource = dataSource;
|
this.dataSource = dataSource;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -208,8 +208,8 @@ public class ProcessLivenessMonitorActor extends RecordActorPrototype {
|
|||||||
public boolean isRunning() {
|
public boolean isRunning() {
|
||||||
return "RUNNING".equals(status);
|
return "RUNNING".equals(status);
|
||||||
}
|
}
|
||||||
public ProcessService.ProcessId getProcessId() {
|
public ProcessSpawnerService.ProcessId getProcessId() {
|
||||||
return ProcessService.translateExternalIdBase(processBase);
|
return ProcessSpawnerService.translateExternalIdBase(processBase);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -47,6 +47,8 @@ public class ScrapeFeedsActor extends RecordActorPrototype {
|
|||||||
|
|
||||||
private final Path feedPath = WmsaHome.getHomePath().resolve("data/scrape-urls.txt");
|
private final Path feedPath = WmsaHome.getHomePath().resolve("data/scrape-urls.txt");
|
||||||
|
|
||||||
|
private static boolean insertFoundDomains = Boolean.getBoolean("loader.insertFoundDomains");
|
||||||
|
|
||||||
public record Initial() implements ActorStep {}
|
public record Initial() implements ActorStep {}
|
||||||
@Resume(behavior = ActorResumeBehavior.RETRY)
|
@Resume(behavior = ActorResumeBehavior.RETRY)
|
||||||
public record Wait(String ts) implements ActorStep {}
|
public record Wait(String ts) implements ActorStep {}
|
||||||
@@ -57,6 +59,8 @@ public class ScrapeFeedsActor extends RecordActorPrototype {
|
|||||||
public ActorStep transition(ActorStep self) throws Exception {
|
public ActorStep transition(ActorStep self) throws Exception {
|
||||||
return switch(self) {
|
return switch(self) {
|
||||||
case Initial() -> {
|
case Initial() -> {
|
||||||
|
if (!insertFoundDomains) yield new Error("Domain insertion prohibited, aborting");
|
||||||
|
|
||||||
if (nodeConfigurationService.get(nodeId).profile() != NodeProfile.REALTIME) {
|
if (nodeConfigurationService.get(nodeId).profile() != NodeProfile.REALTIME) {
|
||||||
yield new Error("Invalid node profile for RSS update");
|
yield new Error("Invalid node profile for RSS update");
|
||||||
}
|
}
|
||||||
|
@@ -14,6 +14,8 @@ import nu.marginalia.mq.persistence.MqPersistence;
|
|||||||
import nu.marginalia.nodecfg.NodeConfigurationService;
|
import nu.marginalia.nodecfg.NodeConfigurationService;
|
||||||
import nu.marginalia.nodecfg.model.NodeProfile;
|
import nu.marginalia.nodecfg.model.NodeProfile;
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.time.Duration;
|
import java.time.Duration;
|
||||||
import java.time.LocalDateTime;
|
import java.time.LocalDateTime;
|
||||||
@@ -29,6 +31,7 @@ public class UpdateRssActor extends RecordActorPrototype {
|
|||||||
|
|
||||||
private final NodeConfigurationService nodeConfigurationService;
|
private final NodeConfigurationService nodeConfigurationService;
|
||||||
private final MqPersistence persistence;
|
private final MqPersistence persistence;
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(UpdateRssActor.class);
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public UpdateRssActor(Gson gson,
|
public UpdateRssActor(Gson gson,
|
||||||
@@ -101,8 +104,8 @@ public class UpdateRssActor extends RecordActorPrototype {
|
|||||||
case UpdateRefresh(int count, long msgId) -> {
|
case UpdateRefresh(int count, long msgId) -> {
|
||||||
MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
|
MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
|
||||||
if (msg == null) {
|
if (msg == null) {
|
||||||
// Retry the update
|
logger.warn("UpdateRefresh is taking a very long time");
|
||||||
yield new Error("Failed to update feeds: message not found");
|
yield new UpdateRefresh(count, msgId);
|
||||||
} else if (msg.state() != MqMessageState.OK) {
|
} else if (msg.state() != MqMessageState.OK) {
|
||||||
// Retry the update
|
// Retry the update
|
||||||
yield new Error("Failed to update feeds: " + msg.state());
|
yield new Error("Failed to update feeds: " + msg.state());
|
||||||
@@ -119,8 +122,8 @@ public class UpdateRssActor extends RecordActorPrototype {
|
|||||||
case UpdateClean(long msgId) -> {
|
case UpdateClean(long msgId) -> {
|
||||||
MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
|
MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
|
||||||
if (msg == null) {
|
if (msg == null) {
|
||||||
// Retry the update
|
logger.warn("UpdateClean is taking a very long time");
|
||||||
yield new Error("Failed to update feeds: message not found");
|
yield new UpdateClean(msgId);
|
||||||
} else if (msg.state() != MqMessageState.OK) {
|
} else if (msg.state() != MqMessageState.OK) {
|
||||||
// Retry the update
|
// Retry the update
|
||||||
yield new Error("Failed to update feeds: " + msg.state());
|
yield new Error("Failed to update feeds: " + msg.state());
|
||||||
|
@@ -3,11 +3,11 @@ package nu.marginalia.actor.task;
|
|||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.actor.state.ActorControlFlowException;
|
import nu.marginalia.actor.state.ActorControlFlowException;
|
||||||
import nu.marginalia.mq.MqMessageState;
|
|
||||||
import nu.marginalia.mq.persistence.MqPersistence;
|
|
||||||
import nu.marginalia.process.ProcessService;
|
|
||||||
import nu.marginalia.mq.MqMessage;
|
import nu.marginalia.mq.MqMessage;
|
||||||
|
import nu.marginalia.mq.MqMessageState;
|
||||||
import nu.marginalia.mq.outbox.MqOutbox;
|
import nu.marginalia.mq.outbox.MqOutbox;
|
||||||
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@@ -20,13 +20,13 @@ public class ActorProcessWatcher {
|
|||||||
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(ActorProcessWatcher.class);
|
private static final Logger logger = LoggerFactory.getLogger(ActorProcessWatcher.class);
|
||||||
private final MqPersistence persistence;
|
private final MqPersistence persistence;
|
||||||
private final ProcessService processService;
|
private final ProcessSpawnerService processSpawnerService;
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public ActorProcessWatcher(MqPersistence persistence,
|
public ActorProcessWatcher(MqPersistence persistence,
|
||||||
ProcessService processService) {
|
ProcessSpawnerService processSpawnerService) {
|
||||||
this.persistence = persistence;
|
this.persistence = persistence;
|
||||||
this.processService = processService;
|
this.processSpawnerService = processSpawnerService;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Wait for a process to start, and then wait for a response from the process,
|
/** Wait for a process to start, and then wait for a response from the process,
|
||||||
@@ -36,7 +36,7 @@ public class ActorProcessWatcher {
|
|||||||
* <p>
|
* <p>
|
||||||
* When interrupted, the process is killed and the message is marked as dead.
|
* When interrupted, the process is killed and the message is marked as dead.
|
||||||
*/
|
*/
|
||||||
public MqMessage waitResponse(MqOutbox outbox, ProcessService.ProcessId processId, long msgId)
|
public MqMessage waitResponse(MqOutbox outbox, ProcessSpawnerService.ProcessId processId, long msgId)
|
||||||
throws ActorControlFlowException, InterruptedException, SQLException
|
throws ActorControlFlowException, InterruptedException, SQLException
|
||||||
{
|
{
|
||||||
// enums values only have a single instance,
|
// enums values only have a single instance,
|
||||||
@@ -65,7 +65,7 @@ public class ActorProcessWatcher {
|
|||||||
// This will prevent the monitor process from attempting to respawn the process as we kill it
|
// This will prevent the monitor process from attempting to respawn the process as we kill it
|
||||||
|
|
||||||
outbox.flagAsDead(msgId);
|
outbox.flagAsDead(msgId);
|
||||||
processService.kill(processId);
|
processSpawnerService.kill(processId);
|
||||||
|
|
||||||
logger.info("Process {} killed due to interrupt", processId);
|
logger.info("Process {} killed due to interrupt", processId);
|
||||||
}
|
}
|
||||||
@@ -94,12 +94,12 @@ public class ActorProcessWatcher {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/** Wait the specified time for the specified process to start running (does not start the process) */
|
/** Wait the specified time for the specified process to start running (does not start the process) */
|
||||||
private boolean waitForProcess(ProcessService.ProcessId processId, TimeUnit unit, int duration) throws InterruptedException {
|
private boolean waitForProcess(ProcessSpawnerService.ProcessId processId, TimeUnit unit, int duration) throws InterruptedException {
|
||||||
|
|
||||||
// Wait for process to start
|
// Wait for process to start
|
||||||
long deadline = System.currentTimeMillis() + unit.toMillis(duration);
|
long deadline = System.currentTimeMillis() + unit.toMillis(duration);
|
||||||
while (System.currentTimeMillis() < deadline) {
|
while (System.currentTimeMillis() < deadline) {
|
||||||
if (processService.isRunning(processId))
|
if (processSpawnerService.isRunning(processId))
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
TimeUnit.MILLISECONDS.sleep(100);
|
TimeUnit.MILLISECONDS.sleep(100);
|
||||||
|
@@ -12,7 +12,7 @@ import nu.marginalia.mq.MqMessageState;
|
|||||||
import nu.marginalia.mq.outbox.MqOutbox;
|
import nu.marginalia.mq.outbox.MqOutbox;
|
||||||
import nu.marginalia.mqapi.converting.ConvertRequest;
|
import nu.marginalia.mqapi.converting.ConvertRequest;
|
||||||
import nu.marginalia.process.ProcessOutboxes;
|
import nu.marginalia.process.ProcessOutboxes;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.sideload.RedditSideloadHelper;
|
import nu.marginalia.sideload.RedditSideloadHelper;
|
||||||
import nu.marginalia.sideload.SideloadHelper;
|
import nu.marginalia.sideload.SideloadHelper;
|
||||||
import nu.marginalia.sideload.StackExchangeSideloadHelper;
|
import nu.marginalia.sideload.StackExchangeSideloadHelper;
|
||||||
@@ -218,7 +218,7 @@ public class ConvertActor extends RecordActorPrototype {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
case ConvertWait(FileStorageId destFid, long msgId) -> {
|
case ConvertWait(FileStorageId destFid, long msgId) -> {
|
||||||
var rsp = processWatcher.waitResponse(mqConverterOutbox, ProcessService.ProcessId.CONVERTER, msgId);
|
var rsp = processWatcher.waitResponse(mqConverterOutbox, ProcessSpawnerService.ProcessId.CONVERTER, msgId);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK) {
|
if (rsp.state() != MqMessageState.OK) {
|
||||||
yield new Error("Converter failed");
|
yield new Error("Converter failed");
|
||||||
|
@@ -18,7 +18,7 @@ import nu.marginalia.mqapi.index.IndexName;
|
|||||||
import nu.marginalia.mqapi.loading.LoadRequest;
|
import nu.marginalia.mqapi.loading.LoadRequest;
|
||||||
import nu.marginalia.nodecfg.NodeConfigurationService;
|
import nu.marginalia.nodecfg.NodeConfigurationService;
|
||||||
import nu.marginalia.process.ProcessOutboxes;
|
import nu.marginalia.process.ProcessOutboxes;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import nu.marginalia.storage.model.FileStorageId;
|
import nu.marginalia.storage.model.FileStorageId;
|
||||||
@@ -95,7 +95,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
|
|||||||
case Convert(FileStorageId crawlId, FileStorageId processedId, long msgId) when msgId < 0 ->
|
case Convert(FileStorageId crawlId, FileStorageId processedId, long msgId) when msgId < 0 ->
|
||||||
new Convert(crawlId, processedId, mqConverterOutbox.sendAsync(ConvertRequest.forCrawlData(crawlId, processedId)));
|
new Convert(crawlId, processedId, mqConverterOutbox.sendAsync(ConvertRequest.forCrawlData(crawlId, processedId)));
|
||||||
case Convert(FileStorageId crawlId, FileStorageId processedId, long msgId) -> {
|
case Convert(FileStorageId crawlId, FileStorageId processedId, long msgId) -> {
|
||||||
var rsp = processWatcher.waitResponse(mqConverterOutbox, ProcessService.ProcessId.CONVERTER, msgId);
|
var rsp = processWatcher.waitResponse(mqConverterOutbox, ProcessSpawnerService.ProcessId.CONVERTER, msgId);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK)
|
if (rsp.state() != MqMessageState.OK)
|
||||||
yield new Error("Converter failed");
|
yield new Error("Converter failed");
|
||||||
@@ -129,7 +129,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
|
|||||||
yield new Load(processedIds, id);
|
yield new Load(processedIds, id);
|
||||||
}
|
}
|
||||||
case Load(List<FileStorageId> processedIds, long msgId) -> {
|
case Load(List<FileStorageId> processedIds, long msgId) -> {
|
||||||
var rsp = processWatcher.waitResponse(mqLoaderOutbox, ProcessService.ProcessId.LOADER, msgId);
|
var rsp = processWatcher.waitResponse(mqLoaderOutbox, ProcessSpawnerService.ProcessId.LOADER, msgId);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK) {
|
if (rsp.state() != MqMessageState.OK) {
|
||||||
yield new Error("Loader failed");
|
yield new Error("Loader failed");
|
||||||
@@ -165,7 +165,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
|
|||||||
}
|
}
|
||||||
case ReindexFwd(long id) when id < 0 -> new ReindexFwd(createIndex(IndexName.FORWARD));
|
case ReindexFwd(long id) when id < 0 -> new ReindexFwd(createIndex(IndexName.FORWARD));
|
||||||
case ReindexFwd(long id) -> {
|
case ReindexFwd(long id) -> {
|
||||||
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessService.ProcessId.INDEX_CONSTRUCTOR, id);
|
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessSpawnerService.ProcessId.INDEX_CONSTRUCTOR, id);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK)
|
if (rsp.state() != MqMessageState.OK)
|
||||||
yield new Error("Forward index construction failed");
|
yield new Error("Forward index construction failed");
|
||||||
@@ -174,7 +174,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
|
|||||||
}
|
}
|
||||||
case ReindexFull(long id) when id < 0 -> new ReindexFull(createIndex(IndexName.REVERSE_FULL));
|
case ReindexFull(long id) when id < 0 -> new ReindexFull(createIndex(IndexName.REVERSE_FULL));
|
||||||
case ReindexFull(long id) -> {
|
case ReindexFull(long id) -> {
|
||||||
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessService.ProcessId.INDEX_CONSTRUCTOR, id);
|
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessSpawnerService.ProcessId.INDEX_CONSTRUCTOR, id);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK)
|
if (rsp.state() != MqMessageState.OK)
|
||||||
yield new Error("Full index construction failed");
|
yield new Error("Full index construction failed");
|
||||||
@@ -183,7 +183,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
|
|||||||
}
|
}
|
||||||
case ReindexPrio(long id) when id < 0 -> new ReindexPrio(createIndex(IndexName.REVERSE_PRIO));
|
case ReindexPrio(long id) when id < 0 -> new ReindexPrio(createIndex(IndexName.REVERSE_PRIO));
|
||||||
case ReindexPrio(long id) -> {
|
case ReindexPrio(long id) -> {
|
||||||
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessService.ProcessId.INDEX_CONSTRUCTOR, id);
|
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessSpawnerService.ProcessId.INDEX_CONSTRUCTOR, id);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK)
|
if (rsp.state() != MqMessageState.OK)
|
||||||
yield new Error("Prio index construction failed");
|
yield new Error("Prio index construction failed");
|
||||||
|
@@ -13,7 +13,7 @@ import nu.marginalia.mq.MqMessageState;
|
|||||||
import nu.marginalia.mq.outbox.MqOutbox;
|
import nu.marginalia.mq.outbox.MqOutbox;
|
||||||
import nu.marginalia.mqapi.crawling.CrawlRequest;
|
import nu.marginalia.mqapi.crawling.CrawlRequest;
|
||||||
import nu.marginalia.process.ProcessOutboxes;
|
import nu.marginalia.process.ProcessOutboxes;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import nu.marginalia.storage.model.FileStorageId;
|
import nu.marginalia.storage.model.FileStorageId;
|
||||||
import nu.marginalia.storage.model.FileStorageType;
|
import nu.marginalia.storage.model.FileStorageType;
|
||||||
@@ -76,7 +76,7 @@ public class CrawlActor extends RecordActorPrototype {
|
|||||||
case Crawl (long msgId, FileStorageId fid, boolean cascadeLoad) -> {
|
case Crawl (long msgId, FileStorageId fid, boolean cascadeLoad) -> {
|
||||||
var rsp = processWatcher.waitResponse(
|
var rsp = processWatcher.waitResponse(
|
||||||
mqCrawlerOutbox,
|
mqCrawlerOutbox,
|
||||||
ProcessService.ProcessId.CRAWLER,
|
ProcessSpawnerService.ProcessId.CRAWLER,
|
||||||
msgId);
|
msgId);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK) {
|
if (rsp.state() != MqMessageState.OK) {
|
||||||
|
@@ -8,6 +8,7 @@ import nu.marginalia.actor.state.ActorResumeBehavior;
|
|||||||
import nu.marginalia.actor.state.ActorStep;
|
import nu.marginalia.actor.state.ActorStep;
|
||||||
import nu.marginalia.actor.state.Resume;
|
import nu.marginalia.actor.state.Resume;
|
||||||
import nu.marginalia.service.control.ServiceEventLog;
|
import nu.marginalia.service.control.ServiceEventLog;
|
||||||
|
import nu.marginalia.service.control.ServiceHeartbeat;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import nu.marginalia.storage.model.FileStorage;
|
import nu.marginalia.storage.model.FileStorage;
|
||||||
import nu.marginalia.storage.model.FileStorageId;
|
import nu.marginalia.storage.model.FileStorageId;
|
||||||
@@ -19,6 +20,7 @@ import org.slf4j.Logger;
|
|||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
|
import java.net.HttpURLConnection;
|
||||||
import java.net.MalformedURLException;
|
import java.net.MalformedURLException;
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
import java.net.URL;
|
import java.net.URL;
|
||||||
@@ -32,6 +34,7 @@ public class DownloadSampleActor extends RecordActorPrototype {
|
|||||||
|
|
||||||
private final FileStorageService storageService;
|
private final FileStorageService storageService;
|
||||||
private final ServiceEventLog eventLog;
|
private final ServiceEventLog eventLog;
|
||||||
|
private final ServiceHeartbeat heartbeat;
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
@Resume(behavior = ActorResumeBehavior.ERROR)
|
@Resume(behavior = ActorResumeBehavior.ERROR)
|
||||||
@@ -66,15 +69,39 @@ public class DownloadSampleActor extends RecordActorPrototype {
|
|||||||
|
|
||||||
Files.deleteIfExists(Path.of(tarFileName));
|
Files.deleteIfExists(Path.of(tarFileName));
|
||||||
|
|
||||||
try (var is = new BufferedInputStream(new URI(downloadURI).toURL().openStream());
|
HttpURLConnection urlConnection = (HttpURLConnection) new URI(downloadURI).toURL().openConnection();
|
||||||
var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
|
|
||||||
is.transferTo(os);
|
try (var hb = heartbeat.createServiceAdHocTaskHeartbeat("Downloading sample")) {
|
||||||
|
long size = urlConnection.getContentLengthLong();
|
||||||
|
byte[] buffer = new byte[8192];
|
||||||
|
|
||||||
|
try (var is = new BufferedInputStream(urlConnection.getInputStream());
|
||||||
|
var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
|
||||||
|
long copiedSize = 0;
|
||||||
|
|
||||||
|
while (copiedSize < size) {
|
||||||
|
int read = is.read(buffer);
|
||||||
|
|
||||||
|
if (read < 0) // We've been promised a file of length 'size'
|
||||||
|
throw new IOException("Unexpected end of stream");
|
||||||
|
|
||||||
|
os.write(buffer, 0, read);
|
||||||
|
copiedSize += read;
|
||||||
|
|
||||||
|
// Update progress bar
|
||||||
|
hb.progress(String.format("%d MB", copiedSize / 1024 / 1024), (int) (copiedSize / 1024), (int) (size / 1024));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
eventLog.logEvent(DownloadSampleActor.class, "Error downloading sample");
|
eventLog.logEvent(DownloadSampleActor.class, "Error downloading sample");
|
||||||
logger.error("Error downloading sample", ex);
|
logger.error("Error downloading sample", ex);
|
||||||
yield new Error();
|
yield new Error();
|
||||||
}
|
}
|
||||||
|
finally {
|
||||||
|
urlConnection.disconnect();
|
||||||
|
}
|
||||||
|
|
||||||
eventLog.logEvent(DownloadSampleActor.class, "Download complete");
|
eventLog.logEvent(DownloadSampleActor.class, "Download complete");
|
||||||
yield new Extract(fileStorageId, tarFileName);
|
yield new Extract(fileStorageId, tarFileName);
|
||||||
@@ -170,11 +197,12 @@ public class DownloadSampleActor extends RecordActorPrototype {
|
|||||||
@Inject
|
@Inject
|
||||||
public DownloadSampleActor(Gson gson,
|
public DownloadSampleActor(Gson gson,
|
||||||
FileStorageService storageService,
|
FileStorageService storageService,
|
||||||
ServiceEventLog eventLog)
|
ServiceEventLog eventLog, ServiceHeartbeat heartbeat)
|
||||||
{
|
{
|
||||||
super(gson);
|
super(gson);
|
||||||
this.storageService = storageService;
|
this.storageService = storageService;
|
||||||
this.eventLog = eventLog;
|
this.eventLog = eventLog;
|
||||||
|
this.heartbeat = heartbeat;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@@ -10,7 +10,7 @@ import nu.marginalia.mq.outbox.MqOutbox;
|
|||||||
import nu.marginalia.mq.persistence.MqPersistence;
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
||||||
import nu.marginalia.process.ProcessOutboxes;
|
import nu.marginalia.process.ProcessOutboxes;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import nu.marginalia.storage.model.FileStorageId;
|
import nu.marginalia.storage.model.FileStorageId;
|
||||||
import nu.marginalia.storage.model.FileStorageState;
|
import nu.marginalia.storage.model.FileStorageState;
|
||||||
@@ -55,7 +55,7 @@ public class ExportAtagsActor extends RecordActorPrototype {
|
|||||||
yield new Run(responseMsgId, crawlId, destId, newMsgId);
|
yield new Run(responseMsgId, crawlId, destId, newMsgId);
|
||||||
}
|
}
|
||||||
case Run(long responseMsgId, FileStorageId crawlId, FileStorageId destId, long msgId) -> {
|
case Run(long responseMsgId, FileStorageId crawlId, FileStorageId destId, long msgId) -> {
|
||||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessSpawnerService.ProcessId.EXPORT_TASKS, msgId);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK) {
|
if (rsp.state() != MqMessageState.OK) {
|
||||||
storageService.flagFileForDeletion(destId);
|
storageService.flagFileForDeletion(destId);
|
||||||
|
@@ -10,7 +10,7 @@ import nu.marginalia.mq.outbox.MqOutbox;
|
|||||||
import nu.marginalia.mq.persistence.MqPersistence;
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
||||||
import nu.marginalia.process.ProcessOutboxes;
|
import nu.marginalia.process.ProcessOutboxes;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import nu.marginalia.storage.model.FileStorageId;
|
import nu.marginalia.storage.model.FileStorageId;
|
||||||
import nu.marginalia.storage.model.FileStorageState;
|
import nu.marginalia.storage.model.FileStorageState;
|
||||||
@@ -54,7 +54,7 @@ public class ExportFeedsActor extends RecordActorPrototype {
|
|||||||
yield new Run(responseMsgId, crawlId, destId, newMsgId);
|
yield new Run(responseMsgId, crawlId, destId, newMsgId);
|
||||||
}
|
}
|
||||||
case Run(long responseMsgId, _, FileStorageId destId, long msgId) -> {
|
case Run(long responseMsgId, _, FileStorageId destId, long msgId) -> {
|
||||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessSpawnerService.ProcessId.EXPORT_TASKS, msgId);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK) {
|
if (rsp.state() != MqMessageState.OK) {
|
||||||
storageService.flagFileForDeletion(destId);
|
storageService.flagFileForDeletion(destId);
|
||||||
|
@@ -9,7 +9,7 @@ import nu.marginalia.mq.MqMessageState;
|
|||||||
import nu.marginalia.mq.outbox.MqOutbox;
|
import nu.marginalia.mq.outbox.MqOutbox;
|
||||||
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
||||||
import nu.marginalia.process.ProcessOutboxes;
|
import nu.marginalia.process.ProcessOutboxes;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import nu.marginalia.storage.model.FileStorageId;
|
import nu.marginalia.storage.model.FileStorageId;
|
||||||
import nu.marginalia.storage.model.FileStorageState;
|
import nu.marginalia.storage.model.FileStorageState;
|
||||||
@@ -26,33 +26,33 @@ public class ExportSampleDataActor extends RecordActorPrototype {
|
|||||||
private final MqOutbox exportTasksOutbox;
|
private final MqOutbox exportTasksOutbox;
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
public record Export(FileStorageId crawlId, int size, String name) implements ActorStep {}
|
public record Export(FileStorageId crawlId, int size, String ctFilter, String name) implements ActorStep {}
|
||||||
public record Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) implements ActorStep {
|
public record Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name, long msgId) implements ActorStep {
|
||||||
public Run(FileStorageId crawlId, FileStorageId destId, int size, String name) {
|
public Run(FileStorageId crawlId, FileStorageId destId, int size, String name, String ctFilter) {
|
||||||
this(crawlId, destId, size, name, -1);
|
this(crawlId, destId, size, name, ctFilter,-1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ActorStep transition(ActorStep self) throws Exception {
|
public ActorStep transition(ActorStep self) throws Exception {
|
||||||
return switch(self) {
|
return switch(self) {
|
||||||
case Export(FileStorageId crawlId, int size, String name) -> {
|
case Export(FileStorageId crawlId, int size, String ctFilter, String name) -> {
|
||||||
var storage = storageService.allocateStorage(FileStorageType.EXPORT,
|
var storage = storageService.allocateStorage(FileStorageType.EXPORT,
|
||||||
"crawl-sample-export",
|
"crawl-sample-export",
|
||||||
"Crawl Data Sample " + name + "/" + size + " " + LocalDateTime.now()
|
"Crawl Data Sample " + name + "/" + size + " " + LocalDateTime.now()
|
||||||
);
|
);
|
||||||
|
|
||||||
if (storage == null) yield new Error("Bad storage id");
|
if (storage == null) yield new Error("Bad storage id");
|
||||||
yield new Run(crawlId, storage.id(), size, name);
|
yield new Run(crawlId, storage.id(), size, ctFilter, name);
|
||||||
}
|
}
|
||||||
case Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) when msgId < 0 -> {
|
case Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name, long msgId) when msgId < 0 -> {
|
||||||
storageService.setFileStorageState(destId, FileStorageState.NEW);
|
storageService.setFileStorageState(destId, FileStorageState.NEW);
|
||||||
|
|
||||||
long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, size, name));
|
long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, ctFilter, size, name));
|
||||||
yield new Run(crawlId, destId, size, name, newMsgId);
|
yield new Run(crawlId, destId, size, ctFilter, name, newMsgId);
|
||||||
}
|
}
|
||||||
case Run(_, FileStorageId destId, _, _, long msgId) -> {
|
case Run(_, FileStorageId destId, _, _, _, long msgId) -> {
|
||||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessSpawnerService.ProcessId.EXPORT_TASKS, msgId);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK) {
|
if (rsp.state() != MqMessageState.OK) {
|
||||||
storageService.flagFileForDeletion(destId);
|
storageService.flagFileForDeletion(destId);
|
||||||
@@ -70,7 +70,7 @@ public class ExportSampleDataActor extends RecordActorPrototype {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String describe() {
|
public String describe() {
|
||||||
return "Export RSS/Atom feeds from crawl data";
|
return "Export sample crawl data";
|
||||||
}
|
}
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
|
@@ -10,7 +10,7 @@ import nu.marginalia.mq.outbox.MqOutbox;
|
|||||||
import nu.marginalia.mq.persistence.MqPersistence;
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
||||||
import nu.marginalia.process.ProcessOutboxes;
|
import nu.marginalia.process.ProcessOutboxes;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import nu.marginalia.storage.model.FileStorageId;
|
import nu.marginalia.storage.model.FileStorageId;
|
||||||
import nu.marginalia.storage.model.FileStorageState;
|
import nu.marginalia.storage.model.FileStorageState;
|
||||||
@@ -52,7 +52,7 @@ public class ExportTermFreqActor extends RecordActorPrototype {
|
|||||||
yield new Run(responseMsgId, crawlId, destId, newMsgId);
|
yield new Run(responseMsgId, crawlId, destId, newMsgId);
|
||||||
}
|
}
|
||||||
case Run(long responseMsgId, _, FileStorageId destId, long msgId) -> {
|
case Run(long responseMsgId, _, FileStorageId destId, long msgId) -> {
|
||||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessSpawnerService.ProcessId.EXPORT_TASKS, msgId);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK) {
|
if (rsp.state() != MqMessageState.OK) {
|
||||||
storageService.flagFileForDeletion(destId);
|
storageService.flagFileForDeletion(destId);
|
||||||
|
@@ -13,7 +13,7 @@ import nu.marginalia.mq.MqMessageState;
|
|||||||
import nu.marginalia.mq.outbox.MqOutbox;
|
import nu.marginalia.mq.outbox.MqOutbox;
|
||||||
import nu.marginalia.mqapi.crawling.LiveCrawlRequest;
|
import nu.marginalia.mqapi.crawling.LiveCrawlRequest;
|
||||||
import nu.marginalia.process.ProcessOutboxes;
|
import nu.marginalia.process.ProcessOutboxes;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
@@ -44,7 +44,6 @@ public class LiveCrawlActor extends RecordActorPrototype {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ActorStep transition(ActorStep self) throws Exception {
|
public ActorStep transition(ActorStep self) throws Exception {
|
||||||
logger.info("{}", self);
|
|
||||||
return switch (self) {
|
return switch (self) {
|
||||||
case Initial() -> {
|
case Initial() -> {
|
||||||
yield new Monitor("-");
|
yield new Monitor("-");
|
||||||
@@ -75,7 +74,7 @@ public class LiveCrawlActor extends RecordActorPrototype {
|
|||||||
yield new LiveCrawl(feedsHash, id);
|
yield new LiveCrawl(feedsHash, id);
|
||||||
}
|
}
|
||||||
case LiveCrawl(String feedsHash, long msgId) -> {
|
case LiveCrawl(String feedsHash, long msgId) -> {
|
||||||
var rsp = processWatcher.waitResponse(mqLiveCrawlerOutbox, ProcessService.ProcessId.LIVE_CRAWLER, msgId);
|
var rsp = processWatcher.waitResponse(mqLiveCrawlerOutbox, ProcessSpawnerService.ProcessId.LIVE_CRAWLER, msgId);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK) {
|
if (rsp.state() != MqMessageState.OK) {
|
||||||
yield new Error("Crawler failed");
|
yield new Error("Crawler failed");
|
||||||
|
@@ -0,0 +1,150 @@
|
|||||||
|
package nu.marginalia.actor.task;
|
||||||
|
|
||||||
|
import com.google.gson.Gson;
|
||||||
|
import jakarta.inject.Inject;
|
||||||
|
import jakarta.inject.Singleton;
|
||||||
|
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||||
|
import nu.marginalia.actor.state.ActorStep;
|
||||||
|
import nu.marginalia.io.CrawlerOutputFile;
|
||||||
|
import nu.marginalia.process.log.WorkLog;
|
||||||
|
import nu.marginalia.process.log.WorkLogEntry;
|
||||||
|
import nu.marginalia.service.control.ServiceHeartbeat;
|
||||||
|
import nu.marginalia.slop.SlopCrawlDataRecord;
|
||||||
|
import nu.marginalia.storage.FileStorageService;
|
||||||
|
import nu.marginalia.storage.model.FileStorage;
|
||||||
|
import nu.marginalia.storage.model.FileStorageId;
|
||||||
|
import org.apache.logging.log4j.util.Strings;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.StandardCopyOption;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.function.Function;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
|
public class MigrateCrawlDataActor extends RecordActorPrototype {
|
||||||
|
|
||||||
|
private final FileStorageService fileStorageService;
|
||||||
|
private final ServiceHeartbeat serviceHeartbeat;
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(MigrateCrawlDataActor.class);
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public MigrateCrawlDataActor(Gson gson, FileStorageService fileStorageService, ServiceHeartbeat serviceHeartbeat) {
|
||||||
|
super(gson);
|
||||||
|
|
||||||
|
this.fileStorageService = fileStorageService;
|
||||||
|
this.serviceHeartbeat = serviceHeartbeat;
|
||||||
|
}
|
||||||
|
|
||||||
|
public record Run(long fileStorageId) implements ActorStep {}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ActorStep transition(ActorStep self) throws Exception {
|
||||||
|
return switch (self) {
|
||||||
|
case Run(long fileStorageId) -> {
|
||||||
|
|
||||||
|
FileStorage storage = fileStorageService.getStorage(FileStorageId.of(fileStorageId));
|
||||||
|
Path root = storage.asPath();
|
||||||
|
|
||||||
|
Path crawlerLog = root.resolve("crawler.log");
|
||||||
|
Path newCrawlerLog = Files.createTempFile(root, "crawler", ".migrate.log");
|
||||||
|
|
||||||
|
int totalEntries = WorkLog.countEntries(crawlerLog);
|
||||||
|
|
||||||
|
try (WorkLog workLog = new WorkLog(newCrawlerLog);
|
||||||
|
var heartbeat = serviceHeartbeat.createServiceAdHocTaskHeartbeat("Migrating")
|
||||||
|
) {
|
||||||
|
int entryIdx = 0;
|
||||||
|
|
||||||
|
for (Map.Entry<WorkLogEntry, Path> item : WorkLog.iterableMap(crawlerLog, new CrawlDataLocator(root))) {
|
||||||
|
|
||||||
|
final WorkLogEntry entry = item.getKey();
|
||||||
|
final Path inputPath = item.getValue();
|
||||||
|
|
||||||
|
Path outputPath = inputPath;
|
||||||
|
heartbeat.progress("Migrating" + inputPath.getFileName(), entryIdx++, totalEntries);
|
||||||
|
|
||||||
|
if (inputPath.toString().endsWith(".parquet")) {
|
||||||
|
String domain = entry.id();
|
||||||
|
String id = Integer.toHexString(domain.hashCode());
|
||||||
|
|
||||||
|
outputPath = CrawlerOutputFile.createSlopPath(root, id, domain);
|
||||||
|
|
||||||
|
if (Files.exists(inputPath)) {
|
||||||
|
try {
|
||||||
|
SlopCrawlDataRecord.convertFromParquet(inputPath, outputPath);
|
||||||
|
Files.deleteIfExists(inputPath);
|
||||||
|
} catch (Exception ex) {
|
||||||
|
outputPath = inputPath; // don't update the work log on error
|
||||||
|
logger.error("Failed to convert " + inputPath, ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (!Files.exists(inputPath) && !Files.exists(outputPath)) {
|
||||||
|
// if the input file is missing, and the output file is missing, we just write the log
|
||||||
|
// record identical to the old one
|
||||||
|
outputPath = inputPath;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write a log entry for the (possibly) converted file
|
||||||
|
workLog.setJobToFinished(entry.id(), outputPath.toString(), entry.cnt());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Path oldCrawlerLog = Files.createTempFile(root, "crawler-", ".migrate.old.log");
|
||||||
|
Files.move(crawlerLog, oldCrawlerLog, StandardCopyOption.REPLACE_EXISTING);
|
||||||
|
Files.move(newCrawlerLog, crawlerLog);
|
||||||
|
|
||||||
|
yield new End();
|
||||||
|
}
|
||||||
|
default -> new Error();
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class CrawlDataLocator implements Function<WorkLogEntry, Optional<Map.Entry<WorkLogEntry, Path>>> {
|
||||||
|
|
||||||
|
private final Path crawlRootDir;
|
||||||
|
|
||||||
|
CrawlDataLocator(Path crawlRootDir) {
|
||||||
|
this.crawlRootDir = crawlRootDir;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Optional<Map.Entry<WorkLogEntry, Path>> apply(WorkLogEntry entry) {
|
||||||
|
var path = getCrawledFilePath(crawlRootDir, entry.path());
|
||||||
|
|
||||||
|
if (!Files.exists(path)) {
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
return Optional.of(Map.entry(entry, path));
|
||||||
|
}
|
||||||
|
catch (Exception ex) {
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private Path getCrawledFilePath(Path crawlDir, String fileName) {
|
||||||
|
int sp = fileName.lastIndexOf('/');
|
||||||
|
|
||||||
|
// Normalize the filename
|
||||||
|
if (sp >= 0 && sp + 1< fileName.length())
|
||||||
|
fileName = fileName.substring(sp + 1);
|
||||||
|
if (fileName.length() < 4)
|
||||||
|
fileName = Strings.repeat("0", 4 - fileName.length()) + fileName;
|
||||||
|
|
||||||
|
String sp1 = fileName.substring(0, 2);
|
||||||
|
String sp2 = fileName.substring(2, 4);
|
||||||
|
return crawlDir.resolve(sp1).resolve(sp2).resolve(fileName);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String describe() {
|
||||||
|
return "Migrates crawl data to the latest format";
|
||||||
|
}
|
||||||
|
}
|
@@ -11,7 +11,7 @@ import nu.marginalia.mq.MqMessageState;
|
|||||||
import nu.marginalia.mq.outbox.MqOutbox;
|
import nu.marginalia.mq.outbox.MqOutbox;
|
||||||
import nu.marginalia.mqapi.crawling.CrawlRequest;
|
import nu.marginalia.mqapi.crawling.CrawlRequest;
|
||||||
import nu.marginalia.process.ProcessOutboxes;
|
import nu.marginalia.process.ProcessOutboxes;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import nu.marginalia.storage.model.FileStorageId;
|
import nu.marginalia.storage.model.FileStorageId;
|
||||||
import nu.marginalia.storage.model.FileStorageType;
|
import nu.marginalia.storage.model.FileStorageType;
|
||||||
@@ -51,7 +51,7 @@ public class RecrawlSingleDomainActor extends RecordActorPrototype {
|
|||||||
case Crawl (long msgId) -> {
|
case Crawl (long msgId) -> {
|
||||||
var rsp = processWatcher.waitResponse(
|
var rsp = processWatcher.waitResponse(
|
||||||
mqCrawlerOutbox,
|
mqCrawlerOutbox,
|
||||||
ProcessService.ProcessId.CRAWLER,
|
ProcessSpawnerService.ProcessId.CRAWLER,
|
||||||
msgId);
|
msgId);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK) {
|
if (rsp.state() != MqMessageState.OK) {
|
||||||
|
@@ -9,7 +9,7 @@ import nu.marginalia.mq.MqMessageState;
|
|||||||
import nu.marginalia.mq.outbox.MqOutbox;
|
import nu.marginalia.mq.outbox.MqOutbox;
|
||||||
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
||||||
import nu.marginalia.process.ProcessOutboxes;
|
import nu.marginalia.process.ProcessOutboxes;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@@ -34,7 +34,7 @@ public class TriggerAdjacencyCalculationActor extends RecordActorPrototype {
|
|||||||
yield new Run(newMsgId);
|
yield new Run(newMsgId);
|
||||||
}
|
}
|
||||||
case Run(long msgId) -> {
|
case Run(long msgId) -> {
|
||||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessSpawnerService.ProcessId.EXPORT_TASKS, msgId);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK) {
|
if (rsp.state() != MqMessageState.OK) {
|
||||||
yield new Error("Exporter failed");
|
yield new Error("Exporter failed");
|
||||||
|
@@ -0,0 +1,60 @@
|
|||||||
|
package nu.marginalia.actor.task;
|
||||||
|
|
||||||
|
import com.google.gson.Gson;
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||||
|
import nu.marginalia.actor.state.ActorStep;
|
||||||
|
import nu.marginalia.mq.MqMessageState;
|
||||||
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
|
import nu.marginalia.nsfw.NsfwDomainFilter;
|
||||||
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
|
public class UpdateNsfwFiltersActor extends RecordActorPrototype {
|
||||||
|
private final ServiceConfiguration serviceConfiguration;
|
||||||
|
private final NsfwDomainFilter nsfwDomainFilter;
|
||||||
|
private final MqPersistence persistence;
|
||||||
|
|
||||||
|
public record Initial(long respondMsgId) implements ActorStep {}
|
||||||
|
public record Run(long respondMsgId) implements ActorStep {}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ActorStep transition(ActorStep self) throws Exception {
|
||||||
|
return switch(self) {
|
||||||
|
case Initial(long respondMsgId) -> {
|
||||||
|
if (serviceConfiguration.node() != 1) {
|
||||||
|
persistence.updateMessageState(respondMsgId, MqMessageState.ERR);
|
||||||
|
yield new Error("This actor can only run on node 1");
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
yield new Run(respondMsgId);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case Run(long respondMsgId) -> {
|
||||||
|
nsfwDomainFilter.fetchLists();
|
||||||
|
persistence.updateMessageState(respondMsgId, MqMessageState.OK);
|
||||||
|
yield new End();
|
||||||
|
}
|
||||||
|
default -> new Error();
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String describe() {
|
||||||
|
return "Sync NSFW filters";
|
||||||
|
}
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public UpdateNsfwFiltersActor(Gson gson,
|
||||||
|
ServiceConfiguration serviceConfiguration,
|
||||||
|
NsfwDomainFilter nsfwDomainFilter,
|
||||||
|
MqPersistence persistence)
|
||||||
|
{
|
||||||
|
super(gson);
|
||||||
|
this.serviceConfiguration = serviceConfiguration;
|
||||||
|
this.nsfwDomainFilter = nsfwDomainFilter;
|
||||||
|
this.persistence = persistence;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@@ -49,6 +49,7 @@ public class ExecutorExportGrpcService
|
|||||||
new ExportSampleDataActor.Export(
|
new ExportSampleDataActor.Export(
|
||||||
FileStorageId.of(request.getFileStorageId()),
|
FileStorageId.of(request.getFileStorageId()),
|
||||||
request.getSize(),
|
request.getSize(),
|
||||||
|
request.getCtFilter(),
|
||||||
request.getName()
|
request.getName()
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
|
@@ -10,6 +10,7 @@ import nu.marginalia.actor.state.ActorStateInstance;
|
|||||||
import nu.marginalia.actor.task.DownloadSampleActor;
|
import nu.marginalia.actor.task.DownloadSampleActor;
|
||||||
import nu.marginalia.actor.task.RestoreBackupActor;
|
import nu.marginalia.actor.task.RestoreBackupActor;
|
||||||
import nu.marginalia.actor.task.TriggerAdjacencyCalculationActor;
|
import nu.marginalia.actor.task.TriggerAdjacencyCalculationActor;
|
||||||
|
import nu.marginalia.actor.task.UpdateNsfwFiltersActor;
|
||||||
import nu.marginalia.functions.execution.api.*;
|
import nu.marginalia.functions.execution.api.*;
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
import nu.marginalia.service.server.DiscoverableService;
|
import nu.marginalia.service.server.DiscoverableService;
|
||||||
@@ -263,4 +264,19 @@ public class ExecutorGrpcService
|
|||||||
System.exit(0);
|
System.exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void updateNsfwFilters(RpcUpdateNsfwFilters request, StreamObserver<Empty> responseObserver) {
|
||||||
|
logger.info("Got request {}", request);
|
||||||
|
try {
|
||||||
|
actorControlService.startFrom(ExecutorActor.UPDATE_NSFW_LISTS,
|
||||||
|
new UpdateNsfwFiltersActor.Initial(request.getMsgId()));
|
||||||
|
|
||||||
|
responseObserver.onNext(Empty.getDefaultInstance());
|
||||||
|
responseObserver.onCompleted();
|
||||||
|
}
|
||||||
|
catch (Exception e) {
|
||||||
|
logger.error("Failed to update nsfw filters", e);
|
||||||
|
responseObserver.onError(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@@ -8,6 +8,8 @@ import nu.marginalia.crawl.CrawlerMain;
|
|||||||
import nu.marginalia.index.IndexConstructorMain;
|
import nu.marginalia.index.IndexConstructorMain;
|
||||||
import nu.marginalia.livecrawler.LiveCrawlerMain;
|
import nu.marginalia.livecrawler.LiveCrawlerMain;
|
||||||
import nu.marginalia.loading.LoaderMain;
|
import nu.marginalia.loading.LoaderMain;
|
||||||
|
import nu.marginalia.ndp.NdpMain;
|
||||||
|
import nu.marginalia.ping.PingMain;
|
||||||
import nu.marginalia.service.control.ServiceEventLog;
|
import nu.marginalia.service.control.ServiceEventLog;
|
||||||
import nu.marginalia.service.server.BaseServiceParams;
|
import nu.marginalia.service.server.BaseServiceParams;
|
||||||
import nu.marginalia.task.ExportTasksMain;
|
import nu.marginalia.task.ExportTasksMain;
|
||||||
@@ -27,7 +29,7 @@ import java.util.List;
|
|||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
public class ProcessService {
|
public class ProcessSpawnerService {
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
private final Marker processMarker = MarkerFactory.getMarker("PROCESS");
|
private final Marker processMarker = MarkerFactory.getMarker("PROCESS");
|
||||||
|
|
||||||
@@ -41,6 +43,7 @@ public class ProcessService {
|
|||||||
return switch (id) {
|
return switch (id) {
|
||||||
case "converter" -> ProcessId.CONVERTER;
|
case "converter" -> ProcessId.CONVERTER;
|
||||||
case "crawler" -> ProcessId.CRAWLER;
|
case "crawler" -> ProcessId.CRAWLER;
|
||||||
|
case "ping" -> ProcessId.PING;
|
||||||
case "loader" -> ProcessId.LOADER;
|
case "loader" -> ProcessId.LOADER;
|
||||||
case "export-tasks" -> ProcessId.EXPORT_TASKS;
|
case "export-tasks" -> ProcessId.EXPORT_TASKS;
|
||||||
case "index-constructor" -> ProcessId.INDEX_CONSTRUCTOR;
|
case "index-constructor" -> ProcessId.INDEX_CONSTRUCTOR;
|
||||||
@@ -50,10 +53,12 @@ public class ProcessService {
|
|||||||
|
|
||||||
public enum ProcessId {
|
public enum ProcessId {
|
||||||
CRAWLER(CrawlerMain.class),
|
CRAWLER(CrawlerMain.class),
|
||||||
|
PING(PingMain.class),
|
||||||
LIVE_CRAWLER(LiveCrawlerMain.class),
|
LIVE_CRAWLER(LiveCrawlerMain.class),
|
||||||
CONVERTER(ConverterMain.class),
|
CONVERTER(ConverterMain.class),
|
||||||
LOADER(LoaderMain.class),
|
LOADER(LoaderMain.class),
|
||||||
INDEX_CONSTRUCTOR(IndexConstructorMain.class),
|
INDEX_CONSTRUCTOR(IndexConstructorMain.class),
|
||||||
|
NDP(NdpMain.class),
|
||||||
EXPORT_TASKS(ExportTasksMain.class),
|
EXPORT_TASKS(ExportTasksMain.class),
|
||||||
;
|
;
|
||||||
|
|
||||||
@@ -68,6 +73,8 @@ public class ProcessService {
|
|||||||
case LIVE_CRAWLER -> "LIVE_CRAWLER_PROCESS_OPTS";
|
case LIVE_CRAWLER -> "LIVE_CRAWLER_PROCESS_OPTS";
|
||||||
case CONVERTER -> "CONVERTER_PROCESS_OPTS";
|
case CONVERTER -> "CONVERTER_PROCESS_OPTS";
|
||||||
case LOADER -> "LOADER_PROCESS_OPTS";
|
case LOADER -> "LOADER_PROCESS_OPTS";
|
||||||
|
case PING -> "PING_PROCESS_OPTS";
|
||||||
|
case NDP -> "NDP_PROCESS_OPTS";
|
||||||
case INDEX_CONSTRUCTOR -> "INDEX_CONSTRUCTION_PROCESS_OPTS";
|
case INDEX_CONSTRUCTOR -> "INDEX_CONSTRUCTION_PROCESS_OPTS";
|
||||||
case EXPORT_TASKS -> "EXPORT_TASKS_PROCESS_OPTS";
|
case EXPORT_TASKS -> "EXPORT_TASKS_PROCESS_OPTS";
|
||||||
};
|
};
|
||||||
@@ -81,7 +88,7 @@ public class ProcessService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public ProcessService(BaseServiceParams params) {
|
public ProcessSpawnerService(BaseServiceParams params) {
|
||||||
this.eventLog = params.eventLog;
|
this.eventLog = params.eventLog;
|
||||||
this.node = params.configuration.node();
|
this.node = params.configuration.node();
|
||||||
}
|
}
|
@@ -27,10 +27,12 @@ public class DbBrowseDomainsRandom {
|
|||||||
public List<BrowseResult> getRandomDomains(int count, DomainBlacklist blacklist, int set) {
|
public List<BrowseResult> getRandomDomains(int count, DomainBlacklist blacklist, int set) {
|
||||||
|
|
||||||
final String q = """
|
final String q = """
|
||||||
SELECT DOMAIN_ID, DOMAIN_NAME, INDEXED
|
SELECT EC_RANDOM_DOMAINS.DOMAIN_ID, DOMAIN_NAME, INDEXED
|
||||||
FROM EC_RANDOM_DOMAINS
|
FROM EC_RANDOM_DOMAINS
|
||||||
INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID
|
INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID
|
||||||
|
LEFT JOIN DOMAIN_AVAILABILITY_INFORMATION DAI ON DAI.DOMAIN_ID=EC_RANDOM_DOMAINS.DOMAIN_ID
|
||||||
WHERE STATE<2
|
WHERE STATE<2
|
||||||
|
AND SERVER_AVAILABLE
|
||||||
AND DOMAIN_SET=?
|
AND DOMAIN_SET=?
|
||||||
AND DOMAIN_ALIAS IS NULL
|
AND DOMAIN_ALIAS IS NULL
|
||||||
ORDER BY RAND()
|
ORDER BY RAND()
|
||||||
|
47
code/functions/favicon/api/build.gradle
Normal file
47
code/functions/favicon/api/build.gradle
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
plugins {
    id 'java'

    id "com.google.protobuf" version "0.9.4"
    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
    }
}

jar.archiveBaseName = 'favicon-api'

apply from: "$rootProject.projectDir/protobuf.gradle"
apply from: "$rootProject.projectDir/srcsets.gradle"

dependencies {
    implementation project(':code:common:model')
    implementation project(':code:common:config')
    implementation project(':code:common:service')

    implementation libs.bundles.slf4j

    implementation libs.prometheus
    implementation libs.notnull
    // guava is declared once here; it is excluded from guice and grpc below
    implementation libs.guava
    implementation dependencies.create(libs.guice.get()) {
        exclude group: 'com.google.guava'
    }
    implementation libs.gson
    implementation libs.bundles.protobuf
    libs.bundles.grpc.get().each {
        implementation dependencies.create(it) {
            exclude group: 'com.google.guava'
        }
    }

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito

}
|
@@ -0,0 +1,39 @@
|
|||||||
|
package nu.marginalia.api.favicon;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||||
|
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
|
||||||
|
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||||
|
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
public class FaviconClient {
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(FaviconClient.class);
|
||||||
|
|
||||||
|
private final GrpcMultiNodeChannelPool<FaviconAPIGrpc.FaviconAPIBlockingStub> channelPool;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public FaviconClient(GrpcChannelPoolFactory factory) {
|
||||||
|
this.channelPool = factory.createMulti(
|
||||||
|
ServiceKey.forGrpcApi(FaviconAPIGrpc.class, ServicePartition.multi()),
|
||||||
|
FaviconAPIGrpc::newBlockingStub);
|
||||||
|
}
|
||||||
|
|
||||||
|
public record FaviconData(byte[] bytes, String contentType) {}
|
||||||
|
|
||||||
|
|
||||||
|
public Optional<FaviconData> getFavicon(String domain, int node) {
|
||||||
|
RpcFaviconResponse rsp = channelPool.call(FaviconAPIGrpc.FaviconAPIBlockingStub::getFavicon)
|
||||||
|
.forNode(node)
|
||||||
|
.run(RpcFaviconRequest.newBuilder().setDomain(domain).build());
|
||||||
|
|
||||||
|
if (rsp.getData().isEmpty())
|
||||||
|
return Optional.empty();
|
||||||
|
|
||||||
|
return Optional.of(new FaviconData(rsp.getData().toByteArray(), rsp.getContentType()));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
20
code/functions/favicon/api/src/main/protobuf/favicon.proto
Normal file
20
code/functions/favicon/api/src/main/protobuf/favicon.proto
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
syntax="proto3";
|
||||||
|
package marginalia.api.favicon;
|
||||||
|
|
||||||
|
option java_package="nu.marginalia.api.favicon";
|
||||||
|
option java_multiple_files=true;
|
||||||
|
|
||||||
|
service FaviconAPI {
|
||||||
|
/** Fetches the favicon image data and content type for a domain. */
|
||||||
|
rpc getFavicon(RpcFaviconRequest) returns (RpcFaviconResponse) {}
|
||||||
|
}
|
||||||
|
|
||||||
|
message RpcFaviconRequest {
|
||||||
|
string domain = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
message RpcFaviconResponse {
|
||||||
|
string domain = 1;
|
||||||
|
bytes data = 2;
|
||||||
|
string contentType = 3;
|
||||||
|
}
|
49
code/functions/favicon/build.gradle
Normal file
49
code/functions/favicon/build.gradle
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
plugins {
    id 'java'

    id 'application'
    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
    }
}

apply from: "$rootProject.projectDir/srcsets.gradle"

dependencies {
    implementation project(':code:common:config')
    implementation project(':code:common:service')
    implementation project(':code:common:model')
    implementation project(':code:common:db')
    implementation project(':code:functions:favicon:api')
    implementation project(':code:processes:crawling-process')

    implementation libs.bundles.slf4j

    implementation libs.prometheus
    // guava is declared once here; it is excluded from grpc and guice below
    implementation libs.guava
    libs.bundles.grpc.get().each {
        implementation dependencies.create(it) {
            exclude group: 'com.google.guava'
        }
    }

    implementation libs.notnull
    implementation dependencies.create(libs.guice.get()) {
        exclude group: 'com.google.guava'
    }
    implementation dependencies.create(libs.spark.get()) {
        exclude group: 'org.eclipse.jetty'
    }

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito

}
|
@@ -0,0 +1,48 @@
|
|||||||
|
package nu.marginalia.functions.favicon;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import com.google.protobuf.ByteString;
|
||||||
|
import io.grpc.stub.StreamObserver;
|
||||||
|
import nu.marginalia.api.favicon.FaviconAPIGrpc;
|
||||||
|
import nu.marginalia.api.favicon.RpcFaviconRequest;
|
||||||
|
import nu.marginalia.api.favicon.RpcFaviconResponse;
|
||||||
|
import nu.marginalia.crawl.DomainStateDb;
|
||||||
|
import nu.marginalia.service.server.DiscoverableService;
|
||||||
|
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
|
public class FaviconGrpcService extends FaviconAPIGrpc.FaviconAPIImplBase implements DiscoverableService {
|
||||||
|
private final DomainStateDb domainStateDb;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public FaviconGrpcService(DomainStateDb domainStateDb) {
|
||||||
|
this.domainStateDb = domainStateDb;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean shouldRegisterService() {
|
||||||
|
return domainStateDb.isAvailable();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void getFavicon(RpcFaviconRequest request, StreamObserver<RpcFaviconResponse> responseObserver) {
|
||||||
|
Optional<DomainStateDb.FaviconRecord> icon = domainStateDb.getIcon(request.getDomain());
|
||||||
|
|
||||||
|
RpcFaviconResponse response;
|
||||||
|
if (icon.isEmpty()) {
|
||||||
|
response = RpcFaviconResponse.newBuilder().build();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
var iconRecord = icon.get();
|
||||||
|
response = RpcFaviconResponse.newBuilder()
|
||||||
|
.setContentType(iconRecord.contentType())
|
||||||
|
.setDomain(request.getDomain())
|
||||||
|
.setData(ByteString.copyFrom(iconRecord.imageData()))
|
||||||
|
.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
responseObserver.onNext(response);
|
||||||
|
responseObserver.onCompleted();
|
||||||
|
}
|
||||||
|
}
|
@@ -11,6 +11,7 @@ import nu.marginalia.service.discovery.property.ServicePartition;
|
|||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
|
|
||||||
import javax.annotation.CheckReturnValue;
|
import javax.annotation.CheckReturnValue;
|
||||||
|
import java.time.Duration;
|
||||||
import java.time.Instant;
|
import java.time.Instant;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@@ -59,6 +60,11 @@ public class FeedsClient {
|
|||||||
.forEachRemaining(rsp -> consumer.accept(rsp.getDomain(), new ArrayList<>(rsp.getUrlList())));
|
.forEachRemaining(rsp -> consumer.accept(rsp.getDomain(), new ArrayList<>(rsp.getUrlList())));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean waitReady(Duration duration) throws InterruptedException {
|
||||||
|
return channelPool.awaitChannel(duration);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/** Get the hash of the feed data, for identifying when the data has been updated */
|
/** Get the hash of the feed data, for identifying when the data has been updated */
|
||||||
public String getFeedDataHash() {
|
public String getFeedDataHash() {
|
||||||
return channelPool.call(FeedApiGrpc.FeedApiBlockingStub::getFeedDataHash)
|
return channelPool.call(FeedApiGrpc.FeedApiBlockingStub::getFeedDataHash)
|
||||||
|
@@ -22,17 +22,21 @@ dependencies {
|
|||||||
implementation project(':code:common:db')
|
implementation project(':code:common:db')
|
||||||
implementation project(':code:libraries:blocking-thread-pool')
|
implementation project(':code:libraries:blocking-thread-pool')
|
||||||
implementation project(':code:libraries:message-queue')
|
implementation project(':code:libraries:message-queue')
|
||||||
|
implementation project(':code:libraries:domain-lock')
|
||||||
|
|
||||||
implementation project(':code:execution:api')
|
implementation project(':code:execution:api')
|
||||||
implementation project(':code:processes:crawling-process:ft-content-type')
|
implementation project(':code:processes:crawling-process:ft-content-type')
|
||||||
|
implementation project(':third-party:rssreader')
|
||||||
|
|
||||||
implementation libs.jsoup
|
implementation libs.jsoup
|
||||||
implementation libs.rssreader
|
|
||||||
implementation libs.opencsv
|
implementation libs.opencsv
|
||||||
|
implementation libs.slop
|
||||||
implementation libs.sqlite
|
implementation libs.sqlite
|
||||||
implementation libs.bundles.slf4j
|
implementation libs.bundles.slf4j
|
||||||
implementation libs.commons.lang3
|
implementation libs.commons.lang3
|
||||||
implementation libs.commons.io
|
implementation libs.commons.io
|
||||||
|
implementation libs.httpclient
|
||||||
|
implementation libs.wiremock
|
||||||
|
|
||||||
implementation libs.prometheus
|
implementation libs.prometheus
|
||||||
implementation libs.guava
|
implementation libs.guava
|
||||||
@@ -55,8 +59,6 @@ dependencies {
|
|||||||
implementation libs.bundles.gson
|
implementation libs.bundles.gson
|
||||||
implementation libs.bundles.mariadb
|
implementation libs.bundles.mariadb
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
testImplementation libs.bundles.slf4j.test
|
testImplementation libs.bundles.slf4j.test
|
||||||
testImplementation libs.bundles.junit
|
testImplementation libs.bundles.junit
|
||||||
testImplementation libs.mockito
|
testImplementation libs.mockito
|
||||||
|
@@ -0,0 +1,126 @@
|
|||||||
|
package nu.marginalia.domsample;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import jakarta.inject.Named;
|
||||||
|
import nu.marginalia.domsample.db.DomSampleDb;
|
||||||
|
import nu.marginalia.livecapture.BrowserlessClient;
|
||||||
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.net.URI;
|
||||||
|
import java.net.URISyntaxException;
|
||||||
|
import java.time.Duration;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
public class DomSampleService {
|
||||||
|
private final DomSampleDb db;
|
||||||
|
private final HikariDataSource mariadbDataSource;
|
||||||
|
private final URI browserlessURI;
|
||||||
|
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(DomSampleService.class);
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public DomSampleService(DomSampleDb db,
|
||||||
|
HikariDataSource mariadbDataSource,
|
||||||
|
@Named("browserless-uri") String browserlessAddress,
|
||||||
|
ServiceConfiguration serviceConfiguration)
|
||||||
|
throws URISyntaxException
|
||||||
|
{
|
||||||
|
this.db = db;
|
||||||
|
this.mariadbDataSource = mariadbDataSource;
|
||||||
|
|
||||||
|
if (StringUtils.isEmpty(browserlessAddress) || serviceConfiguration.node() > 1) {
|
||||||
|
logger.warn("Live capture service will not run");
|
||||||
|
browserlessURI = null;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
browserlessURI = new URI(browserlessAddress);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void start() {
|
||||||
|
if (browserlessURI == null) {
|
||||||
|
logger.warn("DomSampleService is not enabled due to missing browserless URI or multi-node configuration");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
Thread.ofPlatform().daemon().start(this::run);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void syncDomains() {
|
||||||
|
Set<String> dbDomains = new HashSet<>();
|
||||||
|
|
||||||
|
logger.info("Fetching domains from database...");
|
||||||
|
|
||||||
|
try (var conn = mariadbDataSource.getConnection();
|
||||||
|
var stmt = conn.prepareStatement("""
|
||||||
|
SELECT DOMAIN_NAME
|
||||||
|
FROM EC_DOMAIN
|
||||||
|
WHERE NODE_AFFINITY>0
|
||||||
|
""")
|
||||||
|
) {
|
||||||
|
var rs = stmt.executeQuery();
|
||||||
|
while (rs.next()) {
|
||||||
|
dbDomains.add(rs.getString("DOMAIN_NAME"));
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new RuntimeException("Failed to sync domains", e);
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info("Found {} domains in database", dbDomains.size());
|
||||||
|
|
||||||
|
db.syncDomains(dbDomains);
|
||||||
|
|
||||||
|
logger.info("Synced domains to sqlite");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void run() {
|
||||||
|
|
||||||
|
try (var client = new BrowserlessClient(browserlessURI)) {
|
||||||
|
|
||||||
|
while (!Thread.currentThread().isInterrupted()) {
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Grace sleep in case we're operating on an empty domain list
|
||||||
|
TimeUnit.SECONDS.sleep(15);
|
||||||
|
|
||||||
|
syncDomains();
|
||||||
|
var domains = db.getScheduledDomains();
|
||||||
|
|
||||||
|
for (var domain : domains) {
|
||||||
|
updateDomain(client, domain);
|
||||||
|
}
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
Thread.currentThread().interrupt();
|
||||||
|
logger.info("DomSampleService interrupted, stopping...");
|
||||||
|
return;
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("Error in DomSampleService run loop", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void updateDomain(BrowserlessClient client, String domain) {
|
||||||
|
var rootUrl = "https://" + domain + "/";
|
||||||
|
try {
|
||||||
|
var content = client.annotatedContent(rootUrl, new BrowserlessClient.GotoOptions("load", Duration.ofSeconds(10).toMillis()));
|
||||||
|
|
||||||
|
if (content.isPresent()) {
|
||||||
|
db.saveSample(domain, rootUrl, content.get());
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("Failed to process domain: " + domain, e);
|
||||||
|
}
|
||||||
|
finally {
|
||||||
|
db.flagDomainAsFetched(domain);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@@ -0,0 +1,174 @@
|
|||||||
|
package nu.marginalia.domsample.db;
|
||||||
|
|
||||||
|
import nu.marginalia.WmsaHome;
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.sql.Connection;
|
||||||
|
import java.sql.DriverManager;
|
||||||
|
import java.sql.SQLException;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
public class DomSampleDb implements AutoCloseable {
|
||||||
|
private static final String dbFileName = "dom-sample.db";
|
||||||
|
private final Connection connection;
|
||||||
|
|
||||||
|
public DomSampleDb() throws SQLException{
|
||||||
|
this(WmsaHome.getDataPath().resolve(dbFileName));
|
||||||
|
}
|
||||||
|
|
||||||
|
public DomSampleDb(Path dbPath) throws SQLException {
|
||||||
|
String dbUrl = "jdbc:sqlite:" + dbPath.toAbsolutePath();
|
||||||
|
|
||||||
|
connection = DriverManager.getConnection(dbUrl);
|
||||||
|
|
||||||
|
try (var stmt = connection.createStatement()) {
|
||||||
|
stmt.executeUpdate("CREATE TABLE IF NOT EXISTS samples (url TEXT PRIMARY KEY, domain TEXT, sample BLOB, requests BLOB, accepted_popover BOOLEAN DEFAULT FALSE)");
|
||||||
|
stmt.executeUpdate("CREATE INDEX IF NOT EXISTS domain_index ON samples (domain)");
|
||||||
|
stmt.executeUpdate("CREATE TABLE IF NOT EXISTS schedule (domain TEXT PRIMARY KEY, last_fetch TIMESTAMP DEFAULT NULL)");
|
||||||
|
stmt.execute("PRAGMA journal_mode=WAL");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void syncDomains(Set<String> domains) {
|
||||||
|
Set<String> currentDomains = new HashSet<>();
|
||||||
|
try (var stmt = connection.prepareStatement("SELECT domain FROM schedule")) {
|
||||||
|
var rs = stmt.executeQuery();
|
||||||
|
while (rs.next()) {
|
||||||
|
currentDomains.add(rs.getString("domain"));
|
||||||
|
}
|
||||||
|
} catch (SQLException e) {
|
||||||
|
throw new RuntimeException("Failed to sync domains", e);
|
||||||
|
}
|
||||||
|
|
||||||
|
Set<String> toRemove = new HashSet<>(currentDomains);
|
||||||
|
Set<String> toAdd = new HashSet<>(domains);
|
||||||
|
|
||||||
|
toRemove.removeAll(domains);
|
||||||
|
toAdd.removeAll(currentDomains);
|
||||||
|
|
||||||
|
try (var removeStmt = connection.prepareStatement("DELETE FROM schedule WHERE domain = ?");
|
||||||
|
var addStmt = connection.prepareStatement("INSERT OR IGNORE INTO schedule (domain) VALUES (?)")
|
||||||
|
) {
|
||||||
|
for (String domain : toRemove) {
|
||||||
|
removeStmt.setString(1, domain);
|
||||||
|
removeStmt.executeUpdate();
|
||||||
|
}
|
||||||
|
|
||||||
|
for (String domain : toAdd) {
|
||||||
|
addStmt.setString(1, domain);
|
||||||
|
addStmt.executeUpdate();
|
||||||
|
}
|
||||||
|
} catch (SQLException e) {
|
||||||
|
throw new RuntimeException("Failed to remove domains", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getScheduledDomains() {
|
||||||
|
List<String> domains = new ArrayList<>();
|
||||||
|
try (var stmt = connection.prepareStatement("SELECT domain FROM schedule ORDER BY last_fetch IS NULL DESC, last_fetch ASC")) {
|
||||||
|
var rs = stmt.executeQuery();
|
||||||
|
while (rs.next()) {
|
||||||
|
domains.add(rs.getString("domain"));
|
||||||
|
}
|
||||||
|
} catch (SQLException e) {
|
||||||
|
throw new RuntimeException("Failed to get scheduled domains", e);
|
||||||
|
}
|
||||||
|
return domains;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void flagDomainAsFetched(String domain) {
|
||||||
|
try (var stmt = connection.prepareStatement("INSERT OR REPLACE INTO schedule (domain, last_fetch) VALUES (?, CURRENT_TIMESTAMP)")) {
|
||||||
|
stmt.setString(1, domain);
|
||||||
|
stmt.executeUpdate();
|
||||||
|
} catch (SQLException e) {
|
||||||
|
throw new RuntimeException("Failed to flag domain as fetched", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public record Sample(String url, String domain, String sample, String requests, boolean acceptedPopover) {}
|
||||||
|
|
||||||
|
public List<Sample> getSamples(String domain) throws SQLException {
|
||||||
|
List<Sample> samples = new ArrayList<>();
|
||||||
|
|
||||||
|
try (var stmt = connection.prepareStatement("""
|
||||||
|
SELECT url, sample, requests, accepted_popover
|
||||||
|
FROM samples
|
||||||
|
WHERE domain = ?
|
||||||
|
"""))
|
||||||
|
{
|
||||||
|
stmt.setString(1, domain);
|
||||||
|
var rs = stmt.executeQuery();
|
||||||
|
while (rs.next()) {
|
||||||
|
samples.add(
|
||||||
|
new Sample(
|
||||||
|
rs.getString("url"),
|
||||||
|
domain,
|
||||||
|
rs.getString("sample"),
|
||||||
|
rs.getString("requests"),
|
||||||
|
rs.getBoolean("accepted_popover")
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return samples;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void saveSample(String domain, String url, String rawContent) throws SQLException {
|
||||||
|
var doc = Jsoup.parse(rawContent);
|
||||||
|
|
||||||
|
var networkRequests = doc.getElementById("marginalia-network-requests");
|
||||||
|
|
||||||
|
boolean acceptedPopover = false;
|
||||||
|
|
||||||
|
StringBuilder requestTsv = new StringBuilder();
|
||||||
|
if (networkRequests != null) {
|
||||||
|
|
||||||
|
acceptedPopover = !networkRequests.getElementsByClass("marginalia-agreed-cookies").isEmpty();
|
||||||
|
|
||||||
|
for (var request : networkRequests.getElementsByClass("network-request")) {
|
||||||
|
String method = request.attr("data-method");
|
||||||
|
String urlAttr = request.attr("data-url");
|
||||||
|
String timestamp = request.attr("data-timestamp");
|
||||||
|
|
||||||
|
requestTsv
|
||||||
|
.append(method)
|
||||||
|
.append('\t')
|
||||||
|
.append(timestamp)
|
||||||
|
.append('\t')
|
||||||
|
.append(urlAttr.replace('\n', ' '))
|
||||||
|
.append("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
networkRequests.remove();
|
||||||
|
}
|
||||||
|
|
||||||
|
doc.body().removeAttr("id");
|
||||||
|
|
||||||
|
String sample = doc.html();
|
||||||
|
|
||||||
|
saveSampleRaw(domain, url, sample, requestTsv.toString().trim(), acceptedPopover);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void saveSampleRaw(String domain, String url, String sample, String requests, boolean acceptedPopover) throws SQLException {
|
||||||
|
try (var stmt = connection.prepareStatement("""
|
||||||
|
INSERT OR REPLACE
|
||||||
|
INTO samples (domain, url, sample, requests, accepted_popover)
|
||||||
|
VALUES (?, ?, ?, ?, ?)
|
||||||
|
""")) {
|
||||||
|
stmt.setString(1, domain);
|
||||||
|
stmt.setString(2, url);
|
||||||
|
stmt.setString(3, sample);
|
||||||
|
stmt.setString(4, requests);
|
||||||
|
stmt.setBoolean(5, acceptedPopover);
|
||||||
|
stmt.executeUpdate();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void close() throws SQLException {
|
||||||
|
connection.close();
|
||||||
|
}
|
||||||
|
}
|
@@ -1,21 +1,28 @@
|
|||||||
package nu.marginalia.livecapture;
|
package nu.marginalia.livecapture;
|
||||||
|
|
||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
|
import nu.marginalia.WmsaHome;
|
||||||
import nu.marginalia.model.gson.GsonFactory;
|
import nu.marginalia.model.gson.GsonFactory;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
|
import java.net.URLEncoder;
|
||||||
import java.net.http.HttpClient;
|
import java.net.http.HttpClient;
|
||||||
import java.net.http.HttpRequest;
|
import java.net.http.HttpRequest;
|
||||||
import java.net.http.HttpResponse;
|
import java.net.http.HttpResponse;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.time.Duration;
|
import java.time.Duration;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
/** Client for local browserless.io API */
|
/** Client for local browserless.io API */
|
||||||
public class BrowserlessClient implements AutoCloseable {
|
public class BrowserlessClient implements AutoCloseable {
|
||||||
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(BrowserlessClient.class);
|
private static final Logger logger = LoggerFactory.getLogger(BrowserlessClient.class);
|
||||||
|
private static final String BROWSERLESS_TOKEN = System.getProperty("live-capture.browserless-token", "BROWSERLESS_TOKEN");
|
||||||
|
|
||||||
private final HttpClient httpClient = HttpClient.newBuilder()
|
private final HttpClient httpClient = HttpClient.newBuilder()
|
||||||
.version(HttpClient.Version.HTTP_1_1)
|
.version(HttpClient.Version.HTTP_1_1)
|
||||||
@@ -25,18 +32,21 @@ public class BrowserlessClient implements AutoCloseable {
|
|||||||
private final URI browserlessURI;
|
private final URI browserlessURI;
|
||||||
private final Gson gson = GsonFactory.get();
|
private final Gson gson = GsonFactory.get();
|
||||||
|
|
||||||
|
private final String userAgent = WmsaHome.getUserAgent().uaString();
|
||||||
|
|
||||||
public BrowserlessClient(URI browserlessURI) {
|
public BrowserlessClient(URI browserlessURI) {
|
||||||
this.browserlessURI = browserlessURI;
|
this.browserlessURI = browserlessURI;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String content(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
|
public Optional<String> content(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
|
||||||
Map<String, Object> requestData = Map.of(
|
Map<String, Object> requestData = Map.of(
|
||||||
"url", url,
|
"url", url,
|
||||||
|
"userAgent", userAgent,
|
||||||
"gotoOptions", gotoOptions
|
"gotoOptions", gotoOptions
|
||||||
);
|
);
|
||||||
|
|
||||||
var request = HttpRequest.newBuilder()
|
var request = HttpRequest.newBuilder()
|
||||||
.uri(browserlessURI.resolve("/content"))
|
.uri(browserlessURI.resolve("/content?token="+BROWSERLESS_TOKEN))
|
||||||
.method("POST", HttpRequest.BodyPublishers.ofString(
|
.method("POST", HttpRequest.BodyPublishers.ofString(
|
||||||
gson.toJson(requestData)
|
gson.toJson(requestData)
|
||||||
))
|
))
|
||||||
@@ -47,10 +57,46 @@ public class BrowserlessClient implements AutoCloseable {
|
|||||||
|
|
||||||
if (rsp.statusCode() >= 300) {
|
if (rsp.statusCode() >= 300) {
|
||||||
logger.info("Failed to fetch content for {}, status {}", url, rsp.statusCode());
|
logger.info("Failed to fetch content for {}, status {}", url, rsp.statusCode());
|
||||||
return null;
|
return Optional.empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
return rsp.body();
|
return Optional.of(rsp.body());
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Fetches content with a marginalia hack extension loaded that decorates the DOM with attributes for
|
||||||
|
* certain CSS attributes, to be able to easier identify popovers and other nuisance elements.
|
||||||
|
*/
|
||||||
|
public Optional<String> annotatedContent(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
|
||||||
|
Map<String, Object> requestData = Map.of(
|
||||||
|
"url", url,
|
||||||
|
"userAgent", userAgent,
|
||||||
|
"gotoOptions", gotoOptions,
|
||||||
|
"waitForSelector", Map.of("selector", "#marginaliahack", "timeout", 15000)
|
||||||
|
);
|
||||||
|
|
||||||
|
// Launch parameters for the browserless instance to load the extension
|
||||||
|
Map<String, Object> launchParameters = Map.of(
|
||||||
|
"args", List.of("--load-extension=/dom-export")
|
||||||
|
);
|
||||||
|
|
||||||
|
String launchParametersStr = URLEncoder.encode(gson.toJson(launchParameters), StandardCharsets.UTF_8);
|
||||||
|
|
||||||
|
var request = HttpRequest.newBuilder()
|
||||||
|
.uri(browserlessURI.resolve("/content?token="+BROWSERLESS_TOKEN+"&launch="+launchParametersStr))
|
||||||
|
.method("POST", HttpRequest.BodyPublishers.ofString(
|
||||||
|
gson.toJson(requestData)
|
||||||
|
))
|
||||||
|
.header("Content-type", "application/json")
|
||||||
|
.build();
|
||||||
|
|
||||||
|
var rsp = httpClient.send(request, HttpResponse.BodyHandlers.ofString());
|
||||||
|
|
||||||
|
if (rsp.statusCode() >= 300) {
|
||||||
|
logger.info("Failed to fetch annotated content for {}, status {}", url, rsp.statusCode());
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
return Optional.of(rsp.body());
|
||||||
}
|
}
|
||||||
|
|
||||||
public byte[] screenshot(String url, GotoOptions gotoOptions, ScreenshotOptions screenshotOptions)
|
public byte[] screenshot(String url, GotoOptions gotoOptions, ScreenshotOptions screenshotOptions)
|
||||||
@@ -58,12 +104,13 @@ public class BrowserlessClient implements AutoCloseable {
|
|||||||
|
|
||||||
Map<String, Object> requestData = Map.of(
|
Map<String, Object> requestData = Map.of(
|
||||||
"url", url,
|
"url", url,
|
||||||
|
"userAgent", userAgent,
|
||||||
"options", screenshotOptions,
|
"options", screenshotOptions,
|
||||||
"gotoOptions", gotoOptions
|
"gotoOptions", gotoOptions
|
||||||
);
|
);
|
||||||
|
|
||||||
var request = HttpRequest.newBuilder()
|
var request = HttpRequest.newBuilder()
|
||||||
.uri(browserlessURI.resolve("/screenshot"))
|
.uri(browserlessURI.resolve("/screenshot?token="+BROWSERLESS_TOKEN))
|
||||||
.method("POST", HttpRequest.BodyPublishers.ofString(
|
.method("POST", HttpRequest.BodyPublishers.ofString(
|
||||||
gson.toJson(requestData)
|
gson.toJson(requestData)
|
||||||
))
|
))
|
||||||
@@ -82,7 +129,7 @@ public class BrowserlessClient implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void close() throws Exception {
|
public void close() {
|
||||||
httpClient.shutdownNow();
|
httpClient.shutdownNow();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -126,7 +126,6 @@ public class LiveCaptureGrpcService
|
|||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
EdgeDomain domain = domainNameOpt.get();
|
EdgeDomain domain = domainNameOpt.get();
|
||||||
String domainNameStr = domain.toString();
|
|
||||||
|
|
||||||
if (!isValidDomainForCapture(domain)) {
|
if (!isValidDomainForCapture(domain)) {
|
||||||
ScreenshotDbOperations.flagDomainAsFetched(conn, domain);
|
ScreenshotDbOperations.flagDomainAsFetched(conn, domain);
|
||||||
|
@@ -1,6 +1,6 @@
|
|||||||
package nu.marginalia.rss.model;
|
package nu.marginalia.rss.model;
|
||||||
|
|
||||||
import com.apptasticsoftware.rssreader.Item;
|
import nu.marginalia.rss.svc.SimpleFeedParser;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.jetbrains.annotations.NotNull;
|
import org.jetbrains.annotations.NotNull;
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
@@ -18,37 +18,33 @@ public record FeedItem(String title,
|
|||||||
public static final int MAX_DESC_LENGTH = 255;
|
public static final int MAX_DESC_LENGTH = 255;
|
||||||
public static final DateTimeFormatter DATE_FORMAT = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
|
public static final DateTimeFormatter DATE_FORMAT = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
|
||||||
|
|
||||||
public static FeedItem fromItem(Item item, boolean keepFragment) {
|
public static FeedItem fromItem(SimpleFeedParser.ItemData item, boolean keepFragment) {
|
||||||
String title = item.getTitle().orElse("");
|
String title = item.title();
|
||||||
String date = getItemDate(item);
|
String date = getItemDate(item);
|
||||||
String description = getItemDescription(item);
|
String description = getItemDescription(item);
|
||||||
String url;
|
String url;
|
||||||
|
|
||||||
if (keepFragment || item.getLink().isEmpty()) {
|
if (keepFragment) {
|
||||||
url = item.getLink().orElse("");
|
url = item.url();
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
try {
|
try {
|
||||||
String link = item.getLink().get();
|
String link = item.url();
|
||||||
var linkUri = new URI(link);
|
var linkUri = new URI(link);
|
||||||
var cleanUri = new URI(linkUri.getScheme(), linkUri.getAuthority(), linkUri.getPath(), linkUri.getQuery(), null);
|
var cleanUri = new URI(linkUri.getScheme(), linkUri.getAuthority(), linkUri.getPath(), linkUri.getQuery(), null);
|
||||||
url = cleanUri.toString();
|
url = cleanUri.toString();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
// fallback to original link if we can't clean it, this is not a very important step
|
// fallback to original link if we can't clean it, this is not a very important step
|
||||||
url = item.getLink().get();
|
url = item.url();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return new FeedItem(title, date, description, url);
|
return new FeedItem(title, date, description, url);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String getItemDescription(Item item) {
|
private static String getItemDescription(SimpleFeedParser.ItemData item) {
|
||||||
Optional<String> description = item.getDescription();
|
String rawDescription = item.description();
|
||||||
if (description.isEmpty())
|
|
||||||
return "";
|
|
||||||
|
|
||||||
String rawDescription = description.get();
|
|
||||||
if (rawDescription.indexOf('<') >= 0) {
|
if (rawDescription.indexOf('<') >= 0) {
|
||||||
rawDescription = Jsoup.parseBodyFragment(rawDescription).text();
|
rawDescription = Jsoup.parseBodyFragment(rawDescription).text();
|
||||||
}
|
}
|
||||||
@@ -58,15 +54,18 @@ public record FeedItem(String title,
|
|||||||
|
|
||||||
// e.g. http://fabiensanglard.net/rss.xml does dates like this: 1 Apr 2021 00:00:00 +0000
|
// e.g. http://fabiensanglard.net/rss.xml does dates like this: 1 Apr 2021 00:00:00 +0000
|
||||||
private static final DateTimeFormatter extraFormatter = DateTimeFormatter.ofPattern("d MMM yyyy HH:mm:ss Z");
|
private static final DateTimeFormatter extraFormatter = DateTimeFormatter.ofPattern("d MMM yyyy HH:mm:ss Z");
|
||||||
private static String getItemDate(Item item) {
|
private static String getItemDate(SimpleFeedParser.ItemData item) {
|
||||||
Optional<ZonedDateTime> zonedDateTime = Optional.empty();
|
Optional<ZonedDateTime> zonedDateTime = Optional.empty();
|
||||||
try {
|
try {
|
||||||
zonedDateTime = item.getPubDateZonedDateTime();
|
zonedDateTime = item.getPubDateZonedDateTime();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
zonedDateTime = item.getPubDate()
|
try {
|
||||||
.map(extraFormatter::parse)
|
zonedDateTime = Optional.of(ZonedDateTime.from(extraFormatter.parse(item.pubDate())));
|
||||||
.map(ZonedDateTime::from);
|
}
|
||||||
|
catch (Exception e2) {
|
||||||
|
// ignore
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return zonedDateTime.map(date -> date.format(DATE_FORMAT)).orElse("");
|
return zonedDateTime.map(date -> date.format(DATE_FORMAT)).orElse("");
|
||||||
|
@@ -1,66 +0,0 @@
|
|||||||
package nu.marginalia.rss.svc;
|
|
||||||
|
|
||||||
import nu.marginalia.model.EdgeDomain;
|
|
||||||
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
|
||||||
import java.util.concurrent.Semaphore;
|
|
||||||
|
|
||||||
/** Holds lock objects for each domain, to prevent multiple threads from
|
|
||||||
* crawling the same domain at the same time.
|
|
||||||
*/
|
|
||||||
public class DomainLocks {
|
|
||||||
// The locks are stored in a map, with the domain name as the key. This map will grow
|
|
||||||
// relatively big, but should be manageable since the number of domains is limited to
|
|
||||||
// a few hundred thousand typically.
|
|
||||||
private final Map<String, Semaphore> locks = new ConcurrentHashMap<>();
|
|
||||||
|
|
||||||
/** Returns a lock object corresponding to the given domain. The object is returned as-is,
|
|
||||||
* and may be held by another thread. The caller is responsible for locking and releasing the lock.
|
|
||||||
*/
|
|
||||||
public DomainLock lockDomain(EdgeDomain domain) throws InterruptedException {
|
|
||||||
return new DomainLock(domain.toString(),
|
|
||||||
locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits));
|
|
||||||
}
|
|
||||||
|
|
||||||
private Semaphore defaultPermits(String topDomain) {
|
|
||||||
if (topDomain.equals("wordpress.com"))
|
|
||||||
return new Semaphore(16);
|
|
||||||
if (topDomain.equals("blogspot.com"))
|
|
||||||
return new Semaphore(8);
|
|
||||||
|
|
||||||
if (topDomain.equals("neocities.org"))
|
|
||||||
return new Semaphore(4);
|
|
||||||
if (topDomain.equals("github.io"))
|
|
||||||
return new Semaphore(4);
|
|
||||||
|
|
||||||
if (topDomain.equals("substack.com")) {
|
|
||||||
return new Semaphore(1);
|
|
||||||
}
|
|
||||||
if (topDomain.endsWith(".edu")) {
|
|
||||||
return new Semaphore(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
return new Semaphore(2);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static class DomainLock implements AutoCloseable {
|
|
||||||
private final String domainName;
|
|
||||||
private final Semaphore semaphore;
|
|
||||||
|
|
||||||
DomainLock(String domainName, Semaphore semaphore) throws InterruptedException {
|
|
||||||
this.domainName = domainName;
|
|
||||||
this.semaphore = semaphore;
|
|
||||||
|
|
||||||
Thread.currentThread().setName("fetching:" + domainName + " [await domain lock]");
|
|
||||||
semaphore.acquire();
|
|
||||||
Thread.currentThread().setName("fetching:" + domainName);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void close() {
|
|
||||||
semaphore.release();
|
|
||||||
Thread.currentThread().setName("fetching:" + domainName + " [wrapping up]");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
@@ -1,12 +1,12 @@
|
|||||||
package nu.marginalia.rss.svc;
|
package nu.marginalia.rss.svc;
|
||||||
|
|
||||||
import com.apptasticsoftware.rssreader.Item;
|
|
||||||
import com.apptasticsoftware.rssreader.RssReader;
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.opencsv.CSVReader;
|
import com.opencsv.CSVReader;
|
||||||
import nu.marginalia.WmsaHome;
|
import nu.marginalia.WmsaHome;
|
||||||
import nu.marginalia.contenttype.ContentType;
|
import nu.marginalia.contenttype.ContentType;
|
||||||
import nu.marginalia.contenttype.DocumentBodyToString;
|
import nu.marginalia.contenttype.DocumentBodyToString;
|
||||||
|
import nu.marginalia.coordination.DomainCoordinator;
|
||||||
|
import nu.marginalia.coordination.DomainLock;
|
||||||
import nu.marginalia.executor.client.ExecutorClient;
|
import nu.marginalia.executor.client.ExecutorClient;
|
||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.nodecfg.NodeConfigurationService;
|
import nu.marginalia.nodecfg.NodeConfigurationService;
|
||||||
@@ -20,23 +20,39 @@ import nu.marginalia.storage.FileStorageService;
|
|||||||
import nu.marginalia.storage.model.FileStorage;
|
import nu.marginalia.storage.model.FileStorage;
|
||||||
import nu.marginalia.storage.model.FileStorageType;
|
import nu.marginalia.storage.model.FileStorageType;
|
||||||
import nu.marginalia.util.SimpleBlockingThreadPool;
|
import nu.marginalia.util.SimpleBlockingThreadPool;
|
||||||
import org.apache.commons.io.input.BOMInputStream;
|
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
|
||||||
|
import org.apache.hc.client5.http.classic.HttpClient;
|
||||||
|
import org.apache.hc.client5.http.config.ConnectionConfig;
|
||||||
|
import org.apache.hc.client5.http.config.RequestConfig;
|
||||||
|
import org.apache.hc.client5.http.cookie.StandardCookieSpec;
|
||||||
|
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
||||||
|
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManagerBuilder;
|
||||||
|
import org.apache.hc.core5.http.Header;
|
||||||
|
import org.apache.hc.core5.http.HeaderElement;
|
||||||
|
import org.apache.hc.core5.http.HeaderElements;
|
||||||
|
import org.apache.hc.core5.http.HttpResponse;
|
||||||
|
import org.apache.hc.core5.http.io.SocketConfig;
|
||||||
|
import org.apache.hc.core5.http.io.entity.EntityUtils;
|
||||||
|
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
|
||||||
|
import org.apache.hc.core5.http.message.MessageSupport;
|
||||||
|
import org.apache.hc.core5.http.protocol.HttpContext;
|
||||||
|
import org.apache.hc.core5.util.TimeValue;
|
||||||
|
import org.apache.hc.core5.util.Timeout;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import javax.annotation.Nullable;
|
import javax.annotation.Nullable;
|
||||||
import java.io.ByteArrayInputStream;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
import java.net.http.HttpClient;
|
|
||||||
import java.net.http.HttpRequest;
|
|
||||||
import java.net.http.HttpResponse;
|
|
||||||
import java.nio.charset.StandardCharsets;
|
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
import java.time.*;
|
import java.time.Instant;
|
||||||
|
import java.time.LocalDateTime;
|
||||||
|
import java.time.ZoneId;
|
||||||
|
import java.time.ZonedDateTime;
|
||||||
import java.time.format.DateTimeFormatter;
|
import java.time.format.DateTimeFormatter;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
import java.util.concurrent.Executors;
|
import java.util.concurrent.Executors;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
@@ -48,20 +64,21 @@ public class FeedFetcherService {
|
|||||||
private static final int MAX_FEED_ITEMS = 10;
|
private static final int MAX_FEED_ITEMS = 10;
|
||||||
private static final Logger logger = LoggerFactory.getLogger(FeedFetcherService.class);
|
private static final Logger logger = LoggerFactory.getLogger(FeedFetcherService.class);
|
||||||
|
|
||||||
private final RssReader rssReader = new RssReader();
|
|
||||||
|
|
||||||
private final FeedDb feedDb;
|
private final FeedDb feedDb;
|
||||||
private final FileStorageService fileStorageService;
|
private final FileStorageService fileStorageService;
|
||||||
private final NodeConfigurationService nodeConfigurationService;
|
private final NodeConfigurationService nodeConfigurationService;
|
||||||
private final ServiceHeartbeat serviceHeartbeat;
|
private final ServiceHeartbeat serviceHeartbeat;
|
||||||
private final ExecutorClient executorClient;
|
private final ExecutorClient executorClient;
|
||||||
|
|
||||||
private final DomainLocks domainLocks = new DomainLocks();
|
private final DomainCoordinator domainCoordinator;
|
||||||
|
|
||||||
|
private final HttpClient httpClient;
|
||||||
|
|
||||||
private volatile boolean updating;
|
private volatile boolean updating;
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public FeedFetcherService(FeedDb feedDb,
|
public FeedFetcherService(FeedDb feedDb,
|
||||||
|
DomainCoordinator domainCoordinator,
|
||||||
FileStorageService fileStorageService,
|
FileStorageService fileStorageService,
|
||||||
NodeConfigurationService nodeConfigurationService,
|
NodeConfigurationService nodeConfigurationService,
|
||||||
ServiceHeartbeat serviceHeartbeat,
|
ServiceHeartbeat serviceHeartbeat,
|
||||||
@@ -72,23 +89,90 @@ public class FeedFetcherService {
|
|||||||
this.nodeConfigurationService = nodeConfigurationService;
|
this.nodeConfigurationService = nodeConfigurationService;
|
||||||
this.serviceHeartbeat = serviceHeartbeat;
|
this.serviceHeartbeat = serviceHeartbeat;
|
||||||
this.executorClient = executorClient;
|
this.executorClient = executorClient;
|
||||||
|
this.domainCoordinator = domainCoordinator;
|
||||||
|
|
||||||
|
final ConnectionConfig connectionConfig = ConnectionConfig.custom()
|
||||||
|
.setSocketTimeout(15, TimeUnit.SECONDS)
|
||||||
|
.setConnectTimeout(15, TimeUnit.SECONDS)
|
||||||
|
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
|
||||||
|
.build();
|
||||||
|
|
||||||
|
|
||||||
// Add support for some alternate date tags for atom
|
var connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
|
||||||
rssReader.addItemExtension("issued", this::setDateFallback);
|
.setMaxConnPerRoute(2)
|
||||||
rssReader.addItemExtension("created", this::setDateFallback);
|
.setMaxConnTotal(50)
|
||||||
}
|
.setDefaultConnectionConfig(connectionConfig)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
connectionManager.setDefaultSocketConfig(SocketConfig.custom()
|
||||||
|
.setSoLinger(TimeValue.ofSeconds(-1))
|
||||||
|
.setSoTimeout(Timeout.ofSeconds(10))
|
||||||
|
.build()
|
||||||
|
);
|
||||||
|
|
||||||
|
Thread.ofPlatform().daemon(true).start(() -> {
|
||||||
|
try {
|
||||||
|
for (;;) {
|
||||||
|
TimeUnit.SECONDS.sleep(15);
|
||||||
|
logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (InterruptedException e) {
|
||||||
|
Thread.currentThread().interrupt();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
final RequestConfig defaultRequestConfig = RequestConfig.custom()
|
||||||
|
.setCookieSpec(StandardCookieSpec.IGNORE)
|
||||||
|
.setResponseTimeout(10, TimeUnit.SECONDS)
|
||||||
|
.setConnectionRequestTimeout(5, TimeUnit.MINUTES)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
httpClient = HttpClients.custom()
|
||||||
|
.setDefaultRequestConfig(defaultRequestConfig)
|
||||||
|
.setConnectionManager(connectionManager)
|
||||||
|
.setUserAgent(WmsaHome.getUserAgent().uaIdentifier())
|
||||||
|
.setConnectionManager(connectionManager)
|
||||||
|
.setKeepAliveStrategy(new ConnectionKeepAliveStrategy() {
|
||||||
|
// Default keep-alive duration is 3 minutes, but this is too long for us,
|
||||||
|
// as we are either going to re-use it fairly quickly or close it for a long time.
|
||||||
|
//
|
||||||
|
// So we set it to 30 seconds or clamp the server-provided value to a minimum of 10 seconds.
|
||||||
|
private static final TimeValue defaultValue = TimeValue.ofSeconds(30);
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TimeValue getKeepAliveDuration(HttpResponse response, HttpContext context) {
|
||||||
|
final Iterator<HeaderElement> it = MessageSupport.iterate(response, HeaderElements.KEEP_ALIVE);
|
||||||
|
|
||||||
|
while (it.hasNext()) {
|
||||||
|
final HeaderElement he = it.next();
|
||||||
|
final String param = he.getName();
|
||||||
|
final String value = he.getValue();
|
||||||
|
|
||||||
|
if (value == null)
|
||||||
|
continue;
|
||||||
|
if (!"timeout".equalsIgnoreCase(param))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
try {
|
||||||
|
long timeout = Long.parseLong(value);
|
||||||
|
timeout = Math.clamp(timeout, 30, defaultValue.toSeconds());
|
||||||
|
return TimeValue.ofSeconds(timeout);
|
||||||
|
} catch (final NumberFormatException ignore) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return defaultValue;
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.build();
|
||||||
|
|
||||||
private void setDateFallback(Item item, String value) {
|
|
||||||
if (item.getPubDate().isEmpty()) {
|
|
||||||
item.setPubDate(value);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public enum UpdateMode {
|
public enum UpdateMode {
|
||||||
CLEAN,
|
CLEAN,
|
||||||
REFRESH
|
REFRESH
|
||||||
};
|
}
|
||||||
|
|
||||||
public void updateFeeds(UpdateMode updateMode) throws IOException {
|
public void updateFeeds(UpdateMode updateMode) throws IOException {
|
||||||
if (updating) // Prevent concurrent updates
|
if (updating) // Prevent concurrent updates
|
||||||
@@ -96,13 +180,10 @@ public class FeedFetcherService {
|
|||||||
throw new IllegalStateException("Already updating feeds, refusing to start another update");
|
throw new IllegalStateException("Already updating feeds, refusing to start another update");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
try (FeedDbWriter writer = feedDb.createWriter();
|
try (FeedDbWriter writer = feedDb.createWriter();
|
||||||
HttpClient client = HttpClient.newBuilder()
|
ExecutorService fetchExecutor = Executors.newVirtualThreadPerTaskExecutor();
|
||||||
.connectTimeout(Duration.ofSeconds(15))
|
FeedJournal feedJournal = FeedJournal.create();
|
||||||
.executor(Executors.newCachedThreadPool())
|
|
||||||
.followRedirects(HttpClient.Redirect.NORMAL)
|
|
||||||
.version(HttpClient.Version.HTTP_2)
|
|
||||||
.build();
|
|
||||||
var heartbeat = serviceHeartbeat.createServiceAdHocTaskHeartbeat("Update Rss Feeds")
|
var heartbeat = serviceHeartbeat.createServiceAdHocTaskHeartbeat("Update Rss Feeds")
|
||||||
) {
|
) {
|
||||||
updating = true;
|
updating = true;
|
||||||
@@ -145,8 +226,9 @@ public class FeedFetcherService {
|
|||||||
};
|
};
|
||||||
|
|
||||||
FetchResult feedData;
|
FetchResult feedData;
|
||||||
try (DomainLocks.DomainLock domainLock = domainLocks.lockDomain(new EdgeDomain(feed.domain()))) {
|
try (DomainLock domainLock = domainCoordinator.lockDomain(new EdgeDomain(feed.domain()))) {
|
||||||
feedData = fetchFeedData(feed, client, ifModifiedSinceDate, ifNoneMatchTag);
|
feedData = fetchFeedData(feed, fetchExecutor, ifModifiedSinceDate, ifNoneMatchTag);
|
||||||
|
TimeUnit.SECONDS.sleep(1); // Sleep before we yield the lock to avoid hammering the server from multiple processes
|
||||||
} catch (Exception ex) {
|
} catch (Exception ex) {
|
||||||
feedData = new FetchResult.TransientError();
|
feedData = new FetchResult.TransientError();
|
||||||
}
|
}
|
||||||
@@ -155,6 +237,8 @@ public class FeedFetcherService {
|
|||||||
case FetchResult.Success(String value, String etag) -> {
|
case FetchResult.Success(String value, String etag) -> {
|
||||||
writer.saveEtag(feed.domain(), etag);
|
writer.saveEtag(feed.domain(), etag);
|
||||||
writer.saveFeed(parseFeed(value, feed));
|
writer.saveFeed(parseFeed(value, feed));
|
||||||
|
|
||||||
|
feedJournal.record(feed.feedUrl(), value);
|
||||||
}
|
}
|
||||||
case FetchResult.NotModified() -> {
|
case FetchResult.NotModified() -> {
|
||||||
writer.saveEtag(feed.domain(), ifNoneMatchTag);
|
writer.saveEtag(feed.domain(), ifNoneMatchTag);
|
||||||
@@ -223,57 +307,70 @@ public class FeedFetcherService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private FetchResult fetchFeedData(FeedDefinition feed,
|
private FetchResult fetchFeedData(FeedDefinition feed,
|
||||||
HttpClient client,
|
ExecutorService executorService,
|
||||||
@Nullable String ifModifiedSinceDate,
|
@Nullable String ifModifiedSinceDate,
|
||||||
@Nullable String ifNoneMatchTag)
|
@Nullable String ifNoneMatchTag)
|
||||||
{
|
{
|
||||||
try {
|
try {
|
||||||
URI uri = new URI(feed.feedUrl());
|
URI uri = new URI(feed.feedUrl());
|
||||||
|
|
||||||
HttpRequest.Builder requestBuilder = HttpRequest.newBuilder()
|
var requestBuilder = ClassicRequestBuilder.get(uri)
|
||||||
.GET()
|
.setHeader("User-Agent", WmsaHome.getUserAgent().uaIdentifier())
|
||||||
.uri(uri)
|
.setHeader("Accept-Encoding", "gzip")
|
||||||
.header("User-Agent", WmsaHome.getUserAgent().uaIdentifier())
|
.setHeader("Accept", "text/*, */*;q=0.9");
|
||||||
.header("Accept-Encoding", "gzip")
|
|
||||||
.header("Accept", "text/*, */*;q=0.9")
|
|
||||||
.timeout(Duration.ofSeconds(15))
|
|
||||||
;
|
|
||||||
|
|
||||||
if (ifModifiedSinceDate != null) {
|
|
||||||
requestBuilder.header("If-Modified-Since", ifModifiedSinceDate);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
// Set the If-Modified-Since or If-None-Match headers if we have them
|
||||||
|
// though since there are certain idiosyncrasies in server implementations,
|
||||||
|
// we avoid setting both at the same time as that may turn a 304 into a 200.
|
||||||
if (ifNoneMatchTag != null) {
|
if (ifNoneMatchTag != null) {
|
||||||
requestBuilder.header("If-None-Match", ifNoneMatchTag);
|
requestBuilder.addHeader("If-None-Match", ifNoneMatchTag);
|
||||||
|
} else if (ifModifiedSinceDate != null) {
|
||||||
|
requestBuilder.addHeader("If-Modified-Since", ifModifiedSinceDate);
|
||||||
}
|
}
|
||||||
|
|
||||||
HttpRequest getRequest = requestBuilder.build();
|
return httpClient.execute(requestBuilder.build(), rsp -> {
|
||||||
|
try {
|
||||||
|
logger.info("Code: {}, URL: {}", rsp.getCode(), uri);
|
||||||
|
|
||||||
for (int i = 0; i < 3; i++) {
|
switch (rsp.getCode()) {
|
||||||
HttpResponse<byte[]> rs = client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray());
|
case 200 -> {
|
||||||
|
if (rsp.getEntity() == null) {
|
||||||
|
return new FetchResult.TransientError(); // No content to read, treat as transient error
|
||||||
|
}
|
||||||
|
byte[] responseData = EntityUtils.toByteArray(rsp.getEntity());
|
||||||
|
|
||||||
if (rs.statusCode() == 429) { // Too Many Requests
|
// Decode the response body based on the Content-Type header
|
||||||
int retryAfter = Integer.parseInt(rs.headers().firstValue("Retry-After").orElse("2"));
|
Header contentTypeHeader = rsp.getFirstHeader("Content-Type");
|
||||||
Thread.sleep(Duration.ofSeconds(Math.clamp(retryAfter, 1, 5)));
|
if (contentTypeHeader == null) {
|
||||||
continue;
|
return new FetchResult.TransientError();
|
||||||
}
|
}
|
||||||
|
String contentType = contentTypeHeader.getValue();
|
||||||
|
String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), responseData);
|
||||||
|
|
||||||
String newEtagValue = rs.headers().firstValue("ETag").orElse("");
|
// Grab the ETag header if it exists
|
||||||
|
Header etagHeader = rsp.getFirstHeader("ETag");
|
||||||
|
String newEtagValue = etagHeader == null ? null : etagHeader.getValue();
|
||||||
|
|
||||||
return switch (rs.statusCode()) {
|
return new FetchResult.Success(bodyText, newEtagValue);
|
||||||
case 200 -> {
|
}
|
||||||
byte[] responseData = getResponseData(rs);
|
case 304 -> {
|
||||||
|
return new FetchResult.NotModified(); // via If-Modified-Since semantics
|
||||||
String contentType = rs.headers().firstValue("Content-Type").orElse("");
|
}
|
||||||
String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), responseData);
|
case 404 -> {
|
||||||
|
return new FetchResult.PermanentError(); // never try again
|
||||||
yield new FetchResult.Success(bodyText, newEtagValue);
|
}
|
||||||
|
default -> {
|
||||||
|
return new FetchResult.TransientError(); // we try again later
|
||||||
|
}
|
||||||
}
|
}
|
||||||
case 304 -> new FetchResult.NotModified(); // via If-Modified-Since semantics
|
}
|
||||||
case 404 -> new FetchResult.PermanentError(); // never try again
|
catch (Exception ex) {
|
||||||
default -> new FetchResult.TransientError(); // we try again later
|
return new FetchResult.PermanentError(); // treat as permanent error
|
||||||
};
|
}
|
||||||
}
|
finally {
|
||||||
|
EntityUtils.consumeQuietly(rsp.getEntity());
|
||||||
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
logger.debug("Error fetching feed", ex);
|
logger.debug("Error fetching feed", ex);
|
||||||
@@ -282,19 +379,6 @@ public class FeedFetcherService {
|
|||||||
return new FetchResult.TransientError();
|
return new FetchResult.TransientError();
|
||||||
}
|
}
|
||||||
|
|
||||||
private byte[] getResponseData(HttpResponse<byte[]> response) throws IOException {
|
|
||||||
String encoding = response.headers().firstValue("Content-Encoding").orElse("");
|
|
||||||
|
|
||||||
if ("gzip".equals(encoding)) {
|
|
||||||
try (var stream = new GZIPInputStream(new ByteArrayInputStream(response.body()))) {
|
|
||||||
return stream.readAllBytes();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
return response.body();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public sealed interface FetchResult {
|
public sealed interface FetchResult {
|
||||||
record Success(String value, String etag) implements FetchResult {}
|
record Success(String value, String etag) implements FetchResult {}
|
||||||
record NotModified() implements FetchResult {}
|
record NotModified() implements FetchResult {}
|
||||||
@@ -367,12 +451,7 @@ public class FeedFetcherService {
|
|||||||
|
|
||||||
public FeedItems parseFeed(String feedData, FeedDefinition definition) {
|
public FeedItems parseFeed(String feedData, FeedDefinition definition) {
|
||||||
try {
|
try {
|
||||||
feedData = sanitizeEntities(feedData);
|
List<SimpleFeedParser.ItemData> rawItems = SimpleFeedParser.parse(feedData);
|
||||||
|
|
||||||
List<Item> rawItems = rssReader.read(
|
|
||||||
// Massage the data to maximize the possibility of the flaky XML parser consuming it
|
|
||||||
new BOMInputStream(new ByteArrayInputStream(feedData.trim().getBytes(StandardCharsets.UTF_8)), false)
|
|
||||||
).toList();
|
|
||||||
|
|
||||||
boolean keepUriFragment = rawItems.size() < 2 || areFragmentsDisparate(rawItems);
|
boolean keepUriFragment = rawItems.size() < 2 || areFragmentsDisparate(rawItems);
|
||||||
|
|
||||||
@@ -395,33 +474,6 @@ public class FeedFetcherService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final Map<String, String> HTML_ENTITIES = Map.of(
|
|
||||||
"»", "»",
|
|
||||||
"«", "«",
|
|
||||||
"—", "--",
|
|
||||||
"–", "-",
|
|
||||||
"’", "'",
|
|
||||||
"‘", "'",
|
|
||||||
""", "\"",
|
|
||||||
" ", ""
|
|
||||||
);
|
|
||||||
|
|
||||||
/** The XML parser will blow up if you insert HTML entities in the feed XML,
|
|
||||||
* which is unfortunately relatively common. Replace them as far as is possible
|
|
||||||
* with their corresponding characters
|
|
||||||
*/
|
|
||||||
static String sanitizeEntities(String feedData) {
|
|
||||||
String result = feedData;
|
|
||||||
for (Map.Entry<String, String> entry : HTML_ENTITIES.entrySet()) {
|
|
||||||
result = result.replace(entry.getKey(), entry.getValue());
|
|
||||||
}
|
|
||||||
|
|
||||||
// Handle lone ampersands not part of a recognized XML entity
|
|
||||||
result = result.replaceAll("&(?!(amp|lt|gt|apos|quot);)", "&");
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Decide whether to keep URI fragments in the feed items.
|
/** Decide whether to keep URI fragments in the feed items.
|
||||||
* <p></p>
|
* <p></p>
|
||||||
* We keep fragments if there are multiple different fragments in the items.
|
* We keep fragments if there are multiple different fragments in the items.
|
||||||
@@ -429,16 +481,16 @@ public class FeedFetcherService {
|
|||||||
* @param items The items to check
|
* @param items The items to check
|
||||||
* @return True if we should keep the fragments, false otherwise
|
* @return True if we should keep the fragments, false otherwise
|
||||||
*/
|
*/
|
||||||
private boolean areFragmentsDisparate(List<Item> items) {
|
private boolean areFragmentsDisparate(List<SimpleFeedParser.ItemData> items) {
|
||||||
Set<String> seenFragments = new HashSet<>();
|
Set<String> seenFragments = new HashSet<>();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
for (var item : items) {
|
for (var item : items) {
|
||||||
if (item.getLink().isEmpty()) {
|
if (item.url().isBlank()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
var link = item.getLink().get();
|
var link = item.url();
|
||||||
if (!link.contains("#")) {
|
if (!link.contains("#")) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@@ -0,0 +1,76 @@
|
|||||||
|
package nu.marginalia.rss.svc;
|
||||||
|
|
||||||
|
import nu.marginalia.WmsaHome;
|
||||||
|
import nu.marginalia.slop.SlopTable;
|
||||||
|
import nu.marginalia.slop.column.string.StringColumn;
|
||||||
|
import nu.marginalia.slop.desc.StorageType;
|
||||||
|
import org.apache.commons.io.FileUtils;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.function.BiConsumer;
|
||||||
|
|
||||||
|
/** Utility for recording fetched feeds to a journal, useful in debugging feed parser issues.
|
||||||
|
*/
|
||||||
|
public interface FeedJournal extends AutoCloseable {
|
||||||
|
StringColumn urlColumn = new StringColumn("url");
|
||||||
|
StringColumn contentsColumn = new StringColumn("contents", StandardCharsets.UTF_8, StorageType.ZSTD);
|
||||||
|
|
||||||
|
void record(String url, String contents) throws IOException;
|
||||||
|
void close() throws IOException;
|
||||||
|
|
||||||
|
|
||||||
|
static FeedJournal create() throws IOException {
|
||||||
|
if (Boolean.getBoolean("feedFetcher.persistJournal")) {
|
||||||
|
Path journalPath = WmsaHome.getDataPath().resolve("feed-journal");
|
||||||
|
if (Files.isDirectory(journalPath)) {
|
||||||
|
FileUtils.deleteDirectory(journalPath.toFile());
|
||||||
|
}
|
||||||
|
Files.createDirectories(journalPath);
|
||||||
|
return new RecordingFeedJournal(journalPath);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return new NoOpFeedJournal();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
class NoOpFeedJournal implements FeedJournal {
|
||||||
|
@Override
|
||||||
|
public void record(String url, String contents) {}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() {}
|
||||||
|
}
|
||||||
|
|
||||||
|
class RecordingFeedJournal extends SlopTable implements FeedJournal {
|
||||||
|
|
||||||
|
private final StringColumn.Writer urlWriter;
|
||||||
|
private final StringColumn.Writer contentsWriter;
|
||||||
|
|
||||||
|
public RecordingFeedJournal(Path path) throws IOException {
|
||||||
|
super(path, SlopTable.getNumPages(path, FeedJournal.urlColumn));
|
||||||
|
|
||||||
|
urlWriter = urlColumn.create(this);
|
||||||
|
contentsWriter = contentsColumn.create(this);
|
||||||
|
}
|
||||||
|
|
||||||
|
public synchronized void record(String url, String contents) throws IOException {
|
||||||
|
urlWriter.put(url);
|
||||||
|
contentsWriter.put(contents);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void replay(Path journalPath, BiConsumer<String, String> urlAndContent) throws IOException {
|
||||||
|
try (SlopTable table = new SlopTable(journalPath)) {
|
||||||
|
final StringColumn.Reader urlReader = urlColumn.open(table);
|
||||||
|
final StringColumn.Reader contentsReader = contentsColumn.open(table);
|
||||||
|
|
||||||
|
while (urlReader.hasRemaining()) {
|
||||||
|
urlAndContent.accept(urlReader.get(), contentsReader.get());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
@@ -0,0 +1,102 @@
|
|||||||
|
package nu.marginalia.rss.svc;
|
||||||
|
|
||||||
|
import com.apptasticsoftware.rssreader.DateTimeParser;
|
||||||
|
import com.apptasticsoftware.rssreader.util.Default;
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
import org.jsoup.parser.Parser;
|
||||||
|
|
||||||
|
import java.time.ZonedDateTime;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
public class SimpleFeedParser {
|
||||||
|
|
||||||
|
private static final DateTimeParser dateTimeParser = Default.getDateTimeParser();
|
||||||
|
|
||||||
|
public record ItemData (
|
||||||
|
String title,
|
||||||
|
String description,
|
||||||
|
String url,
|
||||||
|
String pubDate
|
||||||
|
) {
|
||||||
|
public boolean isWellFormed() {
|
||||||
|
return title != null && !title.isBlank() &&
|
||||||
|
description != null && !description.isBlank() &&
|
||||||
|
url != null && !url.isBlank() &&
|
||||||
|
pubDate != null && !pubDate.isBlank();
|
||||||
|
}
|
||||||
|
|
||||||
|
public Optional<ZonedDateTime> getPubDateZonedDateTime() {
|
||||||
|
try {
|
||||||
|
return Optional.ofNullable(dateTimeParser.parse(pubDate()));
|
||||||
|
}
|
||||||
|
catch (Exception e) {
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public static List<ItemData> parse(String content) {
|
||||||
|
var doc = Jsoup.parse(content, Parser.xmlParser());
|
||||||
|
List<ItemData> ret = new ArrayList<>();
|
||||||
|
|
||||||
|
doc.select("item, entry").forEach(element -> {
|
||||||
|
String link = "";
|
||||||
|
String title = "";
|
||||||
|
String description = "";
|
||||||
|
String pubDate = "";
|
||||||
|
|
||||||
|
for (String attr : List.of("title", "dc:title")) {
|
||||||
|
if (!title.isBlank())
|
||||||
|
break;
|
||||||
|
var tag = element.getElementsByTag(attr).first();
|
||||||
|
if (tag != null) {
|
||||||
|
title = tag.text();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (String attr : List.of("title", "summary", "content", "description", "dc:description")) {
|
||||||
|
if (!description.isBlank())
|
||||||
|
break;
|
||||||
|
var tag = element.getElementsByTag(attr).first();
|
||||||
|
if (tag != null) {
|
||||||
|
description = tag.text();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (String attr : List.of("pubDate", "published", "updated", "issued", "created", "dc:date")) {
|
||||||
|
if (!pubDate.isBlank())
|
||||||
|
break;
|
||||||
|
var tag = element.getElementsByTag(attr).first();
|
||||||
|
if (tag != null) {
|
||||||
|
pubDate = tag.text();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (String attr : List.of("link", "url")) {
|
||||||
|
if (!link.isBlank())
|
||||||
|
break;
|
||||||
|
var tag = element.getElementsByTag(attr).first();
|
||||||
|
|
||||||
|
if (tag != null) {
|
||||||
|
String linkText = tag.text();
|
||||||
|
|
||||||
|
if (linkText.isBlank()) {
|
||||||
|
linkText = tag.attr("href");
|
||||||
|
}
|
||||||
|
|
||||||
|
link = linkText;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
ret.add(new ItemData(title, description, link, pubDate));
|
||||||
|
});
|
||||||
|
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
27
code/functions/live-capture/test-resources/nlnet.atom
Normal file
27
code/functions/live-capture/test-resources/nlnet.atom
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
<feed xmlns="http://www.w3.org/2005/Atom" xml:base="https://nlnet.nl">
|
||||||
|
<title type="text">NLnet news</title>
|
||||||
|
<updated>2025-01-01T00:00:00Z</updated>
|
||||||
|
<id>https://nlnet.nl/feed.atom</id>
|
||||||
|
<link rel="self" type="application/atom+xml" href="https://nlnet.nl/feed.atom"/>
|
||||||
|
<entry>
|
||||||
|
<id>https://nlnet.nl/news/2025/20250101-announcing-grantees-June-call.html</id>
|
||||||
|
<author>
|
||||||
|
<name>NLnet</name>
|
||||||
|
</author>
|
||||||
|
<title type="xhtml">
|
||||||
|
<div xmlns="http://www.w3.org/1999/xhtml">50 Free and Open Source Projects Selected for NGI Zero grants</div>
|
||||||
|
</title>
|
||||||
|
<link href="/news/2025/20250101-announcing-grantees-June-call.html"/>
|
||||||
|
<updated>2025-01-01T00:00:00Z</updated>
|
||||||
|
<content type="xhtml">
|
||||||
|
<div xmlns="http://www.w3.org/1999/xhtml">
|
||||||
|
<p class="paralead">Happy 2025 everyone! On this first day of the fresh new year we are happy to announce 50 project teams were selected to receive NGI Zero grants. We are welcoming projects from 18 countries involving people and organisations of various types: individuals, associations, small and medium enterprises, foundations, universities, and informal collectives. The new projects are all across the different layers of the NGI technology stack: from trustworthy open hardware to services & applications which provide autonomy for end-users.</p>
|
||||||
|
<p>The 50 free and open source projects were selected across two funds. 19 teams will receive grants from the <a href="/commonsfund/">NGI Zero Commons Fund</a>, a broadly themed fund that supports people working on reclaiming the public nature of the internet. The other 31 projects will work within <a href="/core/">NGI Zero Core</a> which focuses on strengthening the open internet architecture. Both funds offer financial and practical support. The latter consisting of <a href="/NGI0/services/">support services</a> such as accessibility and security audits, advice on license compliance, help with testing, documentation or UX design.</p>
|
||||||
|
<h2>If you applied for a grant</h2>
|
||||||
|
<p>This is the selection for the <a href="https://nlnet.nl/news/2024/20240401-call.html">June call</a>. We always inform <em>all</em> applicants about the outcome of the review ahead of the public announcement, if the are selected or not. If you have not heard anything, you probably applied to a later call that is still under review. You can see which call you applied to by checking the application number assigned to the project when you applied. The second number in the sequence refers to the month of the call, so 06 in the case of the June call. (It should not happen, but if you did apply to the June call and did not hear anything, do contact us.)</p>
|
||||||
|
<h2>Meet the new projects!</h2>
|
||||||
|
</div>
|
||||||
|
</content>
|
||||||
|
</entry>
|
||||||
|
|
||||||
|
</feed>
|
@@ -0,0 +1,113 @@
|
|||||||
|
package nu.marginalia.domsample.db;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.AfterEach;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.testcontainers.shaded.org.apache.commons.io.FileUtils;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
|
class DomSampleDbTest {
|
||||||
|
Path tempDir;
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
void setUp() throws Exception {
|
||||||
|
tempDir = Files.createTempDirectory("test");
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterEach
|
||||||
|
void tearDown() throws IOException {
|
||||||
|
FileUtils.deleteDirectory(tempDir.toFile());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSetUp() {
|
||||||
|
var dbPath = tempDir.resolve("test.db");
|
||||||
|
try (var db = new DomSampleDb(dbPath)) {
|
||||||
|
}
|
||||||
|
catch (Exception e) {
|
||||||
|
fail("Failed to set up database: " + e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSyncDomains() {
|
||||||
|
var dbPath = tempDir.resolve("test.db");
|
||||||
|
try (var db = new DomSampleDb(dbPath)) {
|
||||||
|
|
||||||
|
db.syncDomains(Set.of("example.com", "test.com", "foobar.com"));
|
||||||
|
assertEquals(Set.of("example.com", "test.com", "foobar.com"), new HashSet<>(db.getScheduledDomains()));
|
||||||
|
db.syncDomains(Set.of("example.com", "test.com"));
|
||||||
|
assertEquals(Set.of("example.com", "test.com"), new HashSet<>(db.getScheduledDomains()));
|
||||||
|
db.syncDomains(Set.of("foobar.com", "test.com"));
|
||||||
|
assertEquals(Set.of("foobar.com", "test.com"), new HashSet<>(db.getScheduledDomains()));
|
||||||
|
}
|
||||||
|
catch (Exception e) {
|
||||||
|
fail("Failed to sync domains: " + e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testFetchDomains() {
|
||||||
|
var dbPath = tempDir.resolve("test.db");
|
||||||
|
try (var db = new DomSampleDb(dbPath)) {
|
||||||
|
|
||||||
|
db.syncDomains(Set.of("example.com", "test.com", "foobar.com"));
|
||||||
|
db.flagDomainAsFetched("example.com");
|
||||||
|
db.flagDomainAsFetched("test.com");
|
||||||
|
db.flagDomainAsFetched("foobar.com");
|
||||||
|
assertEquals(List.of("example.com", "test.com", "foobar.com"), db.getScheduledDomains());
|
||||||
|
db.flagDomainAsFetched("test.com");
|
||||||
|
assertEquals(List.of("example.com", "foobar.com", "test.com"), db.getScheduledDomains());
|
||||||
|
}
|
||||||
|
catch (Exception e) {
|
||||||
|
fail("Failed to sync domains: " + e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void saveLoadSingle() {
|
||||||
|
var dbPath = tempDir.resolve("test.db");
|
||||||
|
try (var db = new DomSampleDb(dbPath)) {
|
||||||
|
db.saveSampleRaw("example.com", "http://example.com/sample", "sample data", "requests data", true);
|
||||||
|
var samples = db.getSamples("example.com");
|
||||||
|
assertEquals(1, samples.size());
|
||||||
|
var sample = samples.getFirst();
|
||||||
|
assertEquals("example.com", sample.domain());
|
||||||
|
assertEquals("http://example.com/sample", sample.url());
|
||||||
|
assertEquals("sample data", sample.sample());
|
||||||
|
assertEquals("requests data", sample.requests());
|
||||||
|
assertTrue(sample.acceptedPopover());
|
||||||
|
}
|
||||||
|
catch (Exception e) {
|
||||||
|
fail("Failed to save/load sample: " + e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void saveLoadTwo() {
|
||||||
|
var dbPath = tempDir.resolve("test.db");
|
||||||
|
try (var db = new DomSampleDb(dbPath)) {
|
||||||
|
db.saveSampleRaw("example.com", "http://example.com/sample", "sample data", "r1", true);
|
||||||
|
db.saveSampleRaw("example.com", "http://example.com/sample2", "sample data2", "r2", false);
|
||||||
|
var samples = db.getSamples("example.com");
|
||||||
|
assertEquals(2, samples.size());
|
||||||
|
|
||||||
|
Map<String, String> samplesByUrl = new HashMap<>();
|
||||||
|
for (var sample : samples) {
|
||||||
|
samplesByUrl.put(sample.url(), sample.sample());
|
||||||
|
}
|
||||||
|
|
||||||
|
assertEquals("sample data", samplesByUrl.get("http://example.com/sample"));
|
||||||
|
assertEquals("sample data2", samplesByUrl.get("http://example.com/sample2"));
|
||||||
|
}
|
||||||
|
catch (Exception e) {
|
||||||
|
fail("Failed to save/load sample: " + e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@@ -1,36 +1,137 @@
|
|||||||
package nu.marginalia.livecapture;
|
package nu.marginalia.livecapture;
|
||||||
|
|
||||||
|
import com.github.tomakehurst.wiremock.WireMockServer;
|
||||||
|
import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
|
||||||
|
import nu.marginalia.WmsaHome;
|
||||||
|
import nu.marginalia.domsample.db.DomSampleDb;
|
||||||
|
import nu.marginalia.service.module.ServiceConfigurationModule;
|
||||||
import org.junit.jupiter.api.Assertions;
|
import org.junit.jupiter.api.Assertions;
|
||||||
import org.junit.jupiter.api.BeforeAll;
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
|
import org.junit.jupiter.api.Tag;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import org.testcontainers.containers.GenericContainer;
|
import org.testcontainers.containers.GenericContainer;
|
||||||
|
import org.testcontainers.images.PullPolicy;
|
||||||
import org.testcontainers.junit.jupiter.Testcontainers;
|
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||||
import org.testcontainers.utility.DockerImageName;
|
import org.testcontainers.utility.DockerImageName;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import static com.github.tomakehurst.wiremock.client.WireMock.*;
|
||||||
|
|
||||||
|
|
||||||
@Testcontainers
|
@Testcontainers
|
||||||
|
@Tag("slow")
|
||||||
public class BrowserlessClientTest {
|
public class BrowserlessClientTest {
|
||||||
static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome")).withExposedPorts(3000);
|
// Run gradle docker if this image is not available
|
||||||
|
static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("marginalia-browserless"))
|
||||||
|
.withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
|
||||||
|
.withImagePullPolicy(PullPolicy.defaultPolicy())
|
||||||
|
.withNetworkMode("bridge")
|
||||||
|
.withLogConsumer(frame -> {
|
||||||
|
System.out.print(frame.getUtf8String());
|
||||||
|
})
|
||||||
|
.withExposedPorts(3000);
|
||||||
|
|
||||||
|
static WireMockServer wireMockServer =
|
||||||
|
new WireMockServer(WireMockConfiguration.wireMockConfig()
|
||||||
|
.port(18089));
|
||||||
|
|
||||||
|
static String localIp;
|
||||||
|
|
||||||
|
static URI browserlessURI;
|
||||||
|
static URI browserlessWssURI;
|
||||||
|
|
||||||
@BeforeAll
|
@BeforeAll
|
||||||
public static void setup() {
|
public static void setup() throws IOException {
|
||||||
container.start();
|
container.start();
|
||||||
|
|
||||||
|
browserlessURI = URI.create(String.format("http://%s:%d/",
|
||||||
|
container.getHost(),
|
||||||
|
container.getMappedPort(3000))
|
||||||
|
);
|
||||||
|
|
||||||
|
browserlessWssURI = URI.create(String.format("ws://%s:%d/?token=BROWSERLESS_TOKEN",
|
||||||
|
container.getHost(),
|
||||||
|
container.getMappedPort(3000))
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
wireMockServer.start();
|
||||||
|
wireMockServer.stubFor(get("/").willReturn(aResponse().withStatus(200).withBody("Ok")));
|
||||||
|
|
||||||
|
localIp = ServiceConfigurationModule.getLocalNetworkIP();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Tag("flaky")
|
||||||
|
@Test
|
||||||
|
public void testInspectContentUA__Flaky() throws Exception {
|
||||||
|
try (var client = new BrowserlessClient(browserlessURI)) {
|
||||||
|
client.content("http://" + localIp + ":18089/",
|
||||||
|
BrowserlessClient.GotoOptions.defaultValues()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
wireMockServer.verify(getRequestedFor(urlEqualTo("/")).withHeader("User-Agent", equalTo(WmsaHome.getUserAgent().uaString())));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Tag("flaky")
|
||||||
|
@Test
|
||||||
|
public void testInspectScreenshotUA__Flaky() throws Exception {
|
||||||
|
try (var client = new BrowserlessClient(browserlessURI)) {
|
||||||
|
client.screenshot("http://" + localIp + ":18089/",
|
||||||
|
BrowserlessClient.GotoOptions.defaultValues(),
|
||||||
|
BrowserlessClient.ScreenshotOptions.defaultValues()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
wireMockServer.verify(getRequestedFor(urlEqualTo("/")).withHeader("User-Agent", equalTo(WmsaHome.getUserAgent().uaString())));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testContent() throws Exception {
|
public void testContent() throws Exception {
|
||||||
try (var client = new BrowserlessClient(URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000)))) {
|
try (var client = new BrowserlessClient(browserlessURI)) {
|
||||||
var content = client.content("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues());
|
var content = client.content("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues()).orElseThrow();
|
||||||
Assertions.assertNotNull(content, "Content should not be null");
|
|
||||||
Assertions.assertFalse(content.isBlank(), "Content should not be empty");
|
Assertions.assertFalse(content.isBlank(), "Content should not be empty");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testAnnotatedContent() throws Exception {
|
||||||
|
|
||||||
|
try (var client = new BrowserlessClient(browserlessURI);
|
||||||
|
DomSampleDb dbop = new DomSampleDb(Path.of("/tmp/dom-sample.db"))
|
||||||
|
) {
|
||||||
|
var content = client.annotatedContent("https://marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues()).orElseThrow();
|
||||||
|
dbop.saveSample("marginalia.nu", "https://marginalia.nu/", content);
|
||||||
|
System.out.println(content);
|
||||||
|
Assertions.assertFalse(content.isBlank(), "Content should not be empty");
|
||||||
|
|
||||||
|
dbop.getSamples("marginalia.nu").forEach(sample -> {
|
||||||
|
System.out.println("Sample URL: " + sample.url());
|
||||||
|
System.out.println("Sample Content: " + sample.sample());
|
||||||
|
System.out.println("Sample Requests: " + sample.requests());
|
||||||
|
System.out.println("Accepted Popover: " + sample.acceptedPopover());
|
||||||
|
});
|
||||||
|
}
|
||||||
|
finally {
|
||||||
|
Files.deleteIfExists(Path.of("/tmp/dom-sample.db"));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testScreenshot() throws Exception {
|
public void testScreenshot() throws Exception {
|
||||||
try (var client = new BrowserlessClient(URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000)))) {
|
try (var client = new BrowserlessClient(browserlessURI)) {
|
||||||
var screenshot = client.screenshot("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues(), BrowserlessClient.ScreenshotOptions.defaultValues());
|
var screenshot = client.screenshot("https://www.marginalia.nu/",
|
||||||
|
BrowserlessClient.GotoOptions.defaultValues(),
|
||||||
|
BrowserlessClient.ScreenshotOptions.defaultValues());
|
||||||
|
|
||||||
Assertions.assertNotNull(screenshot, "Screenshot should not be null");
|
Assertions.assertNotNull(screenshot, "Screenshot should not be null");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -5,6 +5,8 @@ import com.google.inject.Guice;
|
|||||||
import com.google.inject.name.Names;
|
import com.google.inject.name.Names;
|
||||||
import com.zaxxer.hikari.HikariConfig;
|
import com.zaxxer.hikari.HikariConfig;
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import nu.marginalia.coordination.DomainCoordinator;
|
||||||
|
import nu.marginalia.coordination.LocalDomainCoordinator;
|
||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.rss.db.FeedDb;
|
import nu.marginalia.rss.db.FeedDb;
|
||||||
import nu.marginalia.rss.model.FeedItems;
|
import nu.marginalia.rss.model.FeedItems;
|
||||||
@@ -82,6 +84,7 @@ class FeedFetcherServiceTest extends AbstractModule {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public void configure() {
|
public void configure() {
|
||||||
|
bind(DomainCoordinator.class).to(LocalDomainCoordinator.class);
|
||||||
bind(HikariDataSource.class).toInstance(dataSource);
|
bind(HikariDataSource.class).toInstance(dataSource);
|
||||||
bind(ServiceRegistryIf.class).toInstance(Mockito.mock(ServiceRegistryIf.class));
|
bind(ServiceRegistryIf.class).toInstance(Mockito.mock(ServiceRegistryIf.class));
|
||||||
bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration(ServiceId.Executor, 1, "", "", 0, UUID.randomUUID()));
|
bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration(ServiceId.Executor, 1, "", "", 0, UUID.randomUUID()));
|
||||||
|
@@ -1,30 +0,0 @@
|
|||||||
package nu.marginalia.rss.svc;
|
|
||||||
|
|
||||||
import org.junit.jupiter.api.Assertions;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
|
|
||||||
public class TestXmlSanitization {
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testPreservedEntities() {
|
|
||||||
Assertions.assertEquals("&", FeedFetcherService.sanitizeEntities("&"));
|
|
||||||
Assertions.assertEquals("<", FeedFetcherService.sanitizeEntities("<"));
|
|
||||||
Assertions.assertEquals(">", FeedFetcherService.sanitizeEntities(">"));
|
|
||||||
Assertions.assertEquals("'", FeedFetcherService.sanitizeEntities("'"));
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testStrayAmpersand() {
|
|
||||||
Assertions.assertEquals("Bed & Breakfast", FeedFetcherService.sanitizeEntities("Bed & Breakfast"));
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testTranslatedHtmlEntity() {
|
|
||||||
Assertions.assertEquals("Foo -- Bar", FeedFetcherService.sanitizeEntities("Foo — Bar"));
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testTranslatedHtmlEntityQuot() {
|
|
||||||
Assertions.assertEquals("\"Bob\"", FeedFetcherService.sanitizeEntities(""Bob""));
|
|
||||||
}
|
|
||||||
}
|
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user