Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-10-06 07:32:38 +02:00.
Compare commits: deploy-003 ... deploy-024 (374 commits).
Changed files:

ROADMAP.md (55 changes):

```diff
@@ -1,4 +1,4 @@
-# Roadmap 2024-2025
+# Roadmap 2025
 
 This is a roadmap with major features planned for Marginalia Search.
 
@@ -30,12 +30,6 @@ Retaining the ability to independently crawl the web is still strongly desirable
 The search engine has a bit of a problem showing spicy content mixed in with the results. It would be desirable to have a way to filter this out. It's likely something like a URL blacklist (e.g. [UT1](https://dsi.ut-capitole.fr/blacklists/index_en.php) )
 combined with naive bayesian filter would go a long way, or something more sophisticated...?
 
-## Web Design Overhaul
-
-The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
-
-In progress: PR [#127](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/127) -- demo available at https://test.marginalia.nu/
-
 ## Additional Language Support
 
 It would be desirable if the search engine supported more languages than English. This is partially about
@@ -44,14 +38,6 @@ associated with each language added, at least a models file or two, as well as s
 
 It would be very helpful to find a speaker of a large language other than English to help in the fine tuning.
 
-## Support for binary formats like PDF
-
-The crawler needs to be modified to retain them, and the conversion logic needs to parse them.
-The documents database probably should have some sort of flag indicating it's a PDF as well.
-
-PDF parsing is known to be a bit of a security liability so some thought needs to be put in
-that direction as well.
-
 ## Custom ranking logic
 
 Stract does an interesting thing where they have configurable search filters.
@@ -62,8 +48,39 @@ filter for any API consumer.
 
 I've talked to the stract dev and he does not think it's a good idea to mimic their optics language, which is quite ad-hoc, but instead to work together to find some new common description language for this.
 
+## Show favicons next to search results
+
+This is expected from search engines. Basic proof of concept sketch of fetching this data has been done, but the feature is some way from being reality.
+
+## Specialized crawler for github
+
+One of the search engine's biggest limitations right now is that it does not index github at all. A specialized crawler that fetches at least the readme.md would go a long way toward providing search capabilities in this domain.
+
 # Completed
 
+## Support for binary formats like PDF (COMPLETED 2025-05)
+
+The crawler needs to be modified to retain them, and the conversion logic needs to parse them.
+The documents database probably should have some sort of flag indicating it's a PDF as well.
+
+PDF parsing is known to be a bit of a security liability so some thought needs to be put in
+that direction as well.
+
+## Web Design Overhaul (COMPLETED 2025-01)
+
+The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
+
+PR [#127](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/127)
+
+## Finalize RSS support (COMPLETED 2024-11)
+
+Marginalia has experimental RSS preview support for a few domains. This works well and
+it should be extended to all domains. It would also be interesting to offer search of the
+RSS data itself, or use the RSS set to feed a special live index that updates faster than the
+main dataset.
+
+Completed with PR [#122](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/122) and PR [#125](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/125)
+
 ## Proper Position Index (COMPLETED 2024-09)
 
 The search engine uses a fixed width bit mask to indicate word positions. It has the benefit
@@ -76,11 +93,3 @@ list, as is the civilized way of doing this.
 
 Completed with PR [#99](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/99)
 
-## Finalize RSS support (COMPLETED 2024-11)
-
-Marginalia has experimental RSS preview support for a few domains. This works well and
-it should be extended to all domains. It would also be interesting to offer search of the
-RSS data itself, or use the RSS set to feed a special live index that updates faster than the
-main dataset.
-
-Completed with PR [#122](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/122) and PR [#125](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/125)
```
build.gradle:

```diff
@@ -5,7 +5,7 @@ plugins {
 
     // This is a workaround for a bug in the Jib plugin that causes it to stall randomly
     // https://github.com/GoogleContainerTools/jib/issues/3347
-    id 'com.google.cloud.tools.jib' version '3.4.3' apply(false)
+    id 'com.google.cloud.tools.jib' version '3.4.5' apply(false)
 }
 
 group 'marginalia'
@@ -43,12 +43,11 @@ subprojects.forEach {it ->
 }
 
 ext {
-    jvmVersion=23
-    dockerImageBase='container-registry.oracle.com/graalvm/jdk:23'
+    jvmVersion = 24
+    dockerImageBase='container-registry.oracle.com/graalvm/jdk:24'
     dockerImageTag='latest'
     dockerImageRegistry='marginalia'
-    jibVersion = '3.4.3'
-
+    jibVersion = '3.4.5'
 }
 
 idea {
```
LanguageModels.java:

```diff
@@ -24,58 +24,4 @@ public class LanguageModels {
         this.fasttextLanguageModel = fasttextLanguageModel;
         this.segments = segments;
     }
-
-    public static LanguageModelsBuilder builder() {
-        return new LanguageModelsBuilder();
-    }
-
-    public static class LanguageModelsBuilder {
-        private Path termFrequencies;
-        private Path openNLPSentenceDetectionData;
-        private Path posRules;
-        private Path posDict;
-        private Path fasttextLanguageModel;
-        private Path segments;
-
-        LanguageModelsBuilder() {
-        }
-
-        public LanguageModelsBuilder termFrequencies(Path termFrequencies) {
-            this.termFrequencies = termFrequencies;
-            return this;
-        }
-
-        public LanguageModelsBuilder openNLPSentenceDetectionData(Path openNLPSentenceDetectionData) {
-            this.openNLPSentenceDetectionData = openNLPSentenceDetectionData;
-            return this;
-        }
-
-        public LanguageModelsBuilder posRules(Path posRules) {
-            this.posRules = posRules;
-            return this;
-        }
-
-        public LanguageModelsBuilder posDict(Path posDict) {
-            this.posDict = posDict;
-            return this;
-        }
-
-        public LanguageModelsBuilder fasttextLanguageModel(Path fasttextLanguageModel) {
-            this.fasttextLanguageModel = fasttextLanguageModel;
-            return this;
-        }
-
-        public LanguageModelsBuilder segments(Path segments) {
-            this.segments = segments;
-            return this;
-        }
-
-        public LanguageModels build() {
-            return new LanguageModels(this.termFrequencies, this.openNLPSentenceDetectionData, this.posRules, this.posDict, this.fasttextLanguageModel, this.segments);
-        }
-
-        public String toString() {
-            return "LanguageModels.LanguageModelsBuilder(termFrequencies=" + this.termFrequencies + ", openNLPSentenceDetectionData=" + this.openNLPSentenceDetectionData + ", posRules=" + this.posRules + ", posDict=" + this.posDict + ", fasttextLanguageModel=" + this.fasttextLanguageModel + ", segments=" + this.segments + ")";
-        }
-    }
 }
```
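With the builder gone, call sites presumably invoke the six-argument constructor directly; its parameter order is visible in the removed build() method. A minimal sketch, with invented model paths and an assumed import path:

```java
import nu.marginalia.LanguageModels; // package assumed, not shown in the diff

import java.nio.file.Path;

class LanguageModelsConstruction {
    // Hypothetical call site; all paths are purely illustrative.
    LanguageModels models = new LanguageModels(
            Path.of("models/tfreq.bin"),            // termFrequencies
            Path.of("models/opennlp-sentence.bin"), // openNLPSentenceDetectionData
            Path.of("models/pos-rules.txt"),        // posRules
            Path.of("models/pos-dict.bin"),         // posDict
            Path.of("models/lid.176.bin"),          // fasttextLanguageModel
            Path.of("models/segments.bin"));        // segments
}
```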
UserAgent.java (nu.marginalia):

```diff
@@ -1,3 +1,8 @@
 package nu.marginalia;
 
+/**
+ * A record representing a User Agent.
+ * @param uaString - the header value of the User Agent
+ * @param uaIdentifier - what we look for in robots.txt
+ */
 public record UserAgent(String uaString, String uaIdentifier) {}
```
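The two fields serve different consumers: uaString goes out on the wire as the User-Agent header, while uaIdentifier is the token matched against User-agent groups in robots.txt. A sketch with illustrative values, not necessarily Marginalia's actual strings:

```java
import nu.marginalia.UserAgent;

class UserAgentSketch {
    // Hypothetical values for illustration only.
    UserAgent ua = new UserAgent(
            "Mozilla/5.0 (compatible; MarginaliaBot; +https://marginalia.nu)", // sent as the User-Agent header
            "search.marginalia.nu");                                           // matched against robots.txt rules
}
```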
DbDomainQueries.java:

```diff
@@ -20,7 +20,11 @@ public class DbDomainQueries {
     private final HikariDataSource dataSource;
 
     private static final Logger logger = LoggerFactory.getLogger(DbDomainQueries.class);
 
     private final Cache<EdgeDomain, Integer> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
+    private final Cache<EdgeDomain, DomainIdWithNode> domainWithNodeCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
+    private final Cache<Integer, EdgeDomain> domainNameCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
+    private final Cache<String, List<DomainWithNode>> siblingsCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
 
     @Inject
     public DbDomainQueries(HikariDataSource dataSource)
@@ -30,16 +34,21 @@ public class DbDomainQueries {
 
 
     public Integer getDomainId(EdgeDomain domain) throws NoSuchElementException {
-        try (var connection = dataSource.getConnection()) {
+        try {
             return domainIdCache.get(domain, () -> {
-                try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
+                try (var connection = dataSource.getConnection();
+                     var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
+
                     stmt.setString(1, domain.toString());
                     var rsp = stmt.executeQuery();
                     if (rsp.next()) {
                         return rsp.getInt(1);
                     }
                 }
+                catch (SQLException ex) {
+                    throw new RuntimeException(ex);
+                }
 
                 throw new NoSuchElementException();
             });
         }
@@ -49,8 +58,33 @@ public class DbDomainQueries {
         catch (ExecutionException ex) {
             throw new RuntimeException(ex.getCause());
         }
-        catch (SQLException ex) {
-            throw new RuntimeException(ex);
-        }
+    }
+
+    public DomainIdWithNode getDomainIdWithNode(EdgeDomain domain) throws NoSuchElementException {
+        try {
+            return domainWithNodeCache.get(domain, () -> {
+                try (var connection = dataSource.getConnection();
+                     var stmt = connection.prepareStatement("SELECT ID, NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
+
+                    stmt.setString(1, domain.toString());
+                    var rsp = stmt.executeQuery();
+                    if (rsp.next()) {
+                        return new DomainIdWithNode(rsp.getInt(1), rsp.getInt(2));
+                    }
+                }
+                catch (SQLException ex) {
+                    throw new RuntimeException(ex);
+                }
+
+                throw new NoSuchElementException();
+            });
+        }
+        catch (UncheckedExecutionException ex) {
+            throw new NoSuchElementException();
+        }
+        catch (ExecutionException ex) {
+            throw new RuntimeException(ex.getCause());
+        }
     }
 
@@ -84,46 +118,62 @@ public class DbDomainQueries {
     }
 
     public Optional<EdgeDomain> getDomain(int id) {
-        try (var connection = dataSource.getConnection()) {
+
+        EdgeDomain existing = domainNameCache.getIfPresent(id);
+        if (existing != null) {
+            return Optional.of(existing);
+        }
+
+        try (var connection = dataSource.getConnection()) {
             try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) {
                 stmt.setInt(1, id);
                 var rsp = stmt.executeQuery();
                 if (rsp.next()) {
-                    return Optional.of(new EdgeDomain(rsp.getString(1)));
+                    var val = new EdgeDomain(rsp.getString(1));
+                    domainNameCache.put(id, val);
+                    return Optional.of(val);
                 }
                 return Optional.empty();
             }
         }
-        catch (UncheckedExecutionException ex) {
-            throw new RuntimeException(ex.getCause());
-        }
         catch (SQLException ex) {
             throw new RuntimeException(ex);
         }
     }
 
-    public List<EdgeDomain> otherSubdomains(EdgeDomain domain, int cnt) {
-        List<EdgeDomain> ret = new ArrayList<>();
-
-        try (var conn = dataSource.getConnection();
-             var stmt = conn.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE DOMAIN_TOP = ? LIMIT ?")) {
-            stmt.setString(1, domain.topDomain);
-            stmt.setInt(2, cnt);
-
-            var rs = stmt.executeQuery();
-            while (rs.next()) {
-                var sibling = new EdgeDomain(rs.getString(1));
-
-                if (sibling.equals(domain))
-                    continue;
-
-                ret.add(sibling);
-            }
-        } catch (SQLException e) {
-            logger.error("Failed to get domain neighbors");
-        }
-
-        return ret;
+    public List<DomainWithNode> otherSubdomains(EdgeDomain domain, int cnt) throws ExecutionException {
+        String topDomain = domain.topDomain;
+
+        return siblingsCache.get(topDomain, () -> {
+            List<DomainWithNode> ret = new ArrayList<>();
+
+            try (var conn = dataSource.getConnection();
+                 var stmt = conn.prepareStatement("SELECT DOMAIN_NAME, NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_TOP = ? LIMIT ?")) {
+                stmt.setString(1, topDomain);
+                stmt.setInt(2, cnt);
+
+                var rs = stmt.executeQuery();
+                while (rs.next()) {
+                    var sibling = new EdgeDomain(rs.getString(1));
+
+                    if (sibling.equals(domain))
+                        continue;
+
+                    ret.add(new DomainWithNode(sibling, rs.getInt(2)));
+                }
+            } catch (SQLException e) {
+                logger.error("Failed to get domain neighbors");
+            }
+            return ret;
+        });
     }
+
+    public record DomainWithNode(EdgeDomain domain, int nodeAffinity) {
+        public boolean isIndexed() {
+            return nodeAffinity > 0;
+        }
+    }
+
+    public record DomainIdWithNode(int domainId, int nodeAffinity) { }
 }
```
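A hypothetical caller's view of the reworked class: id lookups now open their connection inside the cache loader, so a cache hit touches no connection at all, and sibling queries are served from siblingsCache keyed on the top domain. Note the new checked ExecutionException on otherSubdomains. The method and record names follow the diff; the scaffolding and package of DbDomainQueries are assumptions:

```java
import nu.marginalia.db.DbDomainQueries; // package assumed from the neighboring file
import nu.marginalia.model.EdgeDomain;

import java.util.concurrent.ExecutionException;

class DomainLookupSketch {
    // In the real code, DbDomainQueries arrives via Guice injection.
    void printIndexedSiblings(DbDomainQueries queries) throws ExecutionException {
        int id = queries.getDomainId(new EdgeDomain("www.marginalia.nu"));
        System.out.println("domain id: " + id);

        // Siblings now carry node affinity, so callers can keep only
        // domains that are actually assigned to an index node.
        for (var sibling : queries.otherSubdomains(new EdgeDomain("www.marginalia.nu"), 10)) {
            if (sibling.isIndexed()) {
                System.out.println(sibling.domain());
            }
        }
    }
}
```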
DbDomainStatsExportMultitool.java (nu.marginalia.db, deleted):

```diff
@@ -1,118 +0,0 @@
-package nu.marginalia.db;
-
-import com.zaxxer.hikari.HikariDataSource;
-
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.OptionalInt;
-
-/** Class used in exporting data. This is intended to be used for a brief time
- * and then discarded, not kept around as a service.
- */
-public class DbDomainStatsExportMultitool implements AutoCloseable {
-    private final Connection connection;
-    private final int nodeId;
-    private final PreparedStatement knownUrlsQuery;
-    private final PreparedStatement visitedUrlsQuery;
-    private final PreparedStatement goodUrlsQuery;
-    private final PreparedStatement domainNameToId;
-
-    private final PreparedStatement allDomainsQuery;
-    private final PreparedStatement crawlQueueDomains;
-    private final PreparedStatement indexedDomainsQuery;
-
-    public DbDomainStatsExportMultitool(HikariDataSource dataSource, int nodeId) throws SQLException {
-        this.connection = dataSource.getConnection();
-        this.nodeId = nodeId;
-
-        knownUrlsQuery = connection.prepareStatement("""
-                SELECT KNOWN_URLS
-                FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
-                ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
-                WHERE DOMAIN_NAME=?
-                """);
-        visitedUrlsQuery = connection.prepareStatement("""
-                SELECT VISITED_URLS
-                FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
-                ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
-                WHERE DOMAIN_NAME=?
-                """);
-        goodUrlsQuery = connection.prepareStatement("""
-                SELECT GOOD_URLS
-                FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
-                ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
-                WHERE DOMAIN_NAME=?
-                """);
-        domainNameToId = connection.prepareStatement("""
-                SELECT ID
-                FROM EC_DOMAIN
-                WHERE DOMAIN_NAME=?
-                """);
-        allDomainsQuery = connection.prepareStatement("""
-                SELECT DOMAIN_NAME
-                FROM EC_DOMAIN
-                """);
-        crawlQueueDomains = connection.prepareStatement("""
-                SELECT DOMAIN_NAME
-                FROM CRAWL_QUEUE
-                """);
-        indexedDomainsQuery = connection.prepareStatement("""
-                SELECT DOMAIN_NAME
-                FROM EC_DOMAIN
-                WHERE INDEXED > 0
-                """);
-    }
-
-    public OptionalInt getVisitedUrls(String domainName) throws SQLException {
-        return executeNameToIntQuery(domainName, visitedUrlsQuery);
-    }
-
-    public OptionalInt getDomainId(String domainName) throws SQLException {
-        return executeNameToIntQuery(domainName, domainNameToId);
-    }
-
-    public List<String> getCrawlQueueDomains() throws SQLException {
-        return executeListQuery(crawlQueueDomains, 100);
-    }
-    public List<String> getAllIndexedDomains() throws SQLException {
-        return executeListQuery(indexedDomainsQuery, 100_000);
-    }
-
-    private OptionalInt executeNameToIntQuery(String domainName, PreparedStatement statement)
-            throws SQLException {
-        statement.setString(1, domainName);
-        var rs = statement.executeQuery();
-
-        if (rs.next()) {
-            return OptionalInt.of(rs.getInt(1));
-        }
-
-        return OptionalInt.empty();
-    }
-
-    private List<String> executeListQuery(PreparedStatement statement, int sizeHint) throws SQLException {
-        List<String> ret = new ArrayList<>(sizeHint);
-
-        var rs = statement.executeQuery();
-
-        while (rs.next()) {
-            ret.add(rs.getString(1));
-        }
-
-        return ret;
-    }
-
-    @Override
-    public void close() throws SQLException {
-        knownUrlsQuery.close();
-        goodUrlsQuery.close();
-        visitedUrlsQuery.close();
-        allDomainsQuery.close();
-        crawlQueueDomains.close();
-        domainNameToId.close();
-        connection.close();
-    }
-}
```
New SQL migration (NSFW domains table):

```diff
@@ -0,0 +1,5 @@
+CREATE TABLE IF NOT EXISTS WMSA_prod.NSFW_DOMAINS (
+    ID INT NOT NULL AUTO_INCREMENT,
+    TIER INT NOT NULL,
+    PRIMARY KEY (ID)
+);
```
New SQL migration (domain ping availability, security, and DNS tables):

```diff
@@ -0,0 +1,213 @@
+-- Create metadata tables for domain ping status and security information
+
+-- These are not ICMP pings, but rather HTTP(S) pings to check the availability and security
+-- of web servers associated with domains, to assess uptime and changes in security configurations
+-- indicating ownership changes or security issues.
+
+-- Note: DOMAIN_ID and NODE_ID are used to identify the domain and the node that performed the ping.
+-- These are strictly speaking foreign keys to the EC_DOMAIN table, but as it
+-- is strictly append-only, we do not need to enforce foreign key constraints.
+
+CREATE TABLE IF NOT EXISTS DOMAIN_AVAILABILITY_INFORMATION (
+    DOMAIN_ID INT NOT NULL PRIMARY KEY,
+    NODE_ID INT NOT NULL,
+
+    SERVER_AVAILABLE BOOLEAN NOT NULL, -- Indicates if the server is available (true) or not (false)
+    SERVER_IP VARBINARY(16), -- IP address of the server (IPv4 or IPv6)
+    SERVER_IP_ASN INTEGER, -- Autonomous System number
+
+    DATA_HASH BIGINT, -- Hash of the data for integrity checks
+    SECURITY_CONFIG_HASH BIGINT, -- Hash of the security configuration for integrity checks
+
+    HTTP_SCHEMA ENUM('HTTP', 'HTTPS'), -- HTTP or HTTPS protocol used
+    HTTP_ETAG VARCHAR(255), -- ETag of the resource as per HTTP headers
+    HTTP_LAST_MODIFIED VARCHAR(255), -- Last modified date of the resource as per HTTP headers
+    HTTP_STATUS INT, -- HTTP status code (e.g., 200, 404, etc.)
+    HTTP_LOCATION VARCHAR(255), -- If the server redirects, this is the location of the redirect
+    HTTP_RESPONSE_TIME_MS SMALLINT UNSIGNED, -- Response time in milliseconds
+
+    ERROR_CLASSIFICATION ENUM('NONE', 'TIMEOUT', 'SSL_ERROR', 'DNS_ERROR', 'CONNECTION_ERROR', 'HTTP_CLIENT_ERROR', 'HTTP_SERVER_ERROR', 'UNKNOWN'), -- Classification of the error if the server is not available
+    ERROR_MESSAGE VARCHAR(255), -- Error message if the server is not available
+
+    TS_LAST_PING TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, -- Timestamp of the last ping
+    TS_LAST_AVAILABLE TIMESTAMP, -- Timestamp of the last time the server was available
+    TS_LAST_ERROR TIMESTAMP, -- Timestamp of the last error encountered
+
+    NEXT_SCHEDULED_UPDATE TIMESTAMP NOT NULL,
+    BACKOFF_CONSECUTIVE_FAILURES INT NOT NULL DEFAULT 0, -- Number of consecutive failures to ping the server
+    BACKOFF_FETCH_INTERVAL INT NOT NULL DEFAULT 60 -- Interval in seconds for the next scheduled ping
+) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
+
+CREATE INDEX IF NOT EXISTS DOMAIN_AVAILABILITY_INFORMATION__NODE_ID__DOMAIN_ID_IDX ON DOMAIN_AVAILABILITY_INFORMATION (NODE_ID, DOMAIN_ID);
+CREATE INDEX IF NOT EXISTS DOMAIN_AVAILABILITY_INFORMATION__NEXT_SCHEDULED_UPDATE_IDX ON DOMAIN_AVAILABILITY_INFORMATION (NODE_ID, NEXT_SCHEDULED_UPDATE);
+
+CREATE TABLE IF NOT EXISTS DOMAIN_SECURITY_INFORMATION (
+    DOMAIN_ID INT NOT NULL PRIMARY KEY,
+    NODE_ID INT NOT NULL,
+
+    ASN INTEGER, -- Autonomous System Number (ASN) of the server
+    HTTP_SCHEMA ENUM('HTTP', 'HTTPS'), -- HTTP or HTTPS protocol used
+    HTTP_VERSION VARCHAR(10), -- HTTP version used (e.g., HTTP/1.1, HTTP/2)
+    HTTP_COMPRESSION VARCHAR(50), -- Compression method used (e.g., gzip, deflate, br)
+    HTTP_CACHE_CONTROL TEXT, -- Cache control directives from HTTP headers
+
+    SSL_CERT_NOT_BEFORE TIMESTAMP, -- Valid from date (usually same as issued)
+    SSL_CERT_NOT_AFTER TIMESTAMP, -- Valid until date (usually same as expires)
+
+    SSL_CERT_ISSUER VARCHAR(255), -- CA that issued the cert
+    SSL_CERT_SUBJECT VARCHAR(255), -- Certificate subject/CN
+
+    SSL_CERT_PUBLIC_KEY_HASH BINARY(32), -- SHA-256 hash of the public key
+    SSL_CERT_SERIAL_NUMBER VARCHAR(100), -- Unique cert serial number
+    SSL_CERT_FINGERPRINT_SHA256 BINARY(32), -- SHA-256 fingerprint for exact identification
+    SSL_CERT_SAN TEXT, -- Subject Alternative Names (JSON array)
+    SSL_CERT_WILDCARD BOOLEAN, -- Wildcard certificate (*.example.com)
+
+    SSL_PROTOCOL VARCHAR(20), -- TLS 1.2, TLS 1.3, etc.
+    SSL_CIPHER_SUITE VARCHAR(100), -- e.g., TLS_AES_256_GCM_SHA384
+    SSL_KEY_EXCHANGE VARCHAR(50), -- ECDHE, RSA, etc.
+    SSL_CERTIFICATE_CHAIN_LENGTH TINYINT, -- Number of certs in chain
+
+    SSL_CERTIFICATE_VALID BOOLEAN, -- Valid cert chain
+
+    HEADER_CORS_ALLOW_ORIGIN TEXT, -- Could be *, specific domains, or null
+    HEADER_CORS_ALLOW_CREDENTIALS BOOLEAN, -- Credential handling
+    HEADER_CONTENT_SECURITY_POLICY_HASH INT, -- CSP header, hash of the policy
+    HEADER_STRICT_TRANSPORT_SECURITY VARCHAR(255), -- HSTS header
+    HEADER_REFERRER_POLICY VARCHAR(50), -- Referrer handling
+    HEADER_X_FRAME_OPTIONS VARCHAR(50), -- Clickjacking protection
+    HEADER_X_CONTENT_TYPE_OPTIONS VARCHAR(50), -- MIME sniffing protection
+    HEADER_X_XSS_PROTECTION VARCHAR(50), -- XSS protection header
+
+    HEADER_SERVER VARCHAR(255), -- Server header (e.g., Apache, Nginx, etc.)
+    HEADER_X_POWERED_BY VARCHAR(255), -- X-Powered-By header (if present)
+
+    TS_LAST_UPDATE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP -- Timestamp of the last SSL check
+) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
+
+CREATE INDEX IF NOT EXISTS DOMAIN_SECURITY_INFORMATION__NODE_ID__DOMAIN_ID_IDX ON DOMAIN_SECURITY_INFORMATION (NODE_ID, DOMAIN_ID);
+
+CREATE TABLE IF NOT EXISTS DOMAIN_SECURITY_EVENTS (
+    CHANGE_ID BIGINT AUTO_INCREMENT PRIMARY KEY, -- Unique identifier for the change
+    DOMAIN_ID INT NOT NULL, -- Domain ID, used as a foreign key to EC_DOMAIN
+    NODE_ID INT NOT NULL,
+
+    TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, -- Timestamp of the change
+
+    CHANGE_ASN BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to ASN (Autonomous System Number)
+    CHANGE_CERTIFICATE_FINGERPRINT BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to SSL certificate fingerprint
+    CHANGE_CERTIFICATE_PROFILE BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to SSL certificate profile (e.g., algorithm, exchange)
+    CHANGE_CERTIFICATE_SAN BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to SSL certificate SAN (Subject Alternative Name)
+    CHANGE_CERTIFICATE_PUBLIC_KEY BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to SSL certificate public key
+    CHANGE_SECURITY_HEADERS BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to security headers
+    CHANGE_IP_ADDRESS BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to IP address
+    CHANGE_SOFTWARE BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to the generator (e.g., web server software)
+    OLD_CERT_TIME_TO_EXPIRY INT, -- Time to expiry of the old certificate in hours, if applicable
+
+    SECURITY_SIGNATURE_BEFORE BLOB NOT NULL, -- Security signature before the change, gzipped json record
+    SECURITY_SIGNATURE_AFTER BLOB NOT NULL -- Security signature after the change, gzipped json record
+) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
+
+CREATE INDEX IF NOT EXISTS DOMAIN_SECURITY_EVENTS__NODE_ID__DOMAIN_ID_IDX ON DOMAIN_SECURITY_EVENTS (NODE_ID, DOMAIN_ID);
+CREATE INDEX IF NOT EXISTS DOMAIN_SECURITY_EVENTS__TS_CHANGE_IDX ON DOMAIN_SECURITY_EVENTS (TS_CHANGE);
+
+CREATE TABLE IF NOT EXISTS DOMAIN_AVAILABILITY_EVENTS (
+    DOMAIN_ID INT NOT NULL,
+    NODE_ID INT NOT NULL,
+
+    AVAILABLE BOOLEAN NOT NULL, -- True if the service is available, false if it is not
+    OUTAGE_TYPE ENUM('NONE', 'TIMEOUT', 'SSL_ERROR', 'DNS_ERROR', 'CONNECTION_ERROR', 'HTTP_CLIENT_ERROR', 'HTTP_SERVER_ERROR', 'UNKNOWN') NOT NULL,
+    HTTP_STATUS_CODE INT, -- HTTP status code if available (e.g., 200, 404, etc.)
+    ERROR_MESSAGE VARCHAR(255), -- Specific error details
+
+    TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, -- Timestamp of the last update
+
+    AVAILABILITY_RECORD_ID BIGINT AUTO_INCREMENT,
+    P_KEY_MONTH TINYINT NOT NULL DEFAULT MONTH(TS_CHANGE), -- Month of the change for partitioning
+    PRIMARY KEY (AVAILABILITY_RECORD_ID, P_KEY_MONTH)
+)
+CHARACTER SET utf8mb4 COLLATE utf8mb4_bin
+PARTITION BY RANGE (P_KEY_MONTH) (
+    PARTITION p0 VALUES LESS THAN (1), -- January
+    PARTITION p1 VALUES LESS THAN (2), -- February
+    PARTITION p2 VALUES LESS THAN (3), -- March
+    PARTITION p3 VALUES LESS THAN (4), -- April
+    PARTITION p4 VALUES LESS THAN (5), -- May
+    PARTITION p5 VALUES LESS THAN (6), -- June
+    PARTITION p6 VALUES LESS THAN (7), -- July
+    PARTITION p7 VALUES LESS THAN (8), -- August
+    PARTITION p8 VALUES LESS THAN (9), -- September
+    PARTITION p9 VALUES LESS THAN (10), -- October
+    PARTITION p10 VALUES LESS THAN (11), -- November
+    PARTITION p11 VALUES LESS THAN (12) -- December
+);
+
+CREATE INDEX DOMAIN_AVAILABILITY_EVENTS__DOMAIN_ID_TS_IDX ON DOMAIN_AVAILABILITY_EVENTS (DOMAIN_ID, TS_CHANGE);
+CREATE INDEX DOMAIN_AVAILABILITY_EVENTS__TS_CHANGE_IDX ON DOMAIN_AVAILABILITY_EVENTS (TS_CHANGE);
+
+CREATE TABLE IF NOT EXISTS DOMAIN_DNS_INFORMATION (
+    DNS_ROOT_DOMAIN_ID INT AUTO_INCREMENT PRIMARY KEY,
+    ROOT_DOMAIN_NAME VARCHAR(255) NOT NULL UNIQUE,
+    NODE_AFFINITY INT NOT NULL, -- Node ID that performs the DNS check, assign randomly across nodes
+
+    DNS_A_RECORDS TEXT, -- JSON array of IPv4 addresses
+    DNS_AAAA_RECORDS TEXT, -- JSON array of IPv6 addresses
+    DNS_CNAME_RECORD VARCHAR(255), -- Canonical name (if applicable)
+    DNS_MX_RECORDS TEXT, -- JSON array of mail exchange records
+    DNS_CAA_RECORDS TEXT, -- Certificate Authority Authorization
+    DNS_TXT_RECORDS TEXT, -- TXT records (SPF, DKIM, verification, etc.)
+    DNS_NS_RECORDS TEXT, -- Name servers (JSON array)
+    DNS_SOA_RECORD TEXT, -- Start of Authority (JSON object)
+
+    TS_LAST_DNS_CHECK TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
+    TS_NEXT_DNS_CHECK TIMESTAMP NOT NULL,
+    DNS_CHECK_PRIORITY TINYINT DEFAULT 0 -- Priority of the DNS check, in case we want to schedule a refresh sooner
+) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
+
+CREATE INDEX DOMAIN_DNS_INFORMATION__PRIORITY_NEXT_CHECK_IDX ON DOMAIN_DNS_INFORMATION (NODE_AFFINITY, DNS_CHECK_PRIORITY DESC, TS_NEXT_DNS_CHECK);
+
+CREATE TABLE IF NOT EXISTS DOMAIN_DNS_EVENTS (
+    DNS_ROOT_DOMAIN_ID INT NOT NULL,
+    NODE_ID INT NOT NULL,
+
+    TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+
+    -- DNS change type flags
+    CHANGE_A_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- IPv4 address changes
+    CHANGE_AAAA_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- IPv6 address changes
+    CHANGE_CNAME BOOLEAN NOT NULL DEFAULT FALSE, -- CNAME changes
+    CHANGE_MX_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- Mail server changes
+    CHANGE_CAA_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- Certificate authority changes
+    CHANGE_TXT_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- TXT record changes (SPF, DKIM, etc.)
+    CHANGE_NS_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- Name server changes (big red flag!)
+    CHANGE_SOA_RECORD BOOLEAN NOT NULL DEFAULT FALSE, -- Start of Authority changes
+
+    DNS_SIGNATURE_BEFORE BLOB NOT NULL, -- Compressed JSON snapshot of DNS records before change
+    DNS_SIGNATURE_AFTER BLOB NOT NULL, -- Compressed JSON snapshot of DNS records after change
+
+    DNS_EVENT_ID BIGINT AUTO_INCREMENT,
+    P_KEY_MONTH TINYINT NOT NULL DEFAULT MONTH(TS_CHANGE), -- Month of the change for partitioning
+    PRIMARY KEY (DNS_EVENT_ID, P_KEY_MONTH)
+)
+CHARACTER SET utf8mb4 COLLATE utf8mb4_bin
+PARTITION BY RANGE (P_KEY_MONTH) (
+    PARTITION p0 VALUES LESS THAN (1), -- January
+    PARTITION p1 VALUES LESS THAN (2), -- February
+    PARTITION p2 VALUES LESS THAN (3), -- March
+    PARTITION p3 VALUES LESS THAN (4), -- April
+    PARTITION p4 VALUES LESS THAN (5), -- May
+    PARTITION p5 VALUES LESS THAN (6), -- June
+    PARTITION p6 VALUES LESS THAN (7), -- July
+    PARTITION p7 VALUES LESS THAN (8), -- August
+    PARTITION p8 VALUES LESS THAN (9), -- September
+    PARTITION p9 VALUES LESS THAN (10), -- October
+    PARTITION p10 VALUES LESS THAN (11), -- November
+    PARTITION p11 VALUES LESS THAN (12) -- December
+);
+
+CREATE INDEX DOMAIN_DNS_EVENTS__DNS_ROOT_DOMAIN_ID_TS_IDX ON DOMAIN_DNS_EVENTS (DNS_ROOT_DOMAIN_ID, TS_CHANGE);
+CREATE INDEX DOMAIN_DNS_EVENTS__TS_CHANGE_IDX ON DOMAIN_DNS_EVENTS (TS_CHANGE);
```
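The NEXT_SCHEDULED_UPDATE, BACKOFF_CONSECUTIVE_FAILURES, and BACKOFF_FETCH_INTERVAL columns imply that the pinger reschedules failing domains with a growing interval, but the policy itself is not part of this migration. A minimal sketch of one plausible scheme, exponential backoff with a cap, purely illustrative:

```java
import java.time.Duration;
import java.time.Instant;

class PingBackoffSketch {
    /** Illustrative only: the real policy lives in the ping service, not in this migration. */
    static Instant nextScheduledUpdate(boolean serverAvailable,
                                       int consecutiveFailures,   // BACKOFF_CONSECUTIVE_FAILURES
                                       int fetchIntervalSeconds)  // BACKOFF_FETCH_INTERVAL, default 60
    {
        long delay = fetchIntervalSeconds;
        if (!serverAvailable) {
            // Double the interval per consecutive failure; the shift is capped
            // to keep the multiplier from overflowing.
            delay = fetchIntervalSeconds * (1L << Math.min(consecutiveFailures, 10));
        }
        delay = Math.min(delay, Duration.ofDays(1).toSeconds()); // assumed upper bound
        return Instant.now().plusSeconds(delay);
    }
}
```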
New SQL migration (security-event summary columns):

```diff
@@ -0,0 +1,6 @@
+-- Add additional summary columns to DOMAIN_SECURITY_EVENTS table
+-- to make it easier to make sense of certificate changes
+
+ALTER TABLE DOMAIN_SECURITY_EVENTS ADD COLUMN CHANGE_CERTIFICATE_SERIAL_NUMBER BOOLEAN NOT NULL DEFAULT FALSE;
+ALTER TABLE DOMAIN_SECURITY_EVENTS ADD COLUMN CHANGE_CERTIFICATE_ISSUER BOOLEAN NOT NULL DEFAULT FALSE;
+OPTIMIZE TABLE DOMAIN_SECURITY_EVENTS;
```
New SQL migration (schema-change summary column):

```diff
@@ -0,0 +1,5 @@
+-- Add additional summary columns to DOMAIN_SECURITY_EVENTS table
+-- to make it easier to make sense of certificate changes
+
+ALTER TABLE DOMAIN_SECURITY_EVENTS ADD COLUMN CHANGE_SCHEMA ENUM('NONE', 'HTTP_TO_HTTPS', 'HTTPS_TO_HTTP', 'UNKNOWN') NOT NULL DEFAULT 'UNKNOWN';
+OPTIMIZE TABLE DOMAIN_SECURITY_EVENTS;
```
DocumentFormat.java (nu.marginalia.model, new file):

```diff
@@ -0,0 +1,24 @@
+package nu.marginalia.model;
+
+public enum DocumentFormat {
+    PLAIN(0, 1, "text"),
+    PDF(0, 1, "pdf"),
+    UNKNOWN(0, 1, "???"),
+    HTML123(0, 1, "html"),
+    HTML4(-0.1, 1.05, "html"),
+    XHTML(-0.1, 1.05, "html"),
+    HTML5(0.5, 1.1, "html");
+
+    /** Used to tune quality score */
+    public final double offset;
+    /** Used to tune quality score */
+    public final double scale;
+    public final String shortFormat;
+
+    DocumentFormat(double offset, double scale, String shortFormat) {
+        this.offset = offset;
+        this.scale = scale;
+        this.shortFormat = shortFormat;
+    }
+}
```
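The javadoc only says offset and scale "tune quality score". One plausible reading, not confirmed anywhere in this changeset, is an affine adjustment of a raw quality value, so HTML5 gets a mild boost while HTML4 and XHTML take a mild penalty:

```java
import nu.marginalia.model.DocumentFormat;

class QualityAdjustmentSketch {
    // Hypothetical consumer of DocumentFormat's tuning knobs; the real
    // ranking code is not part of this diff.
    static double adjustedQuality(double rawQuality, DocumentFormat format) {
        return format.scale * rawQuality + format.offset;
    }
}
```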
EdgeDomain.java:

```diff
@@ -14,7 +14,7 @@ public class EdgeDomain implements Serializable {
     @Nonnull
     public final String topDomain;
 
-    public EdgeDomain(String host) {
+    public EdgeDomain(@Nonnull String host) {
         Objects.requireNonNull(host, "domain name must not be null");
 
         host = host.toLowerCase();
@@ -61,6 +61,10 @@ public class EdgeDomain implements Serializable {
         this.topDomain = topDomain;
     }
 
+    public static String getTopDomain(String host) {
+        return new EdgeDomain(host).topDomain;
+    }
+
     private boolean looksLikeGovTld(String host) {
         if (host.length() < 8)
             return false;
@@ -108,32 +112,6 @@ public class EdgeDomain implements Serializable {
         return topDomain;
     }
 
-    public String getDomainKey() {
-        int cutPoint = topDomain.indexOf('.');
-        if (cutPoint < 0) {
-            return topDomain;
-        }
-        return topDomain.substring(0, cutPoint).toLowerCase();
-    }
-
-    public String getLongDomainKey() {
-        StringBuilder ret = new StringBuilder();
-
-        int cutPoint = topDomain.indexOf('.');
-        if (cutPoint < 0) {
-            ret.append(topDomain);
-        } else {
-            ret.append(topDomain, 0, cutPoint);
-        }
-
-        if (!subDomain.isEmpty() && !"www".equals(subDomain)) {
-            ret.append(":");
-            ret.append(subDomain);
-        }
-
-        return ret.toString().toLowerCase();
-    }
-
     /** If possible, try to provide an alias domain,
      * i.e. a domain name that is very likely to link to this one
      * */
```
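The new static helper spares callers a throwaway EdgeDomain instance when only the registrable part of a hostname is wanted. A sketch, with the package assumed from the EdgeUrl file below and expected results being our reading of the code rather than test output:

```java
import nu.marginalia.model.EdgeDomain;

class TopDomainSketch {
    public static void main(String[] args) {
        // Presumably both print "marginalia.nu"; multi-part TLD handling is
        // decided by EdgeDomain's own heuristics (e.g. looksLikeGovTld).
        System.out.println(EdgeDomain.getTopDomain("www.marginalia.nu"));
        System.out.println(EdgeDomain.getTopDomain("search.marginalia.nu"));
    }
}
```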
@@ -1,16 +1,14 @@
|
|||||||
package nu.marginalia.model;
|
package nu.marginalia.model;
|
||||||
|
|
||||||
import nu.marginalia.util.QueryParams;
|
import nu.marginalia.util.QueryParams;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import javax.annotation.Nullable;
|
import javax.annotation.Nullable;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.net.MalformedURLException;
|
import java.net.*;
|
||||||
import java.net.URI;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.net.URISyntaxException;
|
|
||||||
import java.net.URL;
|
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.regex.Pattern;
|
|
||||||
|
|
||||||
public class EdgeUrl implements Serializable {
|
public class EdgeUrl implements Serializable {
|
||||||
public final String proto;
|
public final String proto;
|
||||||
@@ -33,7 +31,7 @@ public class EdgeUrl implements Serializable {
|
|||||||
|
|
||||||
private static URI parseURI(String url) throws URISyntaxException {
|
private static URI parseURI(String url) throws URISyntaxException {
|
||||||
try {
|
try {
|
||||||
return new URI(urlencodeFixer(url));
|
return EdgeUriFactory.parseURILenient(url);
|
||||||
} catch (URISyntaxException ex) {
|
} catch (URISyntaxException ex) {
|
||||||
throw new URISyntaxException("Failed to parse URI '" + url + "'", ex.getMessage());
|
throw new URISyntaxException("Failed to parse URI '" + url + "'", ex.getMessage());
|
||||||
}
|
}
|
||||||
@@ -51,58 +49,6 @@ public class EdgeUrl implements Serializable {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Pattern badCharPattern = Pattern.compile("[ \t\n\"<>\\[\\]()',|]");
|
|
||||||
|
|
||||||
/* Java's URI parser is a bit too strict in throwing exceptions when there's an error.
|
|
||||||
|
|
||||||
Here on the Internet, standards are like the picture on the box of the frozen pizza,
|
|
||||||
and what you get is more like what's on the inside, we try to patch things instead,
|
|
||||||
just give it a best-effort attempt att cleaning out broken or unnecessary constructions
|
|
||||||
like bad or missing URLEncoding
|
|
||||||
*/
|
|
||||||
public static String urlencodeFixer(String url) throws URISyntaxException {
|
|
||||||
var s = new StringBuilder();
|
|
||||||
String goodChars = "&.?:/-;+$#";
|
|
||||||
String hexChars = "0123456789abcdefABCDEF";
|
|
||||||
|
|
||||||
int pathIdx = findPathIdx(url);
|
|
||||||
if (pathIdx < 0) { // url looks like http://marginalia.nu
|
|
||||||
return url + "/";
|
|
||||||
}
|
|
||||||
s.append(url, 0, pathIdx);
|
|
||||||
|
|
||||||
// We don't want the fragment, and multiple fragments breaks the Java URIParser for some reason
|
|
||||||
int end = url.indexOf("#");
|
|
||||||
if (end < 0) end = url.length();
|
|
||||||
|
|
||||||
for (int i = pathIdx; i < end; i++) {
|
|
||||||
int c = url.charAt(i);
|
|
||||||
|
|
||||||
if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
|
|
||||||
s.appendCodePoint(c);
|
|
||||||
} else if (c == '%' && i + 2 < end) {
|
|
||||||
int cn = url.charAt(i + 1);
|
|
||||||
int cnn = url.charAt(i + 2);
|
|
||||||
if (hexChars.indexOf(cn) >= 0 && hexChars.indexOf(cnn) >= 0) {
|
|
||||||
s.appendCodePoint(c);
|
|
||||||
} else {
|
|
||||||
s.append("%25");
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
s.append(String.format("%%%02X", c));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return s.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static int findPathIdx(String url) throws URISyntaxException {
|
|
||||||
int colonIdx = url.indexOf(':');
|
|
||||||
if (colonIdx < 0 || colonIdx + 2 >= url.length()) {
|
|
||||||
throw new URISyntaxException(url, "Lacking protocol");
|
|
||||||
}
|
|
||||||
return url.indexOf('/', colonIdx + 2);
|
|
||||||
}
|
|
||||||
|
|
||||||
public EdgeUrl(URI URI) {
|
public EdgeUrl(URI URI) {
|
||||||
try {
|
try {
|
||||||
@@ -166,11 +112,32 @@ public class EdgeUrl implements Serializable {
|
|||||||
sb.append(port);
|
sb.append(port);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
EdgeUriFactory.urlencodePath(sb, path);
|
||||||
|
|
||||||
|
if (param != null) {
|
||||||
|
EdgeUriFactory.urlencodeQuery(sb, param);
|
||||||
|
}
|
||||||
|
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public String toDisplayString() {
|
||||||
|
StringBuilder sb = new StringBuilder(256);
|
||||||
|
|
||||||
|
sb.append(proto);
|
||||||
|
sb.append("://");
|
||||||
|
sb.append(domain);
|
||||||
|
|
||||||
|
if (port != null) {
|
||||||
|
sb.append(':');
|
||||||
|
sb.append(port);
|
||||||
|
}
|
||||||
|
|
||||||
sb.append(path);
|
sb.append(path);
|
||||||
|
|
||||||
if (param != null) {
|
if (param != null) {
|
||||||
sb.append('?');
|
sb.append('?').append(param);
|
||||||
sb.append(param);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return sb.toString();
|
return sb.toString();
|
||||||
@@ -247,3 +214,244 @@ public class EdgeUrl implements Serializable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
class EdgeUriFactory {
|
||||||
|
public static URI parseURILenient(String url) throws URISyntaxException {
|
||||||
|
|
||||||
|
if (shouldOmitUrlencodeRepair(url)) {
|
||||||
|
try {
|
||||||
|
return new URI(url);
|
||||||
|
}
|
||||||
|
catch (URISyntaxException ex) {
|
||||||
|
// ignore and run the lenient parser
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var s = new StringBuilder(url.length()+8);
|
||||||
|
|
||||||
|
int pathIdx = findPathIdx(url);
|
||||||
|
if (pathIdx < 0) { // url looks like http://marginalia.nu
|
||||||
|
return new URI(url + "/");
|
||||||
|
}
|
||||||
|
s.append(url, 0, pathIdx);
|
||||||
|
|
||||||
|
// We don't want the fragment, and multiple fragments breaks the Java URIParser for some reason
|
||||||
|
int end = url.indexOf("#");
|
||||||
|
if (end < 0) end = url.length();
|
||||||
|
|
||||||
|
int queryIdx = url.indexOf('?');
|
||||||
|
if (queryIdx < 0) queryIdx = end;
|
||||||
|
|
||||||
|
urlencodePath(s, url.substring(pathIdx, queryIdx));
|
||||||
|
if (queryIdx < end) {
|
||||||
|
urlencodeQuery(s, url.substring(queryIdx + 1, end));
|
||||||
|
}
|
||||||
|
return new URI(s.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Break apart the path element of an URI into its components, and then
|
||||||
|
* urlencode any component that needs it, and recombine it into a single
|
||||||
|
* path element again.
|
||||||
|
*/
|
||||||
|
public static void urlencodePath(StringBuilder sb, String path) {
|
||||||
|
if (path == null || path.isEmpty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
String[] pathParts = StringUtils.split(path, '/');
|
||||||
|
if (pathParts.length == 0) {
|
||||||
|
sb.append('/');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean shouldUrlEncode = false;
|
||||||
|
for (String pathPart : pathParts) {
|
||||||
|
if (pathPart.isEmpty()) continue;
|
||||||
|
|
||||||
|
if (needsUrlEncode(pathPart)) {
|
||||||
|
shouldUrlEncode = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (String pathPart : pathParts) {
|
||||||
|
if (pathPart.isEmpty()) continue;
|
||||||
|
|
||||||
|
if (shouldUrlEncode) {
|
||||||
|
sb.append('/');
|
||||||
|
sb.append(URLEncoder.encode(pathPart, StandardCharsets.UTF_8).replace("+", "%20"));
|
||||||
|
} else {
|
||||||
|
sb.append('/');
|
||||||
|
sb.append(pathPart);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (path.endsWith("/")) {
|
||||||
|
sb.append('/');
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Break apart the query element of a URI into its components, and then
|
||||||
|
* urlencode any component that needs it, and recombine it into a single
|
||||||
|
* query element again.
|
||||||
|
*/
|
||||||
|
public static void urlencodeQuery(StringBuilder sb, String param) {
|
||||||
|
if (param == null || param.isEmpty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
String[] queryParts = StringUtils.split(param, '&');
|
||||||
|
|
||||||
|
boolean shouldUrlEncode = false;
|
||||||
|
for (String queryPart : queryParts) {
|
||||||
|
if (queryPart.isEmpty()) continue;
|
||||||
|
|
||||||
|
if (needsUrlEncode(queryPart)) {
|
||||||
|
shouldUrlEncode = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean first = true;
|
||||||
|
for (String queryPart : queryParts) {
|
||||||
|
if (queryPart.isEmpty()) continue;
|
||||||
|
|
||||||
|
if (first) {
|
||||||
|
sb.append('?');
|
||||||
|
first = false;
|
||||||
|
} else {
|
||||||
|
sb.append('&');
|
||||||
|
}
|
||||||
|
|
||||||
|
if (shouldUrlEncode) {
|
||||||
|
int idx = queryPart.indexOf('=');
|
||||||
|
if (idx < 0) {
|
||||||
|
sb.append(URLEncoder.encode(queryPart, StandardCharsets.UTF_8));
|
||||||
|
} else {
|
||||||
|
sb.append(URLEncoder.encode(queryPart.substring(0, idx), StandardCharsets.UTF_8));
|
||||||
|
sb.append('=');
|
||||||
|
sb.append(URLEncoder.encode(queryPart.substring(idx + 1), StandardCharsets.UTF_8));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
sb.append(queryPart);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
    /** Test if the url element needs URL encoding.
     * <p></p>
     * Note we may have been given an already encoded path element,
     * so we include % and + in the list of good characters
     */
    static boolean needsUrlEncode(String urlElement) {
        for (int i = 0; i < urlElement.length(); i++) {
            char c = urlElement.charAt(i);

            if (isUrlSafe(c)) continue;
            if (c == '+') continue;
            if (c == '%' && i + 2 < urlElement.length()) {
                char c1 = urlElement.charAt(i + 1);
                char c2 = urlElement.charAt(i + 2);
                if (isHexDigit(c1) && isHexDigit(c2)) {
                    i += 2;
                    continue;
                }
            }

            return true;
        }

        return false;
    }

    static boolean isUrlSafe(int c) {
        if (c >= 'a' && c <= 'z') return true;
        if (c >= 'A' && c <= 'Z') return true;
        if (c >= '0' && c <= '9') return true;
        if (c == '-' || c == '_' || c == '.' || c == '~') return true;

        return false;
    }
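Since a '%' followed by two hex digits is treated as an existing escape, already-encoded elements pass through untouched; the expected behavior in brief:

needsUrlEncode("S%C3%A1mi"); // false: the %XX sequences parse as valid escapes
needsUrlEncode("Sámi");      // true:  'á' is outside the URL-safe set
needsUrlEncode("%-sign");    // true:  '%' not followed by two hex digits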
    /** Test if the URL is a valid URL that does not need to be
     * urlencoded.
     * <p></p>
     * This is a very simple heuristic test that does not guarantee
     * that the URL is valid, but it will identify cases where we
     * are fairly certain that the URL does not need encoding,
     * so we can skip a bunch of allocations and string operations
     * that would otherwise be needed to fix the URL.
     */
    static boolean shouldOmitUrlencodeRepair(String url) {
        int idx = 0;
        final int len = url.length();

        // Validate the scheme
        while (idx < len - 2) {
            char c = url.charAt(idx++);
            if (c == ':') break;
            if (!isAsciiAlphabetic(c)) return false;
        }
        if (url.charAt(idx++) != '/') return false;
        if (url.charAt(idx++) != '/') return false;

        // Validate the authority
        while (idx < len) {
            char c = url.charAt(idx++);
            if (c == '/') break;
            if (c == ':') continue;
            if (c == '@') continue;
            if (!isUrlSafe(c)) return false;
        }

        // Validate the path
        if (idx >= len) return true;

        while (idx < len) {
            char c = url.charAt(idx++);
            if (c == '?') break;
            if (c == '/') continue;
            if (c == '#') return true;
            if (!isUrlSafe(c)) return false;
        }

        if (idx >= len) return true;

        // Validate the query
        while (idx < len) {
            char c = url.charAt(idx++);
            if (c == '&') continue;
            if (c == '=') continue;
            if (c == '#') return true;
            if (!isUrlSafe(c)) return false;
        }

        return true;
    }

    private static boolean isAsciiAlphabetic(int c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }

    private static boolean isHexDigit(int c) {
        return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
    }
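A few sample verdicts from the fast-path heuristic; note it is stricter than needsUrlEncode, so URLs containing escapes still take the repair path:

shouldOmitUrlencodeRepair("https://www.example.com/index.html"); // true: scheme, authority and path are all plain
shouldOmitUrlencodeRepair("https://www.example.com/wiki/Sámi");  // false: 'á' fails isUrlSafe
shouldOmitUrlencodeRepair("https://www.example.com/a%20b");      // false: '%' is not URL-safe here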
    /** Find the index of the path element in a URL.
     * <p></p>
     * The path element starts after the scheme and authority part of the URL,
     * which is everything up to and including the first slash after the colon.
     */
    private static int findPathIdx(String url) throws URISyntaxException {
        int colonIdx = url.indexOf(':');
        if (colonIdx < 0 || colonIdx + 3 >= url.length()) {
            throw new URISyntaxException(url, "Lacking scheme");
        }
        return url.indexOf('/', colonIdx + 3);
    }

}
@@ -28,6 +28,8 @@ public enum HtmlFeature {
     GA_SPAM("special:gaspam"),
 
+    PDF("format:pdf"),
+
     /** For fingerprinting and ranking */
     OPENGRAPH("special:opengraph"),
     OPENGRAPH_IMAGE("special:opengraph:image"),
@@ -6,11 +6,20 @@ import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 
 import java.net.URISyntaxException;
+import java.time.Instant;
 
 public class GsonFactory {
     public static Gson get() {
         return new GsonBuilder()
                 .registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
+                .registerTypeAdapter(Instant.class, (JsonSerializer<Instant>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toEpochMilli()))
+                .registerTypeAdapter(Instant.class, (JsonDeserializer<Instant>) (json, typeOfT, context) -> {
+                    if (json.isJsonPrimitive() && json.getAsJsonPrimitive().isNumber()) {
+                        return Instant.ofEpochMilli(json.getAsLong());
+                    } else {
+                        throw new JsonParseException("Expected a number for Instant");
+                    }
+                })
                 .registerTypeAdapter(EdgeUrl.class, (JsonSerializer<EdgeUrl>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
                 .registerTypeAdapter(EdgeDomain.class, (JsonSerializer<EdgeDomain>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
                 .registerTypeAdapter(EdgeUrl.class, (JsonDeserializer<EdgeUrl>) (json, typeOfT, context) -> {
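A round-trip sketch for the new Instant adapters (illustration only): Instants serialize to epoch milliseconds and deserialize back from any JSON number:

Gson gson = GsonFactory.get();
String json = gson.toJson(Instant.ofEpochMilli(1735689600000L)); // "1735689600000"
Instant back = gson.fromJson(json, Instant.class);               // equal to the original, millisecond precision preserved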
@@ -1,22 +0,0 @@
-package nu.marginalia.model.html;
-
-// This class really doesn't belong anywhere, but will squat here for now
-public enum HtmlStandard {
-    PLAIN(0, 1),
-    UNKNOWN(0, 1),
-    HTML123(0, 1),
-    HTML4(-0.1, 1.05),
-    XHTML(-0.1, 1.05),
-    HTML5(0.5, 1.1);
-
-    /** Used to tune quality score */
-    public final double offset;
-    /** Used to tune quality score */
-    public final double scale;
-
-    HtmlStandard(double offset, double scale) {
-        this.offset = offset;
-        this.scale = scale;
-    }
-
-}
@@ -9,7 +9,7 @@ public enum DocumentFlags {
     GeneratorForum,
     GeneratorWiki,
     Sideloaded,
-    Unused7,
+    PdfFile,
     Unused8,
     ;
 
@@ -83,6 +83,11 @@ public class QueryParams {
         if (path.endsWith("StoryView.py")) { // folklore.org is neat
             return param.startsWith("project=") || param.startsWith("story=");
         }
 
+        // www.perseus.tufts.edu:
+        if (param.startsWith("collection=")) return true;
+        if (param.startsWith("doc=")) return true;
+
         return false;
     }
 }
@@ -8,14 +8,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
 
 class EdgeDomainTest {
 
-    @Test
-    public void testSkepdic() throws URISyntaxException {
-        var domain = new EdgeUrl("http://www.skepdic.com/astrology.html");
-        assertEquals("skepdic", domain.getDomain().getDomainKey());
-        var domain2 = new EdgeUrl("http://skepdic.com/astrology.html");
-        assertEquals("skepdic", domain2.getDomain().getDomainKey());
-    }
-
     @Test
     public void testHkDomain() throws URISyntaxException {
         var domain = new EdgeUrl("http://l7072i3.l7c.net");
@@ -1,6 +1,6 @@
 package nu.marginalia.model;
 
-import nu.marginalia.model.EdgeUrl;
+import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
 
 import java.net.URISyntaxException;
@@ -21,25 +21,70 @@ class EdgeUrlTest {
         new EdgeUrl("https://memex.marginalia.nu/#here")
         );
     }
 
     @Test
-    public void testParam() throws URISyntaxException {
-        System.out.println(new EdgeUrl("https://memex.marginalia.nu/index.php?id=1").toString());
-        System.out.println(new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
-    }
-
-    @Test
-    void urlencodeFixer() throws URISyntaxException {
-        System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/#heredoc"));
-        System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign"));
-        System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%22-sign"));
-        System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/\n \"huh\""));
+    void testUriFromString() throws URISyntaxException {
+        // We test these URLs several times as we perform URLEncode-fixing both when parsing the URL and when
+        // converting it back to a string, we want to ensure there is no changes along the way.
+
+        Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/").getPath());
+        Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/").toString());
+        Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/").toString());
+
+        Assertions.assertEquals("/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").getPath());
+        Assertions.assertEquals("https://www.example.com/", EdgeUriFactory.parseURILenient("https://www.example.com/#heredoc").toString());
+        Assertions.assertEquals("https://www.example.com/", new EdgeUrl("https://www.example.com/#heredoc").toString());
+
+        Assertions.assertEquals("/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").getPath());
+        Assertions.assertEquals("https://www.example.com/trailingslash/", EdgeUriFactory.parseURILenient("https://www.example.com/trailingslash/").toString());
+        Assertions.assertEquals("https://www.example.com/trailingslash/", new EdgeUrl("https://www.example.com/trailingslash/").toString());
+
+        Assertions.assertEquals("/%-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").getPath());
+        Assertions.assertEquals("https://www.example.com/%25-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%-sign").toString());
+        Assertions.assertEquals("https://www.example.com/%25-sign", new EdgeUrl("https://www.example.com/%-sign").toString());
+
+        Assertions.assertEquals("/%-sign/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com//%-sign/\"-sign").getPath());
+        Assertions.assertEquals("https://www.example.com/%25-sign/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com//%-sign/\"-sign").toString());
+        Assertions.assertEquals("https://www.example.com/%25-sign/%22-sign", new EdgeUrl("https://www.example.com//%-sign/\"-sign").toString());
+
+        Assertions.assertEquals("/\"-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").getPath());
+        Assertions.assertEquals("https://www.example.com/%22-sign", EdgeUriFactory.parseURILenient("https://www.example.com/%22-sign").toString());
+        Assertions.assertEquals("https://www.example.com/%22-sign", new EdgeUrl("https://www.example.com/%22-sign").toString());
+
+        Assertions.assertEquals("/\n \"huh\"", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").getPath());
+        Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", EdgeUriFactory.parseURILenient("https://www.example.com/\n \"huh\"").toString());
+        Assertions.assertEquals("https://www.example.com/%0A%20%22huh%22", new EdgeUrl("https://www.example.com/\n \"huh\"").toString());
+
+        Assertions.assertEquals("/wiki/Sámi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").getPath());
+        Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", EdgeUriFactory.parseURILenient("https://en.wikipedia.org/wiki/Sámi").toString());
+        Assertions.assertEquals("https://en.wikipedia.org/wiki/S%C3%A1mi", new EdgeUrl("https://en.wikipedia.org/wiki/Sámi").toString());
+
+        Assertions.assertEquals("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k", new EdgeUrl("https://www.prijatelji-zivotinja.hr/index.en.php?id=2301k").toString());
     }
 
     @Test
     void testParms() throws URISyntaxException {
-        System.out.println(new EdgeUrl("https://search.marginalia.nu/?id=123"));
-        System.out.println(new EdgeUrl("https://search.marginalia.nu/?t=123"));
-        System.out.println(new EdgeUrl("https://search.marginalia.nu/?v=123"));
-        System.out.println(new EdgeUrl("https://search.marginalia.nu/?m=123"));
-        System.out.println(new EdgeUrl("https://search.marginalia.nu/?follow=123"));
+        Assertions.assertEquals("id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").param);
+        Assertions.assertEquals("https://search.marginalia.nu/?id=123", new EdgeUrl("https://search.marginalia.nu/?id=123").toString());
+
+        Assertions.assertEquals("t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").param);
+        Assertions.assertEquals("https://search.marginalia.nu/?t=123", new EdgeUrl("https://search.marginalia.nu/?t=123").toString());
+
+        Assertions.assertEquals("v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").param);
+        Assertions.assertEquals("https://search.marginalia.nu/?v=123", new EdgeUrl("https://search.marginalia.nu/?v=123").toString());
+
+        Assertions.assertEquals("id=1", new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").param);
+        Assertions.assertEquals("https://memex.marginalia.nu/showthread.php?id=1",
+                new EdgeUrl("https://memex.marginalia.nu/showthread.php?id=1&count=5&tracking=123").toString());
+
+        Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").param);
+        Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?id=1&t=5&tracking=123").toString());
+
+        Assertions.assertEquals("id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").param);
+        Assertions.assertEquals("https://memex.marginalia.nu/sh%C3%B6wthr%C3%ABad.php?id=1&t=5", new EdgeUrl("https://memex.marginalia.nu/shöwthrëad.php?trëaking=123&id=1&t=5&").toString());
+
+        Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?m=123").param);
+        Assertions.assertNull(new EdgeUrl("https://search.marginalia.nu/?follow=123").param);
     }
 }
@@ -59,16 +59,13 @@ public class ProcessAdHocTaskHeartbeatImpl implements AutoCloseable, ProcessAdHo
      */
     @Override
     public void progress(String step, int stepProgress, int stepCount) {
+        int lastProgress = this.progress;
         this.step = step;
 
-
-        // off by one since we calculate the progress based on the number of steps,
-        // and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
-        // final progress being 80% and not 100%)
-
         this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);
 
-        logger.info("ProcessTask {} progress: {}%", taskBase, progress);
+        if (this.progress / 10 != lastProgress / 10) {
+            logger.info("ProcessTask {} progress: {}%", taskBase, progress);
+        }
     }
 
     /** Wrap a collection to provide heartbeat progress updates as it's iterated through */
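The new guard logs only when the value crosses into a new decile, which keeps fine-grained step counts from flooding the log; the idiom in isolation:

// Log only when the tens digit changes: 73 -> 78 stays silent, 78 -> 81 logs.
boolean logAt78 = (78 / 10) != (73 / 10); // false
boolean logAt81 = (81 / 10) != (78 / 10); // true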
@@ -0,0 +1,59 @@
+package nu.marginalia.process.control;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import com.zaxxer.hikari.HikariDataSource;
+import nu.marginalia.process.ProcessConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.sql.SQLException;
+import java.util.Objects;
+import java.util.UUID;
+
+@Singleton
+public class ProcessEventLog {
+    private final HikariDataSource dataSource;
+
+    private final Logger logger = LoggerFactory.getLogger(ProcessEventLog.class);
+
+    private final String serviceName;
+    private final UUID instanceUuid;
+    private final String serviceBase;
+
+    @Inject
+    public ProcessEventLog(HikariDataSource dataSource, ProcessConfiguration configuration) {
+        this.dataSource = dataSource;
+
+        this.serviceName = configuration.processName() + ":" + configuration.node();
+        this.instanceUuid = configuration.instanceUuid();
+        this.serviceBase = configuration.processName();
+
+        logger.info("Starting service {} instance {}", serviceName, instanceUuid);
+
+        logEvent("PCS-START", serviceName);
+    }
+
+    public void logEvent(Class<?> type, String message) {
+        logEvent(type.getSimpleName(), message);
+    }
+
+    public void logEvent(String type, String message) {
+        try (var conn = dataSource.getConnection();
+             var stmt = conn.prepareStatement("""
+                     INSERT INTO SERVICE_EVENTLOG(SERVICE_NAME, SERVICE_BASE, INSTANCE, EVENT_TYPE, EVENT_MESSAGE)
+                     VALUES (?, ?, ?, ?, ?)
+                     """)) {
+            stmt.setString(1, serviceName);
+            stmt.setString(2, serviceBase);
+            stmt.setString(3, instanceUuid.toString());
+            stmt.setString(4, type);
+            stmt.setString(5, Objects.requireNonNullElse(message, ""));
+
+            stmt.executeUpdate();
+        }
+        catch (SQLException ex) {
+            logger.error("Failed to log event {}:{}", type, message, ex);
+        }
+    }
+}
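A usage sketch for the new event log, assuming Guice injects the singleton; both overloads end up as rows in SERVICE_EVENTLOG:

@Inject
private ProcessEventLog eventLog;

void onBatchDone() {
    eventLog.logEvent("CRAWL-DONE", "finished batch 4"); // explicit event type
    eventLog.logEvent(getClass(), "finished batch 4");   // type derived from the simple class name
}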
@@ -10,7 +10,9 @@ import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.time.LocalDateTime;
-import java.util.*;
+import java.util.HashSet;
+import java.util.Optional;
+import java.util.Set;
 import java.util.function.Function;
 
 /** WorkLog is a journal of work done by a process,
@@ -61,6 +63,12 @@ public class WorkLog implements AutoCloseable, Closeable {
         return new WorkLoadIterable<>(logFile, mapper);
     }
 
+    public static int countEntries(Path crawlerLog) throws IOException {
+        try (var linesStream = Files.lines(crawlerLog)) {
+            return (int) linesStream.filter(WorkLogEntry::isJobId).count();
+        }
+    }
+
     // Use synchro over concurrent set to avoid competing writes
     // - correct is better than fast here, it's sketchy enough to use
     // a PrintWriter
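A sketch of the new helper in use (the file location is assumed); only lines WorkLogEntry recognizes as job ids are counted:

Path crawlerLog = Path.of("/storage/crawl-data/crawler.log"); // hypothetical path
int finishedJobs = WorkLog.countEntries(crawlerLog);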
@@ -57,16 +57,13 @@ public class ServiceAdHocTaskHeartbeatImpl implements AutoCloseable, ServiceAdHo
      */
     @Override
     public void progress(String step, int stepProgress, int stepCount) {
+        int lastProgress = this.progress;
         this.step = step;
 
-
-        // off by one since we calculate the progress based on the number of steps,
-        // and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
-        // final progress being 80% and not 100%)
-
         this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);
 
-        logger.info("ServiceTask {} progress: {}%", taskBase, progress);
+        if (this.progress / 10 != lastProgress / 10) {
+            logger.info("ServiceTask {} progress: {}%", taskBase, progress);
+        }
     }
 
     public void shutDown() {
@@ -1,17 +1,21 @@
 package nu.marginalia.service.discovery;
 
-import nu.marginalia.service.discovery.monitor.*;
+import com.google.inject.ImplementedBy;
+import nu.marginalia.service.discovery.monitor.ServiceChangeMonitor;
+import nu.marginalia.service.discovery.monitor.ServiceMonitorIf;
 import nu.marginalia.service.discovery.property.ServiceEndpoint;
-import static nu.marginalia.service.discovery.property.ServiceEndpoint.*;
-
 import nu.marginalia.service.discovery.property.ServiceKey;
+import org.apache.curator.framework.recipes.locks.InterProcessSemaphoreV2;
 
 import java.util.List;
 import java.util.UUID;
 
+import static nu.marginalia.service.discovery.property.ServiceEndpoint.InstanceAddress;
+
 /** A service registry that allows services to register themselves and
  * be discovered by other services on the network.
  */
+@ImplementedBy(ZkServiceRegistry.class)
 public interface ServiceRegistryIf {
     /**
      * Register a service with the registry.
@@ -57,4 +61,9 @@ public interface ServiceRegistryIf {
      * </ul>
      * */
     void registerMonitor(ServiceMonitorIf monitor) throws Exception;
+
+    void registerProcess(String processName, int nodeId);
+    void deregisterProcess(String processName, int nodeId);
+
+    InterProcessSemaphoreV2 getSemaphore(String name, int permits) throws Exception;
 }
@@ -6,6 +6,7 @@ import nu.marginalia.service.discovery.monitor.ServiceMonitorIf;
 import nu.marginalia.service.discovery.property.ServiceEndpoint;
 import nu.marginalia.service.discovery.property.ServiceKey;
 import org.apache.curator.framework.CuratorFramework;
+import org.apache.curator.framework.recipes.locks.InterProcessSemaphoreV2;
 import org.apache.curator.utils.ZKPaths;
 import org.apache.zookeeper.CreateMode;
 import org.apache.zookeeper.Watcher;
@@ -256,6 +257,42 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
                 .forPath("/running-instances");
     }
 
+    @Override
+    public void registerProcess(String processName, int nodeId) {
+        String path = "/process-locks/" + processName + "/" + nodeId;
+        try {
+            curatorFramework.create()
+                    .creatingParentsIfNeeded()
+                    .withMode(CreateMode.EPHEMERAL)
+                    .forPath(path);
+            livenessPaths.add(path);
+        }
+        catch (Exception ex) {
+            logger.error("Failed to register process {} on node {}", processName, nodeId, ex);
+        }
+    }
+
+    @Override
+    public void deregisterProcess(String processName, int nodeId) {
+        String path = "/process-locks/" + processName + "/" + nodeId;
+        try {
+            curatorFramework.delete().forPath(path);
+            livenessPaths.remove(path);
+        }
+        catch (Exception ex) {
+            logger.error("Failed to deregister process {} on node {}", processName, nodeId, ex);
+        }
+    }
+
+    @Override
+    public InterProcessSemaphoreV2 getSemaphore(String name, int permits) {
+        if (stopped)
+            throw new IllegalStateException("Service registry is stopped, cannot get semaphore " + name);
+
+        String path = "/semaphores/" + name;
+        return new InterProcessSemaphoreV2(curatorFramework, path, permits);
+    }
+
     /* Exposed for tests */
     public synchronized void shutDown() {
         if (stopped)
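A lease-taking sketch for the new semaphore accessor; acquire() and returnLease() are Curator's InterProcessSemaphoreV2 API (org.apache.curator.framework.recipes.locks.Lease), while the registry variable and semaphore name are assumed:

InterProcessSemaphoreV2 semaphore = registry.getSemaphore("crawler-slots", 4);
Lease lease = semaphore.acquire(); // blocks until one of the 4 permits is free
try {
    // do the concurrency-limited work
}
finally {
    semaphore.returnLease(lease);
}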
@@ -89,7 +89,7 @@ public class DatabaseModule extends AbstractModule {
         config.addDataSourceProperty("prepStmtCacheSize", "250");
         config.addDataSourceProperty("prepStmtCacheSqlLimit", "2048");
 
-        config.setMaximumPoolSize(5);
+        config.setMaximumPoolSize(Integer.getInteger("db.poolSize", 5));
         config.setMinimumIdle(2);
 
         config.setMaxLifetime(Duration.ofMinutes(9).toMillis());
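Since the pool size is now read via Integer.getInteger, it can be tuned per service with a JVM system property (e.g. -Ddb.poolSize=10) without a rebuild:

int poolSize = Integer.getInteger("db.poolSize", 5); // 5 unless -Ddb.poolSize=N is passed on the command line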
@@ -6,6 +6,7 @@ import nu.marginalia.service.ServiceId;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.IOException;
 import java.net.InetAddress;
 import java.net.NetworkInterface;
 import java.util.Enumeration;
@@ -115,11 +116,12 @@ public class ServiceConfigurationModule extends AbstractModule {
         }
     }
 
-    public static String getLocalNetworkIP() throws Exception {
+    public static String getLocalNetworkIP() throws IOException {
         Enumeration<NetworkInterface> nets = NetworkInterface.getNetworkInterfaces();
 
         while (nets.hasMoreElements()) {
             NetworkInterface netif = nets.nextElement();
+            logger.info("Considering network interface {}: Up? {}, Loopback? {}", netif.getDisplayName(), netif.isUp(), netif.isLoopback());
             if (!netif.isUp() || netif.isLoopback()) {
                 continue;
             }
@@ -127,6 +129,7 @@ public class ServiceConfigurationModule extends AbstractModule {
             Enumeration<InetAddress> inetAddresses = netif.getInetAddresses();
             while (inetAddresses.hasMoreElements()) {
                 InetAddress addr = inetAddresses.nextElement();
+                logger.info("Considering address {}: SiteLocal? {}, Loopback? {}", addr.getHostAddress(), addr.isSiteLocalAddress(), addr.isLoopbackAddress());
                 if (addr.isSiteLocalAddress() && !addr.isLoopbackAddress()) {
                     return addr.getHostAddress();
                 }
@@ -15,6 +15,7 @@ import org.slf4j.LoggerFactory;
 import org.slf4j.Marker;
 import org.slf4j.MarkerFactory;
 
+import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.List;
@@ -106,9 +107,12 @@ public class JoobyService {
                 config.externalAddress());
 
         // FIXME: This won't work outside of docker, may need to submit a PR to jooby to allow classpaths here
-        jooby.install(new JteModule(Path.of("/app/resources/jte"), Path.of("/app/classes/jte-precompiled")));
-        jooby.assets("/*", Paths.get("/app/resources/static"));
+        if (Files.exists(Path.of("/app/resources/jte")) || Files.exists(Path.of("/app/classes/jte-precompiled"))) {
+            jooby.install(new JteModule(Path.of("/app/resources/jte"), Path.of("/app/classes/jte-precompiled")));
+        }
+        if (Files.exists(Path.of("/app/resources/static"))) {
+            jooby.assets("/*", Paths.get("/app/resources/static"));
+        }
         var options = new ServerOptions();
         options.setHost(config.bindAddress());
         options.setPort(restEndpoint.port());
@@ -118,6 +122,11 @@ public class JoobyService {
         // single digit percentage difference since HTML already compresses very well with level = 1.
         options.setCompressionLevel(1);
 
+        // Set a cap on the number of worker threads, as Jooby's default value does not seem to consider
+        // multi-tenant servers with high thread counts, and spins up an exorbitant number of threads in that
+        // scenario
+        options.setWorkerThreads(Math.min(128, options.getWorkerThreads()));
+
         jooby.setServerOptions(options);
 
@@ -6,25 +6,36 @@ import nu.marginalia.service.module.ServiceConfiguration;
 import org.eclipse.jetty.server.Server;
 import org.eclipse.jetty.servlet.ServletContextHandler;
 import org.eclipse.jetty.servlet.ServletHolder;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.net.InetSocketAddress;
 
 public class MetricsServer {
 
+    private static final Logger logger = LoggerFactory.getLogger(MetricsServer.class);
+
     @Inject
-    public MetricsServer(ServiceConfiguration configuration) throws Exception {
+    public MetricsServer(ServiceConfiguration configuration) {
         // If less than zero, we forego setting up a metrics server
         if (configuration.metricsPort() < 0)
             return;
 
-        Server server = new Server(new InetSocketAddress(configuration.bindAddress(), configuration.metricsPort()));
+        try {
+            Server server = new Server(new InetSocketAddress(configuration.bindAddress(), configuration.metricsPort()));
 
-        ServletContextHandler context = new ServletContextHandler();
-        context.setContextPath("/");
-        server.setHandler(context);
+            ServletContextHandler context = new ServletContextHandler();
+            context.setContextPath("/");
+            server.setHandler(context);
 
-        context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");
+            context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");
 
-        server.start();
+            logger.info("MetricsServer listening on {}:{}", configuration.bindAddress(), configuration.metricsPort());
+
+            server.start();
+        }
+        catch (Exception|NoSuchMethodError ex) {
+            logger.error("Failed to set up metrics server", ex);
+        }
     }
 }
@@ -35,21 +35,8 @@ public class RateLimiter {
     }
 
-
-    public static RateLimiter forExpensiveRequest() {
-        return new RateLimiter(5, 10);
-    }
-
     public static RateLimiter custom(int perMinute) {
-        return new RateLimiter(perMinute, 60);
+        return new RateLimiter(4 * perMinute, perMinute);
     }
-
-    public static RateLimiter forSpamBots() {
-        return new RateLimiter(120, 3600);
-    }
-
-    public static RateLimiter forLogin() {
-        return new RateLimiter(3, 15);
-    }
 
     private void cleanIdleBuckets() {
@@ -62,7 +49,7 @@ public class RateLimiter {
     }
 
     private Bucket createBucket() {
-        var refill = Refill.greedy(1, Duration.ofSeconds(refillRate));
+        var refill = Refill.greedy(refillRate, Duration.ofSeconds(60));
        var bw = Bandwidth.classic(capacity, refill);
        return Bucket.builder().addLimit(bw).build();
     }
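Under the reworked scheme, custom(perMinute) yields a burst capacity of four minutes' worth of tokens refilled at the nominal rate; spelled out for custom(60), using the same Bucket4j calls as createBucket() above:

// capacity = 4 * 60 = 240 tokens; refill = 60 tokens per 60 seconds.
// A client may burst up to 240 requests, then sustain one request per second.
var refill = Refill.greedy(60, Duration.ofSeconds(60));
var bw = Bandwidth.classic(240, refill);
var bucket = Bucket.builder().addLimit(bw).build();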
@@ -3,8 +3,16 @@
         <Console name="Console" target="SYSTEM_OUT">
             <PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
             <Filters>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
                 <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
                 <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ProcessConsole" target="SYSTEM_OUT">
+            <PatternLayout pattern="%style{P}{FG_Cyan} %msg%n"/>
+            <Filters>
+                <MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
             </Filters>
         </Console>
         <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
@@ -13,15 +21,29 @@
             <Filters>
                 <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
                 <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
             </Filters>
             <SizeBasedTriggeringPolicy size="10MB" />
         </RollingFile>
+        <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
+                     ignoreExceptions="false">
+            <PatternLayout>
+                <Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
+            </PatternLayout>
+            <SizeBasedTriggeringPolicy size="100MB" />
+            <Filters>
+                <MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
+            </Filters>
+        </RollingFile>
     </Appenders>
     <Loggers>
         <Logger name="org.apache.zookeeper" level="WARN" />
+        <Logger name="org.apache.pdfbox" level="ERROR" />
+        <Logger name="org.apache.fontbox.ttf" level="ERROR" />
         <Root level="info">
             <AppenderRef ref="Console"/>
+            <AppenderRef ref="ProcessConsole"/>
             <AppenderRef ref="LogToFile"/>
         </Root>
     </Loggers>
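A sketch of how a class might feed the new crawler audit appender (the class name here is hypothetical): messages tagged with the CRAWLER marker match the ALLOW filter above and land in crawler-audit-*.log:

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Marker;
import org.slf4j.MarkerFactory;

class CrawlerAuditExample {
    private static final Logger logger = LoggerFactory.getLogger(CrawlerAuditExample.class);
    private static final Marker CRAWLER = MarkerFactory.getMarker("CRAWLER");

    void recordFetch(String url, int status) {
        logger.info(CRAWLER, "Fetched {} -> {}", url, status);
    }
}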
@@ -1,10 +1,49 @@
 <Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
     <Appenders>
-        <Console name="Console" target="SYSTEM_OUT">
-            <PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
+        <Console name="ConsoleInfo" target="SYSTEM_OUT">
+            <PatternLayout pattern="- %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
             <Filters>
+                <LevelMatchFilter level="INFO" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
                 <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
                 <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleWarn" target="SYSTEM_OUT">
+            <PatternLayout pattern="⚠ %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="WARN" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleError" target="SYSTEM_OUT">
+            <PatternLayout pattern="🔥 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="ERROR" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleFatal" target="SYSTEM_OUT">
+            <PatternLayout pattern="💀 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="FATAL" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ProcessConsole" target="SYSTEM_OUT">
+            <PatternLayout pattern="%style{%msg%n}{FG_Cyan}"/>
+            <Filters>
+                <MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
             </Filters>
         </Console>
         <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
@@ -17,14 +56,30 @@
                 <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
                 <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
                 <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+                <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </RollingFile>
+        <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
+                     ignoreExceptions="false">
+            <PatternLayout>
+                <Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
+            </PatternLayout>
+            <SizeBasedTriggeringPolicy size="100MB" />
+            <Filters>
+                <MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
             </Filters>
         </RollingFile>
     </Appenders>
     <Loggers>
         <Logger name="org.apache.zookeeper" level="WARN" />
+        <Logger name="org.apache.pdfbox" level="ERROR" />
+        <Logger name="org.apache.fontbox.ttf" level="ERROR" />
         <Root level="info">
-            <AppenderRef ref="Console"/>
+            <AppenderRef ref="ConsoleInfo"/>
+            <AppenderRef ref="ConsoleWarn"/>
+            <AppenderRef ref="ConsoleError"/>
+            <AppenderRef ref="ConsoleFatal"/>
+            <AppenderRef ref="ProcessConsole"/>
             <AppenderRef ref="LogToFile"/>
         </Root>
     </Loggers>
@@ -1,15 +1,50 @@
 <Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
     <Appenders>
-        <Console name="Console" target="SYSTEM_OUT">
-            <PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
+        <Console name="ConsoleInfo" target="SYSTEM_OUT">
+            <PatternLayout pattern="- %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="INFO" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleWarn" target="SYSTEM_OUT">
+            <PatternLayout pattern="⚠ %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="WARN" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleError" target="SYSTEM_OUT">
+            <PatternLayout pattern="🔥 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="ERROR" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ConsoleFatal" target="SYSTEM_OUT">
+            <PatternLayout pattern="💀 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
+            <Filters>
+                <LevelMatchFilter level="FATAL" onMatch="ALLOW" onMismatch="DENY"/>
+                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
+            </Filters>
+        </Console>
+        <Console name="ProcessConsole" target="SYSTEM_OUT">
+            <PatternLayout pattern="%style{%msg%n}{FG_Cyan}"/>
+            <Filters>
+                <MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
+            </Filters>
+        </Console>
     </Appenders>
     <Loggers>
         <Logger name="org.apache.zookeeper" level="WARN" />
+        <Logger name="org.apache.pdfbox" level="ERROR" />
+        <Logger name="org.apache.fontbox.ttf" level="ERROR" />
         <Root level="info">
-            <AppenderRef ref="Console"/>
-            <AppenderRef ref="LogToFile"/>
+            <AppenderRef ref="ConsoleInfo"/>
+            <AppenderRef ref="ConsoleWarn"/>
+            <AppenderRef ref="ConsoleError"/>
+            <AppenderRef ref="ConsoleFatal"/>
+            <AppenderRef ref="ProcessConsole"/>
         </Root>
     </Loggers>
 </Configuration>
@@ -25,7 +25,7 @@ import static org.mockito.Mockito.when;
 class ZkServiceRegistryTest {
     private static final int ZOOKEEPER_PORT = 2181;
     private static final GenericContainer<?> zookeeper =
-            new GenericContainer<>("zookeeper:3.8.0")
+            new GenericContainer<>("zookeeper:3.8")
                     .withExposedPorts(ZOOKEEPER_PORT);
 
     List<ZkServiceRegistry> registries = new ArrayList<>();
@@ -48,12 +48,13 @@ public class ExecutorExportClient {
         return msgId;
     }
 
-    public void exportSampleData(int node, FileStorageId fid, int size, String name) {
+    public void exportSampleData(int node, FileStorageId fid, int size, String ctFilter, String name) {
         channelPool.call(ExecutorExportApiBlockingStub::exportSampleData)
                 .forNode(node)
                 .run(RpcExportSampleData.newBuilder()
                         .setFileStorageId(fid.id())
                         .setSize(size)
+                        .setCtFilter(ctFilter)
                         .setName(name)
                         .build());
     }
@@ -100,6 +100,7 @@ message RpcExportSampleData {
     int64 fileStorageId = 1;
     int32 size = 2;
     string name = 3;
+    string ctFilter = 4;
 }
 message RpcDownloadSampleData {
     string sampleSet = 1;
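A hypothetical call site for the widened client method (the node id, storage id variable, and filter value are invented for illustration), passing a content-type filter alongside the existing arguments:

// fid is an existing FileStorageId; "text/html" would restrict the sample to HTML documents
exportClient.exportSampleData(1, fid, 1000, "text/html", "html-sample");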
@@ -19,6 +19,7 @@ dependencies {
    implementation project(':code:processes:crawling-process')
    implementation project(':code:processes:live-crawling-process')
    implementation project(':code:processes:loading-process')
+   implementation project(':code:processes:ping-process')
    implementation project(':code:processes:converting-process')
    implementation project(':code:processes:index-constructor-process')
 
@@ -37,6 +38,7 @@ dependencies {
    implementation project(':code:functions:link-graph:api')
    implementation project(':code:functions:live-capture:api')
    implementation project(':code:functions:search-query')
+   implementation project(':code:functions:nsfw-domain-filter')
    implementation project(':code:execution:api')
 
    implementation project(':code:processes:crawling-process:model')
@@ -6,11 +6,13 @@ import java.util.Set;
 
 public enum ExecutorActor {
     PREC_EXPORT_ALL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    SYNC_NSFW_LISTS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
 
     CRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
     RECRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
     RECRAWL_SINGLE_DOMAIN(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
     PROC_CRAWLER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    PROC_PING_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.REALTIME),
     PROC_EXPORT_TASKS_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
     ADJACENCY_CALCULATION(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
     EXPORT_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
@@ -20,6 +22,7 @@ public enum ExecutorActor {
     EXPORT_FEEDS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
     EXPORT_SAMPLE_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
     DOWNLOAD_SAMPLE(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+    MIGRATE_CRAWL_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
 
     PROC_CONVERTER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD),
     PROC_LOADER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD),
@@ -34,7 +37,8 @@ public enum ExecutorActor {
     LIVE_CRAWL(NodeProfile.REALTIME),
     PROC_LIVE_CRAWL_SPAWNER(NodeProfile.REALTIME),
     SCRAPE_FEEDS(NodeProfile.REALTIME),
-    UPDATE_RSS(NodeProfile.REALTIME);
+    UPDATE_RSS(NodeProfile.REALTIME)
+    ;
 
     public String id() {
         return "fsm:" + name().toLowerCase();
@@ -49,6 +49,7 @@ public class ExecutorActorControlService {
                                        RecrawlSingleDomainActor recrawlSingleDomainActor,
                                        RestoreBackupActor restoreBackupActor,
                                        ConverterMonitorActor converterMonitorFSM,
+                                       PingMonitorActor pingMonitorActor,
                                        CrawlerMonitorActor crawlerMonitorActor,
                                        LiveCrawlerMonitorActor liveCrawlerMonitorActor,
                                        LoaderMonitorActor loaderMonitor,
@@ -66,7 +67,9 @@ public class ExecutorActorControlService {
                                        DownloadSampleActor downloadSampleActor,
                                        ScrapeFeedsActor scrapeFeedsActor,
                                        ExecutorActorStateMachines stateMachines,
+                                       MigrateCrawlDataActor migrateCrawlDataActor,
                                        ExportAllPrecessionActor exportAllPrecessionActor,
+                                       UpdateNsfwFiltersActor updateNsfwFiltersActor,
                                        UpdateRssActor updateRssActor) throws SQLException {
         this.messageQueueFactory = messageQueueFactory;
         this.eventLog = baseServiceParams.eventLog;
@@ -87,6 +90,7 @@ public class ExecutorActorControlService {
         register(ExecutorActor.PROC_CONVERTER_SPAWNER, converterMonitorFSM);
         register(ExecutorActor.PROC_LOADER_SPAWNER, loaderMonitor);
         register(ExecutorActor.PROC_CRAWLER_SPAWNER, crawlerMonitorActor);
+        register(ExecutorActor.PROC_PING_SPAWNER, pingMonitorActor);
         register(ExecutorActor.PROC_LIVE_CRAWL_SPAWNER, liveCrawlerMonitorActor);
         register(ExecutorActor.PROC_EXPORT_TASKS_SPAWNER, exportTasksMonitorActor);
 
@@ -107,6 +111,9 @@ public class ExecutorActorControlService {
         register(ExecutorActor.SCRAPE_FEEDS, scrapeFeedsActor);
         register(ExecutorActor.UPDATE_RSS, updateRssActor);
 
+        register(ExecutorActor.MIGRATE_CRAWL_DATA, migrateCrawlDataActor);
+        register(ExecutorActor.SYNC_NSFW_LISTS, updateNsfwFiltersActor);
+
         if (serviceConfiguration.node() == 1) {
             register(ExecutorActor.PREC_EXPORT_ALL, exportAllPrecessionActor);
         }
@@ -0,0 +1,178 @@
+package nu.marginalia.actor.proc;
+
+import com.google.gson.Gson;
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import nu.marginalia.actor.prototype.RecordActorPrototype;
+import nu.marginalia.actor.state.ActorResumeBehavior;
+import nu.marginalia.actor.state.ActorStep;
+import nu.marginalia.actor.state.Resume;
+import nu.marginalia.actor.state.Terminal;
+import nu.marginalia.mq.MqMessageState;
+import nu.marginalia.mq.persistence.MqMessageHandlerRegistry;
+import nu.marginalia.mq.persistence.MqPersistence;
+import nu.marginalia.mqapi.ProcessInboxNames;
+import nu.marginalia.mqapi.ping.PingRequest;
+import nu.marginalia.process.ProcessService;
+import nu.marginalia.service.module.ServiceConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.sql.SQLException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+@Singleton
+public class PingMonitorActor extends RecordActorPrototype {
+
+    private final MqPersistence persistence;
+    private final ProcessService processService;
+
+    private final Logger logger = LoggerFactory.getLogger(getClass());
+
+    public static final int MAX_ATTEMPTS = 3;
+    private final String inboxName;
+    private final ProcessService.ProcessId processId;
+    private final ExecutorService executorService = Executors.newSingleThreadExecutor();
+    private final int node;
+    private final Gson gson;
+
+    public record Initial() implements ActorStep {}
+    @Resume(behavior = ActorResumeBehavior.RETRY)
+    public record Monitor(int errorAttempts) implements ActorStep {}
+    @Resume(behavior = ActorResumeBehavior.RESTART)
+    public record Run(int attempts) implements ActorStep {}
+    @Terminal
+    public record Aborted() implements ActorStep {}
+
+    @Override
+    public ActorStep transition(ActorStep self) throws Exception {
+        return switch (self) {
+            case Initial i -> {
+                PingRequest request = new PingRequest();
+
+                persistence.sendNewMessage(inboxName, null, null,
+                        "PingRequest",
+                        gson.toJson(request),
+                        null);
+
+                yield new Monitor(0);
+            }
+            case Monitor(int errorAttempts) -> {
+                for (;;) {
+                    var messages = persistence.eavesdrop(inboxName, 1);
+
+                    if (messages.isEmpty() && !processService.isRunning(processId)) {
+                        synchronized (processId) {
+                            processId.wait(5000);
+                        }
+
+                        if (errorAttempts > 0) { // Reset the error counter if there is silence in the inbox
+                            yield new Monitor(0);
+                        }
+                        // else continue
+                    } else {
+                        // Special: Associate this thread with the message so that we can get tracking
+                        MqMessageHandlerRegistry.register(messages.getFirst().msgId());
+
+                        yield new Run(0);
+                    }
+                }
+            }
+            case Run(int attempts) -> {
+                try {
+                    long startTime = System.currentTimeMillis();
+                    var exec = new TaskExecution();
+                    long endTime = System.currentTimeMillis();
+
+                    if (exec.isError()) {
+                        if (attempts < MAX_ATTEMPTS)
+                            yield new Run(attempts + 1);
+                        else
+                            yield new Error();
+                    }
+                    else if (endTime - startTime < TimeUnit.SECONDS.toMillis(1)) {
+                        // To avoid boot loops, we transition to error if the process
+                        // didn't run for longer than 1 second. This might happen if
+                        // the process crashes before it can reach the heartbeat and inbox
+                        // stages of execution. In this case it would not report having acted
+                        // on its message, and the process would be restarted forever without
+                        // the attempts counter incrementing.
+                        yield new Error("Process terminated within 1 seconds of starting");
+                    }
+                }
+                catch (InterruptedException ex) {
+                    // We get this exception when the process is cancelled by the user
+
+                    processService.kill(processId);
+                    setCurrentMessageToDead();
+
+                    yield new Aborted();
+                }
+
+                yield new Monitor(attempts);
+            }
+            default -> new Error();
+        };
+    }
+
+    public String describe() {
+        return "Spawns a(n) " + processId + " process and monitors its inbox for messages";
+    }
+
+    @Inject
+    public PingMonitorActor(Gson gson,
+                            ServiceConfiguration configuration,
+                            MqPersistence persistence,
+                            ProcessService processService) throws SQLException {
+        super(gson);
+        this.gson = gson;
+        this.node = configuration.node();
+        this.persistence = persistence;
+        this.processService = processService;
+        this.inboxName = ProcessInboxNames.PING_INBOX + ":" + node;
+        this.processId = ProcessService.ProcessId.PING;
+    }
+
+    /** Sets the message to dead in the database to avoid
+     * the service respawning on the same task when we
+     * re-enable this actor */
+    private void setCurrentMessageToDead() {
+        try {
+            var messages = persistence.eavesdrop(inboxName, 1);
+
+            if (messages.isEmpty()) // Possibly a race condition where the task is already finished
+                return;
+
+            var theMessage = messages.iterator().next();
+            persistence.updateMessageState(theMessage.msgId(), MqMessageState.DEAD);
+        }
+        catch (SQLException ex) {
+            logger.error("Tried but failed to set the message for " + processId + " to dead", ex);
+        }
+    }
+
+    /** Encapsulates the execution of the process in a separate thread so that
+     * we can interrupt the thread if the process is cancelled */
+    private class TaskExecution {
+        private final AtomicBoolean error = new AtomicBoolean(false);
+        public TaskExecution() throws ExecutionException, InterruptedException {
+            // Run this call in a separate thread so that this thread can be interrupted waiting for it
+            executorService.submit(() -> {
+                try {
+                    processService.trigger(processId);
+                } catch (Exception e) {
+                    logger.warn("Error in triggering process", e);
+                    error.set(true);
+                }
+            }).get(); // Wait for the process to start
+        }
+
+        public boolean isError() {
+            return error.get();
+        }
+    }
+}
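The actor above couples three concerns: it seeds the ping inbox, polls for work, and respawns the process with a bounded retry budget and a boot-loop guard. A minimal standalone sketch of that spawn-and-monitor loop, with the message queue and process service reduced to hypothetical stand-in interfaces (Inbox and Proc are not Marginalia APIs):

import java.util.Optional;
import java.util.concurrent.TimeUnit;

class SpawnMonitorSketch {
    interface Inbox { Optional<Long> peekMessageId(); }
    interface Proc {
        boolean isRunning();
        void trigger() throws Exception; // blocks until the spawned process exits
    }

    static final int MAX_ATTEMPTS = 3;

    static void monitor(Inbox inbox, Proc process) throws Exception {
        int attempts = 0;
        while (true) {
            if (inbox.peekMessageId().isEmpty() && !process.isRunning()) {
                TimeUnit.SECONDS.sleep(5); // idle wait; silence in the inbox resets the error budget
                attempts = 0;
                continue;
            }
            long start = System.currentTimeMillis();
            try {
                process.trigger();
            }
            catch (Exception ex) {
                if (++attempts >= MAX_ATTEMPTS) throw ex; // give up after repeated failures
                continue;
            }
            // Boot-loop guard: a process that dies within a second of starting likely
            // never consumed its message and would otherwise respawn forever.
            if (System.currentTimeMillis() - start < 1000)
                throw new IllegalStateException("Process terminated within 1 second of starting");
            attempts = 0; // successful run
        }
    }
}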
@@ -14,6 +14,8 @@ import nu.marginalia.mq.persistence.MqPersistence;
 import nu.marginalia.nodecfg.NodeConfigurationService;
 import nu.marginalia.nodecfg.model.NodeProfile;
 import nu.marginalia.service.module.ServiceConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;

 import java.time.Duration;
 import java.time.LocalDateTime;
@@ -29,6 +31,7 @@ public class UpdateRssActor extends RecordActorPrototype {

     private final NodeConfigurationService nodeConfigurationService;
     private final MqPersistence persistence;
+    private static final Logger logger = LoggerFactory.getLogger(UpdateRssActor.class);

     @Inject
     public UpdateRssActor(Gson gson,
@@ -101,8 +104,8 @@ public class UpdateRssActor extends RecordActorPrototype {
             case UpdateRefresh(int count, long msgId) -> {
                 MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
                 if (msg == null) {
-                    // Retry the update
-                    yield new Error("Failed to update feeds: message not found");
+                    logger.warn("UpdateRefresh is taking a very long time");
+                    yield new UpdateRefresh(count, msgId);
                 } else if (msg.state() != MqMessageState.OK) {
                     // Retry the update
                     yield new Error("Failed to update feeds: " + msg.state());
@@ -119,8 +122,8 @@ public class UpdateRssActor extends RecordActorPrototype {
             case UpdateClean(long msgId) -> {
                 MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
                 if (msg == null) {
-                    // Retry the update
-                    yield new Error("Failed to update feeds: message not found");
+                    logger.warn("UpdateClean is taking a very long time");
+                    yield new UpdateClean(msgId);
                 } else if (msg.state() != MqMessageState.OK) {
                     // Retry the update
                     yield new Error("Failed to update feeds: " + msg.state());
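The change above turns a wait timeout into a retry: when waitForMessageTerminalState returns null, the actor now logs a warning and yields the same step again instead of failing the whole feed update. The same wait-and-retry shape in isolation, with the message queue reduced to a hypothetical Supplier that returns null while the task is still running:

import java.time.Duration;
import java.util.function.Supplier;

class WaitWithRetrySketch {
    enum TerminalState { OK, ERROR }

    /** Polls for a terminal state; a null result means the task is still running,
     *  so we log and keep waiting rather than giving up. */
    static TerminalState awaitTerminal(Supplier<TerminalState> poll, Duration patience) throws InterruptedException {
        while (true) {
            TerminalState state = poll.get();
            if (state != null)
                return state;
            System.err.println("Task is taking a very long time, continuing to wait");
            Thread.sleep(patience.toMillis());
        }
    }
}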
@@ -8,6 +8,7 @@ import nu.marginalia.actor.state.ActorResumeBehavior;
 import nu.marginalia.actor.state.ActorStep;
 import nu.marginalia.actor.state.Resume;
 import nu.marginalia.service.control.ServiceEventLog;
+import nu.marginalia.service.control.ServiceHeartbeat;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorage;
 import nu.marginalia.storage.model.FileStorageId;
@@ -19,6 +20,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.io.*;
+import java.net.HttpURLConnection;
 import java.net.MalformedURLException;
 import java.net.URI;
 import java.net.URL;
@@ -32,6 +34,7 @@ public class DownloadSampleActor extends RecordActorPrototype {

     private final FileStorageService storageService;
     private final ServiceEventLog eventLog;
+    private final ServiceHeartbeat heartbeat;
     private final Logger logger = LoggerFactory.getLogger(getClass());

 @Resume(behavior = ActorResumeBehavior.ERROR)
@@ -66,15 +69,39 @@ public class DownloadSampleActor extends RecordActorPrototype {

                 Files.deleteIfExists(Path.of(tarFileName));

-                try (var is = new BufferedInputStream(new URI(downloadURI).toURL().openStream());
-                     var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
-                    is.transferTo(os);
+                HttpURLConnection urlConnection = (HttpURLConnection) new URI(downloadURI).toURL().openConnection();
+
+                try (var hb = heartbeat.createServiceAdHocTaskHeartbeat("Downloading sample")) {
+                    long size = urlConnection.getContentLengthLong();
+                    byte[] buffer = new byte[8192];
+
+                    try (var is = new BufferedInputStream(urlConnection.getInputStream());
+                         var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
+                        long copiedSize = 0;
+
+                        while (copiedSize < size) {
+                            int read = is.read(buffer);
+
+                            if (read < 0) // We've been promised a file of length 'size'
+                                throw new IOException("Unexpected end of stream");
+
+                            os.write(buffer, 0, read);
+                            copiedSize += read;
+
+                            // Update progress bar
+                            hb.progress(String.format("%d MB", copiedSize / 1024 / 1024), (int) (copiedSize / 1024), (int) (size / 1024));
+                        }
+                    }
+
                 }
                 catch (Exception ex) {
                     eventLog.logEvent(DownloadSampleActor.class, "Error downloading sample");
                     logger.error("Error downloading sample", ex);
                     yield new Error();
                 }
+                finally {
+                    urlConnection.disconnect();
+                }

                 eventLog.logEvent(DownloadSampleActor.class, "Download complete");
                 yield new Extract(fileStorageId, tarFileName);
@@ -170,11 +197,12 @@ public class DownloadSampleActor extends RecordActorPrototype {
     @Inject
     public DownloadSampleActor(Gson gson,
                                FileStorageService storageService,
-                               ServiceEventLog eventLog)
+                               ServiceEventLog eventLog, ServiceHeartbeat heartbeat)
     {
         super(gson);
         this.storageService = storageService;
         this.eventLog = eventLog;
+        this.heartbeat = heartbeat;
     }

 }
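The rewritten download loop above trades InputStream.transferTo for a manual copy so it can validate the promised Content-Length and report progress through the heartbeat. A self-contained sketch of the same pattern, with the heartbeat replaced by a hypothetical Progress callback (and assuming the server actually sends a Content-Length header):

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URI;

class ProgressDownloadSketch {
    interface Progress { void update(long copied, long total); }

    static void download(String url, String destFile, Progress progress) throws Exception {
        HttpURLConnection conn = (HttpURLConnection) URI.create(url).toURL().openConnection();
        try (var is = new BufferedInputStream(conn.getInputStream());
             var os = new BufferedOutputStream(new FileOutputStream(destFile))) {
            long size = conn.getContentLengthLong(); // assumes the server sends Content-Length
            byte[] buffer = new byte[8192];
            long copied = 0;
            while (copied < size) {
                int read = is.read(buffer);
                if (read < 0) // we were promised 'size' bytes
                    throw new IOException("Unexpected end of stream");
                os.write(buffer, 0, read);
                copied += read;
                progress.update(copied, size);
            }
        }
        finally {
            conn.disconnect();
        }
    }
}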
@@ -26,32 +26,32 @@ public class ExportSampleDataActor extends RecordActorPrototype {
     private final MqOutbox exportTasksOutbox;
     private final Logger logger = LoggerFactory.getLogger(getClass());

-    public record Export(FileStorageId crawlId, int size, String name) implements ActorStep {}
-    public record Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) implements ActorStep {
-        public Run(FileStorageId crawlId, FileStorageId destId, int size, String name) {
-            this(crawlId, destId, size, name, -1);
+    public record Export(FileStorageId crawlId, int size, String ctFilter, String name) implements ActorStep {}
+    public record Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name, long msgId) implements ActorStep {
+        public Run(FileStorageId crawlId, FileStorageId destId, int size, String name, String ctFilter) {
+            this(crawlId, destId, size, name, ctFilter, -1);
         }
     }

     @Override
     public ActorStep transition(ActorStep self) throws Exception {
         return switch(self) {
-            case Export(FileStorageId crawlId, int size, String name) -> {
+            case Export(FileStorageId crawlId, int size, String ctFilter, String name) -> {
                 var storage = storageService.allocateStorage(FileStorageType.EXPORT,
                         "crawl-sample-export",
                         "Crawl Data Sample " + name + "/" + size + " " + LocalDateTime.now()
                 );

                 if (storage == null) yield new Error("Bad storage id");
-                yield new Run(crawlId, storage.id(), size, name);
+                yield new Run(crawlId, storage.id(), size, ctFilter, name);
             }
-            case Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) when msgId < 0 -> {
+            case Run(FileStorageId crawlId, FileStorageId destId, int size, String ctFilter, String name, long msgId) when msgId < 0 -> {
                 storageService.setFileStorageState(destId, FileStorageState.NEW);

-                long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, size, name));
-                yield new Run(crawlId, destId, size, name, newMsgId);
+                long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, ctFilter, size, name));
+                yield new Run(crawlId, destId, size, ctFilter, name, newMsgId);
             }
-            case Run(_, FileStorageId destId, _, _, long msgId) -> {
+            case Run(_, FileStorageId destId, _, _, _, long msgId) -> {
                 var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);

                 if (rsp.state() != MqMessageState.OK) {
@@ -70,7 +70,7 @@ public class ExportSampleDataActor extends RecordActorPrototype {

     @Override
     public String describe() {
-        return "Export RSS/Atom feeds from crawl data";
+        return "Export sample crawl data";
     }

     @Inject
@@ -0,0 +1,150 @@
+package nu.marginalia.actor.task;
+
+import com.google.gson.Gson;
+import jakarta.inject.Inject;
+import jakarta.inject.Singleton;
+import nu.marginalia.actor.prototype.RecordActorPrototype;
+import nu.marginalia.actor.state.ActorStep;
+import nu.marginalia.io.CrawlerOutputFile;
+import nu.marginalia.process.log.WorkLog;
+import nu.marginalia.process.log.WorkLogEntry;
+import nu.marginalia.service.control.ServiceHeartbeat;
+import nu.marginalia.slop.SlopCrawlDataRecord;
+import nu.marginalia.storage.FileStorageService;
+import nu.marginalia.storage.model.FileStorage;
+import nu.marginalia.storage.model.FileStorageId;
+import org.apache.logging.log4j.util.Strings;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+import java.util.Map;
+import java.util.Optional;
+import java.util.function.Function;
+
+@Singleton
+public class MigrateCrawlDataActor extends RecordActorPrototype {
+
+    private final FileStorageService fileStorageService;
+    private final ServiceHeartbeat serviceHeartbeat;
+    private static final Logger logger = LoggerFactory.getLogger(MigrateCrawlDataActor.class);
+
+    @Inject
+    public MigrateCrawlDataActor(Gson gson, FileStorageService fileStorageService, ServiceHeartbeat serviceHeartbeat) {
+        super(gson);
+
+        this.fileStorageService = fileStorageService;
+        this.serviceHeartbeat = serviceHeartbeat;
+    }
+
+    public record Run(long fileStorageId) implements ActorStep {}
+
+    @Override
+    public ActorStep transition(ActorStep self) throws Exception {
+        return switch (self) {
+            case Run(long fileStorageId) -> {
+
+                FileStorage storage = fileStorageService.getStorage(FileStorageId.of(fileStorageId));
+                Path root = storage.asPath();
+
+                Path crawlerLog = root.resolve("crawler.log");
+                Path newCrawlerLog = Files.createTempFile(root, "crawler", ".migrate.log");
+
+                int totalEntries = WorkLog.countEntries(crawlerLog);
+
+                try (WorkLog workLog = new WorkLog(newCrawlerLog);
+                     var heartbeat = serviceHeartbeat.createServiceAdHocTaskHeartbeat("Migrating")
+                ) {
+                    int entryIdx = 0;
+
+                    for (Map.Entry<WorkLogEntry, Path> item : WorkLog.iterableMap(crawlerLog, new CrawlDataLocator(root))) {
+
+                        final WorkLogEntry entry = item.getKey();
+                        final Path inputPath = item.getValue();
+
+                        Path outputPath = inputPath;
+                        heartbeat.progress("Migrating" + inputPath.getFileName(), entryIdx++, totalEntries);
+
+                        if (inputPath.toString().endsWith(".parquet")) {
+                            String domain = entry.id();
+                            String id = Integer.toHexString(domain.hashCode());
+
+                            outputPath = CrawlerOutputFile.createSlopPath(root, id, domain);
+
+                            if (Files.exists(inputPath)) {
+                                try {
+                                    SlopCrawlDataRecord.convertFromParquet(inputPath, outputPath);
+                                    Files.deleteIfExists(inputPath);
+                                } catch (Exception ex) {
+                                    outputPath = inputPath; // don't update the work log on error
+                                    logger.error("Failed to convert " + inputPath, ex);
+                                }
+                            }
+                            else if (!Files.exists(inputPath) && !Files.exists(outputPath)) {
+                                // if the input file is missing, and the output file is missing, we just write the log
+                                // record identical to the old one
+                                outputPath = inputPath;
+                            }
+                        }
+
+                        // Write a log entry for the (possibly) converted file
+                        workLog.setJobToFinished(entry.id(), outputPath.toString(), entry.cnt());
+                    }
+                }
+
+                Path oldCrawlerLog = Files.createTempFile(root, "crawler-", ".migrate.old.log");
+                Files.move(crawlerLog, oldCrawlerLog, StandardCopyOption.REPLACE_EXISTING);
+                Files.move(newCrawlerLog, crawlerLog);
+
+                yield new End();
+            }
+            default -> new Error();
+        };
+    }
+
+    private static class CrawlDataLocator implements Function<WorkLogEntry, Optional<Map.Entry<WorkLogEntry, Path>>> {
+
+        private final Path crawlRootDir;
+
+        CrawlDataLocator(Path crawlRootDir) {
+            this.crawlRootDir = crawlRootDir;
+        }
+
+        @Override
+        public Optional<Map.Entry<WorkLogEntry, Path>> apply(WorkLogEntry entry) {
+            var path = getCrawledFilePath(crawlRootDir, entry.path());
+
+            if (!Files.exists(path)) {
+                return Optional.empty();
+            }
+
+            try {
+                return Optional.of(Map.entry(entry, path));
+            }
+            catch (Exception ex) {
+                return Optional.empty();
+            }
+        }
+
+        private Path getCrawledFilePath(Path crawlDir, String fileName) {
+            int sp = fileName.lastIndexOf('/');
+
+            // Normalize the filename
+            if (sp >= 0 && sp + 1 < fileName.length())
+                fileName = fileName.substring(sp + 1);
+            if (fileName.length() < 4)
+                fileName = Strings.repeat("0", 4 - fileName.length()) + fileName;
+
+            String sp1 = fileName.substring(0, 2);
+            String sp2 = fileName.substring(2, 4);
+            return crawlDir.resolve(sp1).resolve(sp2).resolve(fileName);
+        }
+    }
+
+    @Override
+    public String describe() {
+        return "Migrates crawl data to the latest format";
+    }
+}
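Note the two-step log swap at the end of the Run case: the new work log is written to a temp file, the old crawler.log is moved aside as a backup, and only then is the new log moved into place, so a crash mid-migration never leaves a half-written live log. The same pattern in isolation:

import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;

class LogSwapSketch {
    /** Swap a fully written replacement log into place, keeping the old file as a backup. */
    static void swap(Path liveLog, Path replacement) throws Exception {
        Path backup = Files.createTempFile(liveLog.getParent(), "crawler-", ".migrate.old.log");
        Files.move(liveLog, backup, StandardCopyOption.REPLACE_EXISTING);
        Files.move(replacement, liveLog);
    }
}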
@@ -0,0 +1,53 @@
+package nu.marginalia.actor.task;
+
+import com.google.gson.Gson;
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import nu.marginalia.actor.prototype.RecordActorPrototype;
+import nu.marginalia.actor.state.ActorStep;
+import nu.marginalia.nsfw.NsfwDomainFilter;
+import nu.marginalia.service.module.ServiceConfiguration;
+
+@Singleton
+public class UpdateNsfwFiltersActor extends RecordActorPrototype {
+    private final ServiceConfiguration serviceConfiguration;
+    private final NsfwDomainFilter nsfwDomainFilter;
+
+    public record Initial() implements ActorStep {}
+    public record Run() implements ActorStep {}
+
+    @Override
+    public ActorStep transition(ActorStep self) throws Exception {
+        return switch(self) {
+            case Initial() -> {
+                if (serviceConfiguration.node() != 1) {
+                    yield new Error("This actor can only run on node 1");
+                }
+                else {
+                    yield new Run();
+                }
+            }
+            case Run() -> {
+                nsfwDomainFilter.fetchLists();
+                yield new End();
+            }
+            default -> new Error();
+        };
+    }
+
+    @Override
+    public String describe() {
+        return "Sync NSFW filters";
+    }
+
+    @Inject
+    public UpdateNsfwFiltersActor(Gson gson,
+                                  ServiceConfiguration serviceConfiguration,
+                                  NsfwDomainFilter nsfwDomainFilter)
+    {
+        super(gson);
+        this.serviceConfiguration = serviceConfiguration;
+        this.nsfwDomainFilter = nsfwDomainFilter;
+    }
+
+}
@@ -49,6 +49,7 @@ public class ExecutorExportGrpcService
                 new ExportSampleDataActor.Export(
                         FileStorageId.of(request.getFileStorageId()),
                         request.getSize(),
+                        request.getCtFilter(),
                         request.getName()
                 )
         );
@@ -8,6 +8,7 @@ import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.index.IndexConstructorMain;
 import nu.marginalia.livecrawler.LiveCrawlerMain;
 import nu.marginalia.loading.LoaderMain;
+import nu.marginalia.ping.PingMain;
 import nu.marginalia.service.control.ServiceEventLog;
 import nu.marginalia.service.server.BaseServiceParams;
 import nu.marginalia.task.ExportTasksMain;
@@ -41,6 +42,7 @@ public class ProcessService {
         return switch (id) {
             case "converter" -> ProcessId.CONVERTER;
             case "crawler" -> ProcessId.CRAWLER;
+            case "ping" -> ProcessId.PING;
             case "loader" -> ProcessId.LOADER;
             case "export-tasks" -> ProcessId.EXPORT_TASKS;
             case "index-constructor" -> ProcessId.INDEX_CONSTRUCTOR;
@@ -50,6 +52,7 @@ public class ProcessService {

     public enum ProcessId {
         CRAWLER(CrawlerMain.class),
+        PING(PingMain.class),
         LIVE_CRAWLER(LiveCrawlerMain.class),
         CONVERTER(ConverterMain.class),
         LOADER(LoaderMain.class),
@@ -68,6 +71,7 @@ public class ProcessService {
             case LIVE_CRAWLER -> "LIVE_CRAWLER_PROCESS_OPTS";
             case CONVERTER -> "CONVERTER_PROCESS_OPTS";
             case LOADER -> "LOADER_PROCESS_OPTS";
+            case PING -> "PING_PROCESS_OPTS";
             case INDEX_CONSTRUCTOR -> "INDEX_CONSTRUCTION_PROCESS_OPTS";
             case EXPORT_TASKS -> "EXPORT_TASKS_PROCESS_OPTS";
         };
@@ -27,10 +27,12 @@ public class DbBrowseDomainsRandom {
     public List<BrowseResult> getRandomDomains(int count, DomainBlacklist blacklist, int set) {

         final String q = """
-                        SELECT DOMAIN_ID, DOMAIN_NAME, INDEXED
+                        SELECT EC_RANDOM_DOMAINS.DOMAIN_ID, DOMAIN_NAME, INDEXED
                         FROM EC_RANDOM_DOMAINS
                         INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID
+                        LEFT JOIN DOMAIN_AVAILABILITY_INFORMATION DAI ON DAI.DOMAIN_ID=EC_RANDOM_DOMAINS.DOMAIN_ID
                         WHERE STATE<2
+                        AND SERVER_AVAILABLE
                        AND DOMAIN_SET=?
                        AND DOMAIN_ALIAS IS NULL
                        ORDER BY RAND()

code/functions/favicon/api/build.gradle (new file, 47 lines)
@@ -0,0 +1,47 @@
+plugins {
+    id 'java'
+
+    id "com.google.protobuf" version "0.9.4"
+    id 'jvm-test-suite'
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
+    }
+}
+
+jar.archiveBaseName = 'favicon-api'
+
+apply from: "$rootProject.projectDir/protobuf.gradle"
+apply from: "$rootProject.projectDir/srcsets.gradle"
+
+dependencies {
+    implementation project(':code:common:model')
+    implementation project(':code:common:config')
+    implementation project(':code:common:service')
+
+    implementation libs.bundles.slf4j
+
+    implementation libs.prometheus
+    implementation libs.notnull
+    implementation libs.guava
+    implementation dependencies.create(libs.guice.get()) {
+        exclude group: 'com.google.guava'
+    }
+    implementation libs.gson
+    implementation libs.bundles.protobuf
+    implementation libs.guava
+    libs.bundles.grpc.get().each {
+        implementation dependencies.create(it) {
+            exclude group: 'com.google.guava'
+        }
+    }
+
+
+
+    testImplementation libs.bundles.slf4j.test
+    testImplementation libs.bundles.junit
+    testImplementation libs.mockito
+
+}
@@ -0,0 +1,39 @@
+package nu.marginalia.api.favicon;
+
+import com.google.inject.Inject;
+import nu.marginalia.service.client.GrpcChannelPoolFactory;
+import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
+import nu.marginalia.service.discovery.property.ServiceKey;
+import nu.marginalia.service.discovery.property.ServicePartition;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Optional;
+
+public class FaviconClient {
+    private static final Logger logger = LoggerFactory.getLogger(FaviconClient.class);
+
+    private final GrpcMultiNodeChannelPool<FaviconAPIGrpc.FaviconAPIBlockingStub> channelPool;
+
+    @Inject
+    public FaviconClient(GrpcChannelPoolFactory factory) {
+        this.channelPool = factory.createMulti(
+                ServiceKey.forGrpcApi(FaviconAPIGrpc.class, ServicePartition.multi()),
+                FaviconAPIGrpc::newBlockingStub);
+    }
+
+    public record FaviconData(byte[] bytes, String contentType) {}
+
+
+    public Optional<FaviconData> getFavicon(String domain, int node) {
+        RpcFaviconResponse rsp = channelPool.call(FaviconAPIGrpc.FaviconAPIBlockingStub::getFavicon)
+                .forNode(node)
+                .run(RpcFaviconRequest.newBuilder().setDomain(domain).build());
+
+        if (rsp.getData().isEmpty())
+            return Optional.empty();
+
+        return Optional.of(new FaviconData(rsp.getData().toByteArray(), rsp.getContentType()));
+    }
+
+}
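A hypothetical call site for the client above; the Guice module wiring is elided, and the domain name and node id are arbitrary examples:

import com.google.inject.Guice;
import com.google.inject.Injector;
import nu.marginalia.api.favicon.FaviconClient;

class FaviconClientUsageSketch {
    public static void main(String[] args) {
        // A real service would install the modules that bind GrpcChannelPoolFactory.
        Injector injector = Guice.createInjector(/* service client modules */);
        FaviconClient favicons = injector.getInstance(FaviconClient.class);

        favicons.getFavicon("www.marginalia.nu", 1).ifPresentOrElse(
                data -> System.out.printf("favicon: %d bytes of %s%n", data.bytes().length, data.contentType()),
                () -> System.out.println("no favicon recorded"));
    }
}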
code/functions/favicon/api/src/main/protobuf/favicon.proto (new file, 20 lines)
@@ -0,0 +1,20 @@
+syntax="proto3";
+package marginalia.api.favicon;
+
+option java_package="nu.marginalia.api.favicon";
+option java_multiple_files=true;
+
+service FaviconAPI {
+    /** Fetches information about a domain. */
+    rpc getFavicon(RpcFaviconRequest) returns (RpcFaviconResponse) {}
+}
+
+message RpcFaviconRequest {
+    string domain = 1;
+}
+
+message RpcFaviconResponse {
+    string domain = 1;
+    bytes data = 2;
+    string contentType = 3;
+}
code/functions/favicon/build.gradle (new file, 49 lines)
@@ -0,0 +1,49 @@
+plugins {
+    id 'java'
+
+    id 'application'
+    id 'jvm-test-suite'
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
+    }
+}
+
+apply from: "$rootProject.projectDir/srcsets.gradle"
+
+dependencies {
+    implementation project(':code:common:config')
+    implementation project(':code:common:service')
+    implementation project(':code:common:model')
+    implementation project(':code:common:db')
+    implementation project(':code:functions:favicon:api')
+    implementation project(':code:processes:crawling-process')
+
+    implementation libs.bundles.slf4j
+
+    implementation libs.prometheus
+    implementation libs.guava
+    libs.bundles.grpc.get().each {
+        implementation dependencies.create(it) {
+            exclude group: 'com.google.guava'
+        }
+    }
+
+
+    implementation libs.notnull
+    implementation libs.guava
+    implementation dependencies.create(libs.guice.get()) {
+        exclude group: 'com.google.guava'
+    }
+    implementation dependencies.create(libs.spark.get()) {
+        exclude group: 'org.eclipse.jetty'
+    }
+
+    testImplementation libs.bundles.slf4j.test
+    testImplementation libs.bundles.junit
+    testImplementation libs.mockito
+
+
+}
@@ -0,0 +1,48 @@
+package nu.marginalia.functions.favicon;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import com.google.protobuf.ByteString;
+import io.grpc.stub.StreamObserver;
+import nu.marginalia.api.favicon.FaviconAPIGrpc;
+import nu.marginalia.api.favicon.RpcFaviconRequest;
+import nu.marginalia.api.favicon.RpcFaviconResponse;
+import nu.marginalia.crawl.DomainStateDb;
+import nu.marginalia.service.server.DiscoverableService;
+
+import java.util.Optional;
+
+@Singleton
+public class FaviconGrpcService extends FaviconAPIGrpc.FaviconAPIImplBase implements DiscoverableService {
+    private final DomainStateDb domainStateDb;
+
+    @Inject
+    public FaviconGrpcService(DomainStateDb domainStateDb) {
+        this.domainStateDb = domainStateDb;
+    }
+
+    public boolean shouldRegisterService() {
+        return domainStateDb.isAvailable();
+    }
+
+    @Override
+    public void getFavicon(RpcFaviconRequest request, StreamObserver<RpcFaviconResponse> responseObserver) {
+        Optional<DomainStateDb.FaviconRecord> icon = domainStateDb.getIcon(request.getDomain());
+
+        RpcFaviconResponse response;
+        if (icon.isEmpty()) {
+            response = RpcFaviconResponse.newBuilder().build();
+        }
+        else {
+            var iconRecord = icon.get();
+            response = RpcFaviconResponse.newBuilder()
+                    .setContentType(iconRecord.contentType())
+                    .setDomain(request.getDomain())
+                    .setData(ByteString.copyFrom(iconRecord.imageData()))
+                    .build();
+        }
+
+        responseObserver.onNext(response);
+        responseObserver.onCompleted();
+    }
+}
@@ -22,17 +22,20 @@ dependencies {
     implementation project(':code:common:db')
     implementation project(':code:libraries:blocking-thread-pool')
     implementation project(':code:libraries:message-queue')
+    implementation project(':code:libraries:domain-lock')

     implementation project(':code:execution:api')
     implementation project(':code:processes:crawling-process:ft-content-type')
+    implementation project(':third-party:rssreader')
+
     implementation libs.jsoup
-    implementation project(':third-party:rssreader')
     implementation libs.opencsv
+    implementation libs.slop
     implementation libs.sqlite
     implementation libs.bundles.slf4j
     implementation libs.commons.lang3
     implementation libs.commons.io
+    implementation libs.wiremock

     implementation libs.prometheus
     implementation libs.guava
@@ -55,8 +58,6 @@ dependencies {
     implementation libs.bundles.gson
     implementation libs.bundles.mariadb
-
-

     testImplementation libs.bundles.slf4j.test
     testImplementation libs.bundles.junit
     testImplementation libs.mockito
@@ -0,0 +1,126 @@
+package nu.marginalia.domsample;
+
+import com.google.inject.Inject;
+import com.zaxxer.hikari.HikariDataSource;
+import jakarta.inject.Named;
+import nu.marginalia.domsample.db.DomSampleDb;
+import nu.marginalia.livecapture.BrowserlessClient;
+import nu.marginalia.service.module.ServiceConfiguration;
+import org.apache.commons.lang3.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.time.Duration;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+public class DomSampleService {
+    private final DomSampleDb db;
+    private final HikariDataSource mariadbDataSource;
+    private final URI browserlessURI;
+
+    private static final Logger logger = LoggerFactory.getLogger(DomSampleService.class);
+
+    @Inject
+    public DomSampleService(DomSampleDb db,
+                            HikariDataSource mariadbDataSource,
+                            @Named("browserless-uri") String browserlessAddress,
+                            ServiceConfiguration serviceConfiguration)
+            throws URISyntaxException
+    {
+        this.db = db;
+        this.mariadbDataSource = mariadbDataSource;
+
+        if (StringUtils.isEmpty(browserlessAddress) || serviceConfiguration.node() > 1) {
+            logger.warn("Live capture service will not run");
+            browserlessURI = null;
+        }
+        else {
+            browserlessURI = new URI(browserlessAddress);
+        }
+    }
+
+    public void start() {
+        if (browserlessURI == null) {
+            logger.warn("DomSampleService is not enabled due to missing browserless URI or multi-node configuration");
+            return;
+        }
+
+        Thread.ofPlatform().daemon().start(this::run);
+    }
+
+    public void syncDomains() {
+        Set<String> dbDomains = new HashSet<>();
+
+        logger.info("Fetching domains from database...");
+
+        try (var conn = mariadbDataSource.getConnection();
+             var stmt = conn.prepareStatement("""
+                     SELECT DOMAIN_NAME
+                     FROM EC_DOMAIN
+                     WHERE NODE_AFFINITY>0
+                     """)
+        ) {
+            var rs = stmt.executeQuery();
+            while (rs.next()) {
+                dbDomains.add(rs.getString("DOMAIN_NAME"));
+            }
+        } catch (Exception e) {
+            throw new RuntimeException("Failed to sync domains", e);
+        }
+
+        logger.info("Found {} domains in database", dbDomains.size());
+
+        db.syncDomains(dbDomains);
+
+        logger.info("Synced domains to sqlite");
+    }
+
+    public void run() {
+
+        try (var client = new BrowserlessClient(browserlessURI)) {
+
+            while (!Thread.currentThread().isInterrupted()) {
+
+                try {
+                    // Grace sleep in case we're operating on an empty domain list
+                    TimeUnit.SECONDS.sleep(15);
+
+                    syncDomains();
+                    var domains = db.getScheduledDomains();
+
+                    for (var domain : domains) {
+                        updateDomain(client, domain);
+                    }
+                } catch (InterruptedException e) {
+                    Thread.currentThread().interrupt();
+                    logger.info("DomSampleService interrupted, stopping...");
+                    return;
+                } catch (Exception e) {
+                    logger.error("Error in DomSampleService run loop", e);
+                }
+            }
+
+        }
+    }
+
+    private void updateDomain(BrowserlessClient client, String domain) {
+        var rootUrl = "https://" + domain + "/";
+        try {
+            var content = client.annotatedContent(rootUrl, new BrowserlessClient.GotoOptions("load", Duration.ofSeconds(10).toMillis()));
+
+            if (content.isPresent()) {
+                db.saveSample(domain, rootUrl, content.get());
+            }
+        } catch (Exception e) {
+            logger.error("Failed to process domain: " + domain, e);
+        }
+        finally {
+            db.flagDomainAsFetched(domain);
+        }
+    }
+
+}
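One design choice worth noting in run() above: the grace sleep comes before the work in each iteration, so an empty domain list or a failing sync cannot spin the loop hot. Reduced to its skeleton:

class GraceLoopSketch {
    static void run(Runnable work) {
        while (!Thread.currentThread().isInterrupted()) {
            try {
                Thread.sleep(15_000); // sleep first: an empty work list cannot busy-spin
                work.run();
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return; // clean shutdown on interrupt
            } catch (Exception e) {
                // log and continue; a transient failure should not kill the daemon
            }
        }
    }
}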
@@ -0,0 +1,174 @@
+package nu.marginalia.domsample.db;
+
+import nu.marginalia.WmsaHome;
+import org.jsoup.Jsoup;
+
+import java.nio.file.Path;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.SQLException;
+import java.util.*;
+
+public class DomSampleDb implements AutoCloseable {
+    private static final String dbFileName = "dom-sample.db";
+    private final Connection connection;
+
+    public DomSampleDb() throws SQLException {
+        this(WmsaHome.getDataPath().resolve(dbFileName));
+    }
+
+    public DomSampleDb(Path dbPath) throws SQLException {
+        String dbUrl = "jdbc:sqlite:" + dbPath.toAbsolutePath();
+
+        connection = DriverManager.getConnection(dbUrl);
+
+        try (var stmt = connection.createStatement()) {
+            stmt.executeUpdate("CREATE TABLE IF NOT EXISTS samples (url TEXT PRIMARY KEY, domain TEXT, sample BLOB, requests BLOB, accepted_popover BOOLEAN DEFAULT FALSE)");
+            stmt.executeUpdate("CREATE INDEX IF NOT EXISTS domain_index ON samples (domain)");
+            stmt.executeUpdate("CREATE TABLE IF NOT EXISTS schedule (domain TEXT PRIMARY KEY, last_fetch TIMESTAMP DEFAULT NULL)");
+            stmt.execute("PRAGMA journal_mode=WAL");
+        }
+
+    }
+
+    public void syncDomains(Set<String> domains) {
+        Set<String> currentDomains = new HashSet<>();
+        try (var stmt = connection.prepareStatement("SELECT domain FROM schedule")) {
+            var rs = stmt.executeQuery();
+            while (rs.next()) {
+                currentDomains.add(rs.getString("domain"));
+            }
+        } catch (SQLException e) {
+            throw new RuntimeException("Failed to sync domains", e);
+        }
+
+        Set<String> toRemove = new HashSet<>(currentDomains);
+        Set<String> toAdd = new HashSet<>(domains);
+
+        toRemove.removeAll(domains);
+        toAdd.removeAll(currentDomains);
+
+        try (var removeStmt = connection.prepareStatement("DELETE FROM schedule WHERE domain = ?");
+             var addStmt = connection.prepareStatement("INSERT OR IGNORE INTO schedule (domain) VALUES (?)")
+        ) {
+            for (String domain : toRemove) {
+                removeStmt.setString(1, domain);
+                removeStmt.executeUpdate();
+            }
+
+            for (String domain : toAdd) {
+                addStmt.setString(1, domain);
+                addStmt.executeUpdate();
+            }
+        } catch (SQLException e) {
+            throw new RuntimeException("Failed to remove domains", e);
+        }
+    }
+
+    public List<String> getScheduledDomains() {
+        List<String> domains = new ArrayList<>();
+        try (var stmt = connection.prepareStatement("SELECT domain FROM schedule ORDER BY last_fetch IS NULL DESC, last_fetch ASC")) {
+            var rs = stmt.executeQuery();
+            while (rs.next()) {
+                domains.add(rs.getString("domain"));
+            }
+        } catch (SQLException e) {
+            throw new RuntimeException("Failed to get scheduled domains", e);
+        }
+        return domains;
+    }
+
+    public void flagDomainAsFetched(String domain) {
+        try (var stmt = connection.prepareStatement("INSERT OR REPLACE INTO schedule (domain, last_fetch) VALUES (?, CURRENT_TIMESTAMP)")) {
+            stmt.setString(1, domain);
+            stmt.executeUpdate();
+        } catch (SQLException e) {
+            throw new RuntimeException("Failed to flag domain as fetched", e);
+        }
+    }
+
+
+    public record Sample(String url, String domain, String sample, String requests, boolean acceptedPopover) {}
+
+    public List<Sample> getSamples(String domain) throws SQLException {
+        List<Sample> samples = new ArrayList<>();
+
+        try (var stmt = connection.prepareStatement("""
+                SELECT url, sample, requests, accepted_popover
+                FROM samples
+                WHERE domain = ?
+                """))
+        {
+            stmt.setString(1, domain);
+            var rs = stmt.executeQuery();
+            while (rs.next()) {
+                samples.add(
+                        new Sample(
+                                rs.getString("url"),
+                                domain,
+                                rs.getString("sample"),
+                                rs.getString("requests"),
+                                rs.getBoolean("accepted_popover")
+                        )
+                );
+            }
+        }
+        return samples;
+    }
+
+    public void saveSample(String domain, String url, String rawContent) throws SQLException {
+        var doc = Jsoup.parse(rawContent);
+
+        var networkRequests = doc.getElementById("marginalia-network-requests");
+
+        boolean acceptedPopover = false;
+
+        StringBuilder requestTsv = new StringBuilder();
+        if (networkRequests != null) {
+
+            acceptedPopover = !networkRequests.getElementsByClass("marginalia-agreed-cookies").isEmpty();
+
+            for (var request : networkRequests.getElementsByClass("network-request")) {
+                String method = request.attr("data-method");
+                String urlAttr = request.attr("data-url");
+                String timestamp = request.attr("data-timestamp");
+
+                requestTsv
+                        .append(method)
+                        .append('\t')
+                        .append(timestamp)
+                        .append('\t')
+                        .append(urlAttr.replace('\n', ' '))
+                        .append("\n");
+            }
+
+            networkRequests.remove();
+        }
+
+        doc.body().removeAttr("id");
+
+        String sample = doc.html();
+
+        saveSampleRaw(domain, url, sample, requestTsv.toString().trim(), acceptedPopover);
+
+    }
+
+    public void saveSampleRaw(String domain, String url, String sample, String requests, boolean acceptedPopover) throws SQLException {
+        try (var stmt = connection.prepareStatement("""
+                INSERT OR REPLACE
+                INTO samples (domain, url, sample, requests, accepted_popover)
+                VALUES (?, ?, ?, ?, ?)
+                """)) {
+            stmt.setString(1, domain);
+            stmt.setString(2, url);
+            stmt.setString(3, sample);
+            stmt.setString(4, requests);
+            stmt.setBoolean(5, acceptedPopover);
+            stmt.executeUpdate();
+        }
+    }
+
+    public void close() throws SQLException {
+        connection.close();
+    }
+}
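A minimal usage sketch of the database above, written against a temp path rather than the WmsaHome default (the domain name is an arbitrary example):

import java.nio.file.Files;
import java.util.Set;
import nu.marginalia.domsample.db.DomSampleDb;

class DomSampleDbUsageSketch {
    public static void main(String[] args) throws Exception {
        // A throwaway database path; production code uses the WmsaHome default.
        var dbPath = Files.createTempDirectory("domsample").resolve("dom-sample.db");

        try (var db = new DomSampleDb(dbPath)) {
            db.syncDomains(Set.of("www.marginalia.nu"));
            db.saveSample("www.marginalia.nu", "https://www.marginalia.nu/",
                    "<html><body id=\"marginaliahack\">hello</body></html>");

            for (var sample : db.getSamples("www.marginalia.nu")) {
                System.out.println(sample.url() + " acceptedPopover=" + sample.acceptedPopover());
            }
        }
    }
}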
@@ -1,21 +1,28 @@
|
|||||||
package nu.marginalia.livecapture;
|
package nu.marginalia.livecapture;
|
||||||
|
|
||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
|
import nu.marginalia.WmsaHome;
|
||||||
import nu.marginalia.model.gson.GsonFactory;
|
import nu.marginalia.model.gson.GsonFactory;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
|
import java.net.URLEncoder;
|
||||||
import java.net.http.HttpClient;
|
import java.net.http.HttpClient;
|
||||||
import java.net.http.HttpRequest;
|
import java.net.http.HttpRequest;
|
||||||
import java.net.http.HttpResponse;
|
import java.net.http.HttpResponse;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.time.Duration;
|
import java.time.Duration;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
/** Client for local browserless.io API */
|
/** Client for local browserless.io API */
|
||||||
public class BrowserlessClient implements AutoCloseable {
|
public class BrowserlessClient implements AutoCloseable {
|
||||||
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(BrowserlessClient.class);
|
private static final Logger logger = LoggerFactory.getLogger(BrowserlessClient.class);
|
||||||
|
private static final String BROWSERLESS_TOKEN = System.getProperty("live-capture.browserless-token", "BROWSERLESS_TOKEN");
|
||||||
|
|
||||||
private final HttpClient httpClient = HttpClient.newBuilder()
|
private final HttpClient httpClient = HttpClient.newBuilder()
|
||||||
.version(HttpClient.Version.HTTP_1_1)
|
.version(HttpClient.Version.HTTP_1_1)
|
||||||
@@ -25,18 +32,21 @@ public class BrowserlessClient implements AutoCloseable {
|
|||||||
private final URI browserlessURI;
|
private final URI browserlessURI;
|
||||||
private final Gson gson = GsonFactory.get();
|
private final Gson gson = GsonFactory.get();
|
||||||
|
|
||||||
|
private final String userAgent = WmsaHome.getUserAgent().uaString();
|
||||||
|
|
||||||
public BrowserlessClient(URI browserlessURI) {
|
public BrowserlessClient(URI browserlessURI) {
|
||||||
this.browserlessURI = browserlessURI;
|
this.browserlessURI = browserlessURI;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String content(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
|
public Optional<String> content(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
|
||||||
Map<String, Object> requestData = Map.of(
|
Map<String, Object> requestData = Map.of(
|
||||||
"url", url,
|
"url", url,
|
||||||
|
"userAgent", userAgent,
|
||||||
"gotoOptions", gotoOptions
|
"gotoOptions", gotoOptions
|
||||||
);
|
);
|
||||||
|
|
||||||
var request = HttpRequest.newBuilder()
|
var request = HttpRequest.newBuilder()
|
||||||
.uri(browserlessURI.resolve("/content"))
|
.uri(browserlessURI.resolve("/content?token="+BROWSERLESS_TOKEN))
|
||||||
.method("POST", HttpRequest.BodyPublishers.ofString(
|
.method("POST", HttpRequest.BodyPublishers.ofString(
|
||||||
gson.toJson(requestData)
|
gson.toJson(requestData)
|
||||||
))
|
))
|
||||||
@@ -47,10 +57,46 @@ public class BrowserlessClient implements AutoCloseable {
 
         if (rsp.statusCode() >= 300) {
             logger.info("Failed to fetch content for {}, status {}", url, rsp.statusCode());
-            return null;
+            return Optional.empty();
         }
 
-        return rsp.body();
+        return Optional.of(rsp.body());
+    }
+
+    /** Fetches content with a marginalia hack extension loaded that decorates the DOM with attributes for
+     * certain CSS attributes, to be able to easier identify popovers and other nuisance elements.
+     */
+    public Optional<String> annotatedContent(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
+        Map<String, Object> requestData = Map.of(
+                "url", url,
+                "userAgent", userAgent,
+                "gotoOptions", gotoOptions,
+                "waitForSelector", Map.of("selector", "#marginaliahack", "timeout", 15000)
+        );
+
+        // Launch parameters for the browserless instance to load the extension
+        Map<String, Object> launchParameters = Map.of(
+                "args", List.of("--load-extension=/dom-export")
+        );
+
+        String launchParametersStr = URLEncoder.encode(gson.toJson(launchParameters), StandardCharsets.UTF_8);
+
+        var request = HttpRequest.newBuilder()
+                .uri(browserlessURI.resolve("/content?token="+BROWSERLESS_TOKEN+"&launch="+launchParametersStr))
+                .method("POST", HttpRequest.BodyPublishers.ofString(
+                        gson.toJson(requestData)
+                ))
+                .header("Content-type", "application/json")
+                .build();
+
+        var rsp = httpClient.send(request, HttpResponse.BodyHandlers.ofString());
+
+        if (rsp.statusCode() >= 300) {
+            logger.info("Failed to fetch annotated content for {}, status {}", url, rsp.statusCode());
+            return Optional.empty();
+        }
+
+        return Optional.of(rsp.body());
     }
 
     public byte[] screenshot(String url, GotoOptions gotoOptions, ScreenshotOptions screenshotOptions)
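
For illustration, a minimal sketch of how a caller might exercise the token-guarded /content endpoint directly, assuming a browserless instance on localhost:3000 started with TOKEN=BROWSERLESS_TOKEN; the JSON body is hand-rolled rather than built through Gson, and the waitUntil value follows puppeteer's goto options:

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.util.Optional;

public class ContentCallSketch {
    public static void main(String[] args) throws Exception {
        HttpClient client = HttpClient.newHttpClient();

        // Hand-rolled request body; gotoOptions mirrors puppeteer's page.goto() options
        String body = "{\"url\": \"https://www.marginalia.nu/\", \"gotoOptions\": {\"waitUntil\": \"networkidle2\"}}";

        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create("http://localhost:3000/content?token=BROWSERLESS_TOKEN"))
                .header("Content-type", "application/json")
                .POST(HttpRequest.BodyPublishers.ofString(body))
                .build();

        HttpResponse<String> rsp = client.send(request, HttpResponse.BodyHandlers.ofString());

        // Mirror the Optional-based contract the diff gives content()
        Optional<String> html = rsp.statusCode() < 300 ? Optional.of(rsp.body()) : Optional.empty();
        html.ifPresent(h -> System.out.println(h.length() + " chars of rendered HTML"));
    }
}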
@@ -58,12 +104,13 @@ public class BrowserlessClient implements AutoCloseable {
 
         Map<String, Object> requestData = Map.of(
                 "url", url,
+                "userAgent", userAgent,
                 "options", screenshotOptions,
                 "gotoOptions", gotoOptions
         );
 
         var request = HttpRequest.newBuilder()
-                .uri(browserlessURI.resolve("/screenshot"))
+                .uri(browserlessURI.resolve("/screenshot?token="+BROWSERLESS_TOKEN))
                 .method("POST", HttpRequest.BodyPublishers.ofString(
                         gson.toJson(requestData)
                 ))
@@ -82,7 +129,7 @@ public class BrowserlessClient implements AutoCloseable {
     }
 
     @Override
-    public void close() throws Exception {
+    public void close() {
         httpClient.shutdownNow();
     }
 
@@ -126,7 +126,6 @@ public class LiveCaptureGrpcService
             }
             else {
                 EdgeDomain domain = domainNameOpt.get();
-                String domainNameStr = domain.toString();
 
                 if (!isValidDomainForCapture(domain)) {
                     ScreenshotDbOperations.flagDomainAsFetched(conn, domain);
@@ -1,6 +1,6 @@
 package nu.marginalia.rss.model;
 
-import com.apptasticsoftware.rssreader.Item;
+import nu.marginalia.rss.svc.SimpleFeedParser;
 import org.apache.commons.lang3.StringUtils;
 import org.jetbrains.annotations.NotNull;
 import org.jsoup.Jsoup;
@@ -18,37 +18,33 @@ public record FeedItem(String title,
     public static final int MAX_DESC_LENGTH = 255;
     public static final DateTimeFormatter DATE_FORMAT = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
 
-    public static FeedItem fromItem(Item item, boolean keepFragment) {
-        String title = item.getTitle().orElse("");
+    public static FeedItem fromItem(SimpleFeedParser.ItemData item, boolean keepFragment) {
+        String title = item.title();
         String date = getItemDate(item);
         String description = getItemDescription(item);
         String url;
 
-        if (keepFragment || item.getLink().isEmpty()) {
-            url = item.getLink().orElse("");
+        if (keepFragment) {
+            url = item.url();
         }
         else {
             try {
-                String link = item.getLink().get();
+                String link = item.url();
                 var linkUri = new URI(link);
                 var cleanUri = new URI(linkUri.getScheme(), linkUri.getAuthority(), linkUri.getPath(), linkUri.getQuery(), null);
                 url = cleanUri.toString();
             }
             catch (Exception e) {
                 // fallback to original link if we can't clean it, this is not a very important step
-                url = item.getLink().get();
+                url = item.url();
             }
         }
 
         return new FeedItem(title, date, description, url);
     }
 
-    private static String getItemDescription(Item item) {
-        Optional<String> description = item.getDescription();
-        if (description.isEmpty())
-            return "";
-
-        String rawDescription = description.get();
+    private static String getItemDescription(SimpleFeedParser.ItemData item) {
+        String rawDescription = item.description();
+
         if (rawDescription.indexOf('<') >= 0) {
             rawDescription = Jsoup.parseBodyFragment(rawDescription).text();
         }
@@ -58,15 +54,18 @@ public record FeedItem(String title,
 
     // e.g. http://fabiensanglard.net/rss.xml does dates like this: 1 Apr 2021 00:00:00 +0000
     private static final DateTimeFormatter extraFormatter = DateTimeFormatter.ofPattern("d MMM yyyy HH:mm:ss Z");
-    private static String getItemDate(Item item) {
+    private static String getItemDate(SimpleFeedParser.ItemData item) {
         Optional<ZonedDateTime> zonedDateTime = Optional.empty();
         try {
             zonedDateTime = item.getPubDateZonedDateTime();
         }
         catch (Exception e) {
-            zonedDateTime = item.getPubDate()
-                    .map(extraFormatter::parse)
-                    .map(ZonedDateTime::from);
+            try {
+                zonedDateTime = Optional.of(ZonedDateTime.from(extraFormatter.parse(item.pubDate())));
+            }
+            catch (Exception e2) {
+                // ignore
+            }
         }
 
         return zonedDateTime.map(date -> date.format(DATE_FORMAT)).orElse("");
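
A quick worked example of the fallback date path above; the primary parser rejects this date form, but the extra pattern accepts it. An explicit English locale is added here for portability (the original relies on the default locale):

import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Locale;

public class DateFallbackDemo {
    public static void main(String[] args) {
        // Same pattern as extraFormatter above
        DateTimeFormatter extra = DateTimeFormatter.ofPattern("d MMM yyyy HH:mm:ss Z", Locale.ENGLISH);
        ZonedDateTime zdt = ZonedDateTime.from(extra.parse("1 Apr 2021 00:00:00 +0000"));

        // Rendered with the same pattern as FeedItem.DATE_FORMAT
        System.out.println(zdt.format(DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSZ")));
        // -> 2021-04-01T00:00:00.000+0000
    }
}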
@@ -1,66 +0,0 @@
-package nu.marginalia.rss.svc;
-
-import nu.marginalia.model.EdgeDomain;
-
-import java.util.Map;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.Semaphore;
-
-/** Holds lock objects for each domain, to prevent multiple threads from
- * crawling the same domain at the same time.
- */
-public class DomainLocks {
-    // The locks are stored in a map, with the domain name as the key.  This map will grow
-    // relatively big, but should be manageable since the number of domains is limited to
-    // a few hundred thousand typically.
-    private final Map<String, Semaphore> locks = new ConcurrentHashMap<>();
-
-    /** Returns a lock object corresponding to the given domain.  The object is returned as-is,
-     * and may be held by another thread.  The caller is responsible for locking and releasing the lock.
-     */
-    public DomainLock lockDomain(EdgeDomain domain) throws InterruptedException {
-        return new DomainLock(domain.toString(),
-                locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits));
-    }
-
-    private Semaphore defaultPermits(String topDomain) {
-        if (topDomain.equals("wordpress.com"))
-            return new Semaphore(16);
-        if (topDomain.equals("blogspot.com"))
-            return new Semaphore(8);
-
-        if (topDomain.equals("neocities.org"))
-            return new Semaphore(4);
-        if (topDomain.equals("github.io"))
-            return new Semaphore(4);
-
-        if (topDomain.equals("substack.com")) {
-            return new Semaphore(1);
-        }
-        if (topDomain.endsWith(".edu")) {
-            return new Semaphore(1);
-        }
-
-        return new Semaphore(2);
-    }
-
-    public static class DomainLock implements AutoCloseable {
-        private final String domainName;
-        private final Semaphore semaphore;
-
-        DomainLock(String domainName, Semaphore semaphore) throws InterruptedException {
-            this.domainName = domainName;
-            this.semaphore = semaphore;
-
-            Thread.currentThread().setName("fetching:" + domainName + " [await domain lock]");
-            semaphore.acquire();
-            Thread.currentThread().setName("fetching:" + domainName);
-        }
-
-        @Override
-        public void close() {
-            semaphore.release();
-            Thread.currentThread().setName("fetching:" + domainName + " [wrapping up]");
-        }
-    }
-}
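
The deleted class is replaced by the injected DomainCoordinator further down; as a standalone illustration of the pattern it implemented, here is a toy per-key semaphore guard released via try-with-resources (names are hypothetical, not Marginalia types):

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Semaphore;

class PerKeyLimiter {
    private final Map<String, Semaphore> permits = new ConcurrentHashMap<>();

    // Block until a permit for this key is free; close() returns it
    AutoCloseable acquire(String key, int maxConcurrent) throws InterruptedException {
        Semaphore s = permits.computeIfAbsent(key, k -> new Semaphore(maxConcurrent));
        s.acquire();
        return s::release;
    }
}

// usage: try (var ignored = limiter.acquire("example.com", 2)) { fetch(...); }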
@@ -1,12 +1,12 @@
 package nu.marginalia.rss.svc;
 
-import com.apptasticsoftware.rssreader.Item;
-import com.apptasticsoftware.rssreader.RssReader;
 import com.google.inject.Inject;
 import com.opencsv.CSVReader;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.contenttype.DocumentBodyToString;
+import nu.marginalia.coordination.DomainCoordinator;
+import nu.marginalia.coordination.DomainLock;
 import nu.marginalia.executor.client.ExecutorClient;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.nodecfg.NodeConfigurationService;
@@ -20,7 +20,6 @@ import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorage;
 import nu.marginalia.storage.model.FileStorageType;
 import nu.marginalia.util.SimpleBlockingThreadPool;
-import org.apache.commons.io.input.BOMInputStream;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -32,11 +31,11 @@ import java.net.URISyntaxException;
 import java.net.http.HttpClient;
 import java.net.http.HttpRequest;
 import java.net.http.HttpResponse;
-import java.nio.charset.StandardCharsets;
 import java.sql.SQLException;
 import java.time.*;
 import java.time.format.DateTimeFormatter;
 import java.util.*;
+import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -48,20 +47,19 @@ public class FeedFetcherService {
     private static final int MAX_FEED_ITEMS = 10;
     private static final Logger logger = LoggerFactory.getLogger(FeedFetcherService.class);
 
-    private final RssReader rssReader = new RssReader();
-
     private final FeedDb feedDb;
     private final FileStorageService fileStorageService;
     private final NodeConfigurationService nodeConfigurationService;
     private final ServiceHeartbeat serviceHeartbeat;
     private final ExecutorClient executorClient;
 
-    private final DomainLocks domainLocks = new DomainLocks();
+    private final DomainCoordinator domainCoordinator;
 
     private volatile boolean updating;
 
     @Inject
     public FeedFetcherService(FeedDb feedDb,
+                              DomainCoordinator domainCoordinator,
                               FileStorageService fileStorageService,
                               NodeConfigurationService nodeConfigurationService,
                               ServiceHeartbeat serviceHeartbeat,
@@ -72,23 +70,13 @@ public class FeedFetcherService {
         this.nodeConfigurationService = nodeConfigurationService;
         this.serviceHeartbeat = serviceHeartbeat;
         this.executorClient = executorClient;
-
-        // Add support for some alternate date tags for atom
-        rssReader.addItemExtension("issued", this::setDateFallback);
-        rssReader.addItemExtension("created", this::setDateFallback);
-    }
-
-    private void setDateFallback(Item item, String value) {
-        if (item.getPubDate().isEmpty()) {
-            item.setPubDate(value);
-        }
+        this.domainCoordinator = domainCoordinator;
     }
 
     public enum UpdateMode {
         CLEAN,
         REFRESH
-    };
+    }
 
     public void updateFeeds(UpdateMode updateMode) throws IOException {
         if (updating) // Prevent concurrent updates
@@ -96,6 +84,7 @@ public class FeedFetcherService {
             throw new IllegalStateException("Already updating feeds, refusing to start another update");
         }
 
 
        try (FeedDbWriter writer = feedDb.createWriter();
             HttpClient client = HttpClient.newBuilder()
                     .connectTimeout(Duration.ofSeconds(15))
@@ -103,6 +92,8 @@ public class FeedFetcherService {
                     .followRedirects(HttpClient.Redirect.NORMAL)
                     .version(HttpClient.Version.HTTP_2)
                     .build();
+             ExecutorService fetchExecutor = Executors.newCachedThreadPool();
+             FeedJournal feedJournal = FeedJournal.create();
             var heartbeat = serviceHeartbeat.createServiceAdHocTaskHeartbeat("Update Rss Feeds")
        ) {
            updating = true;
@@ -145,8 +136,8 @@ public class FeedFetcherService {
                     };
 
                     FetchResult feedData;
-                    try (DomainLocks.DomainLock domainLock = domainLocks.lockDomain(new EdgeDomain(feed.domain()))) {
-                        feedData = fetchFeedData(feed, client, ifModifiedSinceDate, ifNoneMatchTag);
+                    try (DomainLock domainLock = domainCoordinator.lockDomain(new EdgeDomain(feed.domain()))) {
+                        feedData = fetchFeedData(feed, client, fetchExecutor, ifModifiedSinceDate, ifNoneMatchTag);
                     } catch (Exception ex) {
                         feedData = new FetchResult.TransientError();
                     }
@@ -155,6 +146,8 @@ public class FeedFetcherService {
                         case FetchResult.Success(String value, String etag) -> {
                             writer.saveEtag(feed.domain(), etag);
                             writer.saveFeed(parseFeed(value, feed));
+
+                            feedJournal.record(feed.feedUrl(), value);
                         }
                         case FetchResult.NotModified() -> {
                             writer.saveEtag(feed.domain(), ifNoneMatchTag);
@@ -224,6 +217,7 @@ public class FeedFetcherService {
 
     private FetchResult fetchFeedData(FeedDefinition feed,
                                       HttpClient client,
+                                      ExecutorService executorService,
                                       @Nullable String ifModifiedSinceDate,
                                       @Nullable String ifNoneMatchTag)
     {
@@ -239,18 +233,27 @@ public class FeedFetcherService {
                     .timeout(Duration.ofSeconds(15))
             ;
 
-            if (ifModifiedSinceDate != null) {
+            // Set the If-Modified-Since or If-None-Match headers if we have them
+            // though since there are certain idiosyncrasies in server implementations,
+            // we avoid setting both at the same time as that may turn a 304 into a 200.
+            if (ifNoneMatchTag != null) {
+                requestBuilder.header("If-None-Match", ifNoneMatchTag);
+            } else if (ifModifiedSinceDate != null) {
                 requestBuilder.header("If-Modified-Since", ifModifiedSinceDate);
             }
 
-            if (ifNoneMatchTag != null) {
-                requestBuilder.header("If-None-Match", ifNoneMatchTag);
-            }
-
             HttpRequest getRequest = requestBuilder.build();
 
             for (int i = 0; i < 3; i++) {
-                HttpResponse<byte[]> rs = client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray());
+
+                /* Note we need to use an executor to time-limit the send() method in HttpClient, as
+                 * its support for timeouts only applies to the time until response starts to be received,
+                 * and does not catch the case when the server starts to send data but then hangs.
+                 */
+                HttpResponse<byte[]> rs = executorService.submit(
+                        () -> client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray()))
+                        .get(15, TimeUnit.SECONDS);
 
                 if (rs.statusCode() == 429) { // Too Many Requests
                     int retryAfter = Integer.parseInt(rs.headers().firstValue("Retry-After").orElse("2"));
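
A self-contained sketch combining the two ideas in this hunk, with an illustrative URL and validator values: send at most one cache validator (the ETag wins over the date), and bound the whole exchange with an executor-based hard timeout, since HttpRequest.timeout() only covers the time until the response starts:

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class ConditionalFetchSketch {
    public static void main(String[] args) throws Exception {
        String etag = "\"abc123\"";                             // from a previous fetch, may be null
        String lastModified = "Tue, 01 Apr 2025 00:00:00 GMT";  // ditto

        var builder = HttpRequest.newBuilder(URI.create("https://www.marginalia.nu/log/index.xml"))
                .GET();

        // ETag takes precedence; sending both validators can turn a 304 into a 200 on some servers
        if (etag != null) builder.header("If-None-Match", etag);
        else if (lastModified != null) builder.header("If-Modified-Since", lastModified);

        HttpClient client = HttpClient.newHttpClient();
        ExecutorService executor = Executors.newCachedThreadPool();
        try {
            // get(15s) bounds the entire exchange, including a server that stalls mid-body
            HttpResponse<byte[]> rs = executor.submit(
                            () -> client.send(builder.build(), HttpResponse.BodyHandlers.ofByteArray()))
                    .get(15, TimeUnit.SECONDS);
            System.out.println(rs.statusCode() == 304 ? "Not modified" : rs.body().length + " bytes");
        } finally {
            executor.shutdownNow();
        }
    }
}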
@@ -367,12 +370,7 @@ public class FeedFetcherService {
 
     public FeedItems parseFeed(String feedData, FeedDefinition definition) {
         try {
-            feedData = sanitizeEntities(feedData);
-
-            List<Item> rawItems = rssReader.read(
-                    // Massage the data to maximize the possibility of the flaky XML parser consuming it
-                    new BOMInputStream(new ByteArrayInputStream(feedData.trim().getBytes(StandardCharsets.UTF_8)), false)
-            ).toList();
+            List<SimpleFeedParser.ItemData> rawItems = SimpleFeedParser.parse(feedData);
 
             boolean keepUriFragment = rawItems.size() < 2 || areFragmentsDisparate(rawItems);
 
@@ -395,33 +393,6 @@ public class FeedFetcherService {
         }
     }
 
-    private static final Map<String, String> HTML_ENTITIES = Map.of(
-            "&raquo;", "»",
-            "&laquo;", "«",
-            "&mdash;", "--",
-            "&ndash;", "-",
-            "&rsquo;", "'",
-            "&lsquo;", "'",
-            "&quot;", "\"",
-            "&nbsp;", ""
-    );
-
-    /** The XML parser will blow up if you insert HTML entities in the feed XML,
-     * which is unfortunately relatively common.  Replace them as far as is possible
-     * with their corresponding characters
-     */
-    static String sanitizeEntities(String feedData) {
-        String result = feedData;
-        for (Map.Entry<String, String> entry : HTML_ENTITIES.entrySet()) {
-            result = result.replace(entry.getKey(), entry.getValue());
-        }
-
-        // Handle lone ampersands not part of a recognized XML entity
-        result = result.replaceAll("&(?!(amp|lt|gt|apos|quot);)", "&amp;");
-
-        return result;
-    }
-
     /** Decide whether to keep URI fragments in the feed items.
      * <p></p>
      * We keep fragments if there are multiple different fragments in the items.
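
For reference, a two-line demonstration of what the removed lone-ampersand regex did: ampersands not starting one of the five XML entities get escaped, while recognized entities pass through untouched:

public class LoneAmpersandDemo {
    public static void main(String[] args) {
        String in = "Bed & Breakfast &amp; more";
        // Negative lookahead: only a bare '&' (not already an entity) is rewritten
        String out = in.replaceAll("&(?!(amp|lt|gt|apos|quot);)", "&amp;");
        System.out.println(out); // Bed &amp; Breakfast &amp; more
    }
}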
@@ -429,16 +400,16 @@ public class FeedFetcherService {
      * @param items The items to check
      * @return True if we should keep the fragments, false otherwise
      */
-    private boolean areFragmentsDisparate(List<Item> items) {
+    private boolean areFragmentsDisparate(List<SimpleFeedParser.ItemData> items) {
         Set<String> seenFragments = new HashSet<>();
 
         try {
             for (var item : items) {
-                if (item.getLink().isEmpty()) {
+                if (item.url().isBlank()) {
                     continue;
                 }
 
-                var link = item.getLink().get();
+                var link = item.url();
                 if (!link.contains("#")) {
                     continue;
                 }
@@ -0,0 +1,76 @@
+package nu.marginalia.rss.svc;
+
+import nu.marginalia.WmsaHome;
+import nu.marginalia.slop.SlopTable;
+import nu.marginalia.slop.column.string.StringColumn;
+import nu.marginalia.slop.desc.StorageType;
+import org.apache.commons.io.FileUtils;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.function.BiConsumer;
+
+/** Utility for recording fetched feeds to a journal, useful in debugging feed parser issues.
+ */
+public interface FeedJournal extends AutoCloseable {
+    StringColumn urlColumn = new StringColumn("url");
+    StringColumn contentsColumn = new StringColumn("contents", StandardCharsets.UTF_8, StorageType.ZSTD);
+
+    void record(String url, String contents) throws IOException;
+    void close() throws IOException;
+
+    static FeedJournal create() throws IOException {
+        if (Boolean.getBoolean("feedFetcher.persistJournal")) {
+            Path journalPath = WmsaHome.getDataPath().resolve("feed-journal");
+            if (Files.isDirectory(journalPath)) {
+                FileUtils.deleteDirectory(journalPath.toFile());
+            }
+            Files.createDirectories(journalPath);
+            return new RecordingFeedJournal(journalPath);
+        }
+        else {
+            return new NoOpFeedJournal();
+        }
+    }
+
+    class NoOpFeedJournal implements FeedJournal {
+        @Override
+        public void record(String url, String contents) {}
+
+        @Override
+        public void close() {}
+    }
+
+    class RecordingFeedJournal extends SlopTable implements FeedJournal {
+
+        private final StringColumn.Writer urlWriter;
+        private final StringColumn.Writer contentsWriter;
+
+        public RecordingFeedJournal(Path path) throws IOException {
+            super(path, SlopTable.getNumPages(path, FeedJournal.urlColumn));
+
+            urlWriter = urlColumn.create(this);
+            contentsWriter = contentsColumn.create(this);
+        }
+
+        public synchronized void record(String url, String contents) throws IOException {
+            urlWriter.put(url);
+            contentsWriter.put(contents);
+        }
+    }
+
+    static void replay(Path journalPath, BiConsumer<String, String> urlAndContent) throws IOException {
+        try (SlopTable table = new SlopTable(journalPath)) {
+            final StringColumn.Reader urlReader = urlColumn.open(table);
+            final StringColumn.Reader contentsReader = contentsColumn.open(table);
+
+            while (urlReader.hasRemaining()) {
+                urlAndContent.accept(urlReader.get(), contentsReader.get());
+            }
+        }
+    }
+}
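
A hypothetical debugging session with the new journal: run a fetch pass with -DfeedFetcher.persistJournal=true, then replay the recorded feeds offline through the parser. The demo assumes it lives in the same package as FeedJournal and SimpleFeedParser, and the journal path is illustrative (the real one resolves under WmsaHome's data path):

import java.nio.file.Path;

public class FeedJournalReplayDemo {
    public static void main(String[] args) throws Exception {
        Path journal = Path.of("/var/lib/wmsa/feed-journal"); // assumed location

        // Re-run the parser over every recorded (url, contents) pair
        FeedJournal.replay(journal, (url, contents) -> {
            var items = SimpleFeedParser.parse(contents);
            if (items.isEmpty()) {
                System.out.println("No items parsed from " + url);
            }
        });
    }
}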
@@ -0,0 +1,102 @@
+package nu.marginalia.rss.svc;
+
+import com.apptasticsoftware.rssreader.DateTimeParser;
+import com.apptasticsoftware.rssreader.util.Default;
+import org.jsoup.Jsoup;
+import org.jsoup.parser.Parser;
+
+import java.time.ZonedDateTime;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Optional;
+
+public class SimpleFeedParser {
+
+    private static final DateTimeParser dateTimeParser = Default.getDateTimeParser();
+
+    public record ItemData (
+            String title,
+            String description,
+            String url,
+            String pubDate
+    ) {
+        public boolean isWellFormed() {
+            return title != null && !title.isBlank() &&
+                    description != null && !description.isBlank() &&
+                    url != null && !url.isBlank() &&
+                    pubDate != null && !pubDate.isBlank();
+        }
+
+        public Optional<ZonedDateTime> getPubDateZonedDateTime() {
+            try {
+                return Optional.ofNullable(dateTimeParser.parse(pubDate()));
+            }
+            catch (Exception e) {
+                return Optional.empty();
+            }
+        }
+    }
+
+    public static List<ItemData> parse(String content) {
+        var doc = Jsoup.parse(content, Parser.xmlParser());
+        List<ItemData> ret = new ArrayList<>();
+
+        doc.select("item, entry").forEach(element -> {
+            String link = "";
+            String title = "";
+            String description = "";
+            String pubDate = "";
+
+            for (String attr : List.of("title", "dc:title")) {
+                if (!title.isBlank())
+                    break;
+                var tag = element.getElementsByTag(attr).first();
+                if (tag != null) {
+                    title = tag.text();
+                }
+            }
+
+            for (String attr : List.of("title", "summary", "content", "description", "dc:description")) {
+                if (!description.isBlank())
+                    break;
+                var tag = element.getElementsByTag(attr).first();
+                if (tag != null) {
+                    description = tag.text();
+                }
+            }
+
+            for (String attr : List.of("pubDate", "published", "updated", "issued", "created", "dc:date")) {
+                if (!pubDate.isBlank())
+                    break;
+                var tag = element.getElementsByTag(attr).first();
+                if (tag != null) {
+                    pubDate = tag.text();
+                }
+            }
+
+            for (String attr : List.of("link", "url")) {
+                if (!link.isBlank())
+                    break;
+                var tag = element.getElementsByTag(attr).first();
+
+                if (tag != null) {
+                    String linkText = tag.text();
+
+                    if (linkText.isBlank()) {
+                        linkText = tag.attr("href");
+                    }
+
+                    link = linkText;
+                }
+            }
+
+            ret.add(new ItemData(title, description, link, pubDate));
+        });
+
+        return ret;
+    }
+}
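
A small usage sketch of the new parser on a hand-written RSS fragment (assuming it runs in the same package as SimpleFeedParser); the jsoup XML parser tolerates the kind of malformed markup that made the previous reader flaky:

public class SimpleFeedParserDemo {
    public static void main(String[] args) {
        String rss = """
                <rss><channel><item>
                  <title>Hello</title>
                  <description>World</description>
                  <link>https://example.com/post</link>
                  <pubDate>1 Apr 2021 00:00:00 +0000</pubDate>
                </item></channel></rss>""";

        for (var item : SimpleFeedParser.parse(rss)) {
            System.out.println(item.title() + " -> " + item.url()
                    + " @ " + item.getPubDateZonedDateTime().orElse(null));
        }
    }
}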
@@ -0,0 +1,113 @@
+package nu.marginalia.domsample.db;
+
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.testcontainers.shaded.org.apache.commons.io.FileUtils;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.*;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class DomSampleDbTest {
+    Path tempDir;
+
+    @BeforeEach
+    void setUp() throws Exception {
+        tempDir = Files.createTempDirectory("test");
+    }
+
+    @AfterEach
+    void tearDown() throws IOException {
+        FileUtils.deleteDirectory(tempDir.toFile());
+    }
+
+    @Test
+    public void testSetUp() {
+        var dbPath = tempDir.resolve("test.db");
+        try (var db = new DomSampleDb(dbPath)) {
+        }
+        catch (Exception e) {
+            fail("Failed to set up database: " + e.getMessage());
+        }
+    }
+
+    @Test
+    public void testSyncDomains() {
+        var dbPath = tempDir.resolve("test.db");
+        try (var db = new DomSampleDb(dbPath)) {
+
+            db.syncDomains(Set.of("example.com", "test.com", "foobar.com"));
+            assertEquals(Set.of("example.com", "test.com", "foobar.com"), new HashSet<>(db.getScheduledDomains()));
+            db.syncDomains(Set.of("example.com", "test.com"));
+            assertEquals(Set.of("example.com", "test.com"), new HashSet<>(db.getScheduledDomains()));
+            db.syncDomains(Set.of("foobar.com", "test.com"));
+            assertEquals(Set.of("foobar.com", "test.com"), new HashSet<>(db.getScheduledDomains()));
+        }
+        catch (Exception e) {
+            fail("Failed to sync domains: " + e.getMessage());
+        }
+    }
+
+    @Test
+    public void testFetchDomains() {
+        var dbPath = tempDir.resolve("test.db");
+        try (var db = new DomSampleDb(dbPath)) {
+
+            db.syncDomains(Set.of("example.com", "test.com", "foobar.com"));
+            db.flagDomainAsFetched("example.com");
+            db.flagDomainAsFetched("test.com");
+            db.flagDomainAsFetched("foobar.com");
+            assertEquals(List.of("example.com", "test.com", "foobar.com"), db.getScheduledDomains());
+            db.flagDomainAsFetched("test.com");
+            assertEquals(List.of("example.com", "foobar.com", "test.com"), db.getScheduledDomains());
+        }
+        catch (Exception e) {
+            fail("Failed to sync domains: " + e.getMessage());
+        }
+    }
+
+    @Test
+    public void saveLoadSingle() {
+        var dbPath = tempDir.resolve("test.db");
+        try (var db = new DomSampleDb(dbPath)) {
+            db.saveSampleRaw("example.com", "http://example.com/sample", "sample data", "requests data", true);
+            var samples = db.getSamples("example.com");
+            assertEquals(1, samples.size());
+            var sample = samples.getFirst();
+            assertEquals("example.com", sample.domain());
+            assertEquals("http://example.com/sample", sample.url());
+            assertEquals("sample data", sample.sample());
+            assertEquals("requests data", sample.requests());
+            assertTrue(sample.acceptedPopover());
+        }
+        catch (Exception e) {
+            fail("Failed to save/load sample: " + e.getMessage());
+        }
+    }
+
+    @Test
+    public void saveLoadTwo() {
+        var dbPath = tempDir.resolve("test.db");
+        try (var db = new DomSampleDb(dbPath)) {
+            db.saveSampleRaw("example.com", "http://example.com/sample", "sample data", "r1", true);
+            db.saveSampleRaw("example.com", "http://example.com/sample2", "sample data2", "r2", false);
+            var samples = db.getSamples("example.com");
+            assertEquals(2, samples.size());
+
+            Map<String, String> samplesByUrl = new HashMap<>();
+            for (var sample : samples) {
+                samplesByUrl.put(sample.url(), sample.sample());
+            }
+
+            assertEquals("sample data", samplesByUrl.get("http://example.com/sample"));
+            assertEquals("sample data2", samplesByUrl.get("http://example.com/sample2"));
+        }
+        catch (Exception e) {
+            fail("Failed to save/load sample: " + e.getMessage());
+        }
+    }
+}
@@ -1,36 +1,137 @@
 package nu.marginalia.livecapture;
 
+import com.github.tomakehurst.wiremock.WireMockServer;
+import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
+import nu.marginalia.WmsaHome;
+import nu.marginalia.domsample.db.DomSampleDb;
+import nu.marginalia.service.module.ServiceConfigurationModule;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Tag;
 import org.junit.jupiter.api.Test;
 import org.testcontainers.containers.GenericContainer;
+import org.testcontainers.images.PullPolicy;
 import org.testcontainers.junit.jupiter.Testcontainers;
 import org.testcontainers.utility.DockerImageName;
 
+import java.io.IOException;
 import java.net.URI;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Map;
+
+import static com.github.tomakehurst.wiremock.client.WireMock.*;
+
 
 @Testcontainers
+@Tag("slow")
 public class BrowserlessClientTest {
-    static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome")).withExposedPorts(3000);
+    // Run gradle docker if this image is not available
+    static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("marginalia-browserless"))
+            .withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
+            .withImagePullPolicy(PullPolicy.defaultPolicy())
+            .withNetworkMode("bridge")
+            .withLogConsumer(frame -> {
+                System.out.print(frame.getUtf8String());
+            })
+            .withExposedPorts(3000);
+
+    static WireMockServer wireMockServer =
+            new WireMockServer(WireMockConfiguration.wireMockConfig()
+                    .port(18089));
+
+    static String localIp;
+
+    static URI browserlessURI;
+    static URI browserlessWssURI;
 
     @BeforeAll
-    public static void setup() {
+    public static void setup() throws IOException {
         container.start();
+
+        browserlessURI = URI.create(String.format("http://%s:%d/",
+                container.getHost(),
+                container.getMappedPort(3000))
+        );
+
+        browserlessWssURI = URI.create(String.format("ws://%s:%d/?token=BROWSERLESS_TOKEN",
+                container.getHost(),
+                container.getMappedPort(3000))
+        );
+
+        wireMockServer.start();
+        wireMockServer.stubFor(get("/").willReturn(aResponse().withStatus(200).withBody("Ok")));
+
+        localIp = ServiceConfigurationModule.getLocalNetworkIP();
     }
 
+    @Tag("flaky")
+    @Test
+    public void testInspectContentUA__Flaky() throws Exception {
+        try (var client = new BrowserlessClient(browserlessURI)) {
+            client.content("http://" + localIp + ":18089/",
+                    BrowserlessClient.GotoOptions.defaultValues()
+            );
+        }
+
+        wireMockServer.verify(getRequestedFor(urlEqualTo("/")).withHeader("User-Agent", equalTo(WmsaHome.getUserAgent().uaString())));
+    }
+
+    @Tag("flaky")
+    @Test
+    public void testInspectScreenshotUA__Flaky() throws Exception {
+        try (var client = new BrowserlessClient(browserlessURI)) {
+            client.screenshot("http://" + localIp + ":18089/",
+                    BrowserlessClient.GotoOptions.defaultValues(),
+                    BrowserlessClient.ScreenshotOptions.defaultValues()
+            );
+        }
+
+        wireMockServer.verify(getRequestedFor(urlEqualTo("/")).withHeader("User-Agent", equalTo(WmsaHome.getUserAgent().uaString())));
+    }
+
     @Test
     public void testContent() throws Exception {
-        try (var client = new BrowserlessClient(URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000)))) {
-            var content = client.content("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues());
-            Assertions.assertNotNull(content, "Content should not be null");
+        try (var client = new BrowserlessClient(browserlessURI)) {
+            var content = client.content("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues()).orElseThrow();
             Assertions.assertFalse(content.isBlank(), "Content should not be empty");
         }
     }
 
+    @Test
+    public void testAnnotatedContent() throws Exception {
+
+        try (var client = new BrowserlessClient(browserlessURI);
+             DomSampleDb dbop = new DomSampleDb(Path.of("/tmp/dom-sample.db"))
+        ) {
+            var content = client.annotatedContent("https://marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues()).orElseThrow();
+            dbop.saveSample("marginalia.nu", "https://marginalia.nu/", content);
+            System.out.println(content);
+            Assertions.assertFalse(content.isBlank(), "Content should not be empty");
+
+            dbop.getSamples("marginalia.nu").forEach(sample -> {
+                System.out.println("Sample URL: " + sample.url());
+                System.out.println("Sample Content: " + sample.sample());
+                System.out.println("Sample Requests: " + sample.requests());
+                System.out.println("Accepted Popover: " + sample.acceptedPopover());
+            });
+        }
+        finally {
+            Files.deleteIfExists(Path.of("/tmp/dom-sample.db"));
+        }
+    }
+
     @Test
     public void testScreenshot() throws Exception {
-        try (var client = new BrowserlessClient(URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000)))) {
-            var screenshot = client.screenshot("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues(), BrowserlessClient.ScreenshotOptions.defaultValues());
+        try (var client = new BrowserlessClient(browserlessURI)) {
+            var screenshot = client.screenshot("https://www.marginalia.nu/",
+                    BrowserlessClient.GotoOptions.defaultValues(),
+                    BrowserlessClient.ScreenshotOptions.defaultValues());
+
             Assertions.assertNotNull(screenshot, "Screenshot should not be null");
         }
     }
 }
@@ -1,50 +0,0 @@
-package nu.marginalia.rss.svc;
-
-import com.apptasticsoftware.rssreader.Item;
-import com.apptasticsoftware.rssreader.RssReader;
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.Test;
-
-import java.util.List;
-import java.util.Optional;
-
-public class TestXmlSanitization {
-
-    @Test
-    public void testPreservedEntities() {
-        Assertions.assertEquals("&amp;", FeedFetcherService.sanitizeEntities("&amp;"));
-        Assertions.assertEquals("&lt;", FeedFetcherService.sanitizeEntities("&lt;"));
-        Assertions.assertEquals("&gt;", FeedFetcherService.sanitizeEntities("&gt;"));
-        Assertions.assertEquals("&apos;", FeedFetcherService.sanitizeEntities("&apos;"));
-    }
-
-    @Test
-    public void testNlnetTitleTag() {
-        // The NLnet atom feed puts HTML tags in the entry/title tags, which breaks the vanilla RssReader code
-
-        // Verify we're able to consume and strip out the HTML tags
-        RssReader r = new RssReader();
-
-        List<Item> items = r.read(ClassLoader.getSystemResourceAsStream("nlnet.atom")).toList();
-
-        Assertions.assertEquals(1, items.size());
-        for (var item : items) {
-            Assertions.assertEquals(Optional.of("50 Free and Open Source Projects Selected for NGI Zero grants"), item.getTitle());
-        }
-    }
-
-    @Test
-    public void testStrayAmpersand() {
-        Assertions.assertEquals("Bed &amp; Breakfast", FeedFetcherService.sanitizeEntities("Bed & Breakfast"));
-    }
-
-    @Test
-    public void testTranslatedHtmlEntity() {
-        Assertions.assertEquals("Foo -- Bar", FeedFetcherService.sanitizeEntities("Foo &mdash; Bar"));
-    }
-
-    @Test
-    public void testTranslatedHtmlEntityQuot() {
-        Assertions.assertEquals("\"Bob\"", FeedFetcherService.sanitizeEntities("&quot;Bob&quot;"));
-    }
-}
43 code/functions/nsfw-domain-filter/build.gradle Normal file
@@ -0,0 +1,43 @@
+plugins {
+    id 'java'
+    id 'jvm-test-suite'
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
+    }
+}
+
+apply from: "$rootProject.projectDir/srcsets.gradle"
+
+dependencies {
+
+    implementation project(':code:common:config')
+    implementation project(':code:common:model')
+    implementation project(':code:common:db')
+
+    implementation libs.bundles.slf4j
+    implementation libs.prometheus
+    implementation libs.guava
+    implementation libs.commons.lang3
+    implementation dependencies.create(libs.guice.get()) {
+        exclude group: 'com.google.guava'
+    }
+    implementation libs.notnull
+    implementation libs.fastutil
+    implementation libs.bundles.mariadb
+
+    testImplementation libs.bundles.slf4j.test
+    testImplementation libs.bundles.junit
+    testImplementation libs.mockito
+
+    testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
+    testImplementation libs.commons.codec
+    testImplementation project(':code:common:service')
+    testImplementation 'org.testcontainers:mariadb:1.17.4'
+    testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
+    testImplementation project(':code:libraries:test-helpers')
+}
@@ -0,0 +1,192 @@
+package nu.marginalia.nsfw;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import com.google.inject.name.Named;
+import com.zaxxer.hikari.HikariDataSource;
+import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
+import org.apache.commons.lang3.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.InputStreamReader;
+import java.net.http.HttpClient;
+import java.net.http.HttpRequest;
+import java.net.http.HttpResponse;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+import java.util.zip.GZIPInputStream;
+
+@Singleton
+public class NsfwDomainFilter {
+    private final HikariDataSource dataSource;
+
+    private final List<String> dangerLists;
+    private final List<String> smutLists;
+
+    private volatile IntOpenHashSet blockedDomainIdsTier1 = new IntOpenHashSet();
+    private volatile IntOpenHashSet blockedDomainIdsTier2 = new IntOpenHashSet();
+
+    private static final Logger logger = LoggerFactory.getLogger(NsfwDomainFilter.class);
+
+    public static final int NSFW_DISABLE = 0;
+    public static final int NSFW_BLOCK_DANGER = 1;
+    public static final int NSFW_BLOCK_SMUT = 2;
+
+    @Inject
+    public NsfwDomainFilter(HikariDataSource dataSource,
+                            @Named("nsfw.dangerLists") List<String> dangerLists,
+                            @Named("nsfw.smutLists") List<String> smutLists
+                            ) {
+        this.dataSource = dataSource;
+
+        this.dangerLists = dangerLists;
+        this.smutLists = smutLists;
+
+        Thread.ofPlatform().daemon().name("NsfwDomainFilterSync").start(() -> {
+            while (true) {
+                sync();
+                try {
+                    TimeUnit.HOURS.sleep(1);
+                } catch (InterruptedException e) {
+                    Thread.currentThread().interrupt();
+                    break; // Exit the loop if interrupted
+                }
+            }
+        });
+    }
+
+    public boolean isBlocked(int domainId, int tier) {
+        if (tier == 0)
+            return false;
+
+        if (tier >= 1 && blockedDomainIdsTier1.contains(domainId))
+            return true;
+        if (tier >= 2 && blockedDomainIdsTier2.contains(domainId))
+            return true;
+
+        return false;
+    }
+
+    private synchronized void sync() {
+        try (var conn = dataSource.getConnection();
+             var stmt = conn.prepareStatement("SELECT ID, TIER FROM NSFW_DOMAINS")
+        ) {
+            var rs = stmt.executeQuery();
+            IntOpenHashSet tier1 = new IntOpenHashSet();
+            IntOpenHashSet tier2 = new IntOpenHashSet();
+
+            while (rs.next()) {
+                int domainId = rs.getInt("ID");
+                int tier = rs.getInt("TIER");
+
+                switch (tier) {
+                    case 1 -> tier1.add(domainId);
+                    case 2 -> tier2.add(domainId);
+                }
+            }
+
+            this.blockedDomainIdsTier1 = tier1;
+            this.blockedDomainIdsTier2 = tier2;
+
+            logger.info("NSFW domain filter synced: {} tier 1, {} tier 2", tier1.size(), tier2.size());
+
+        }
+        catch (SQLException ex) {
+            logger.error("Failed to sync NSFW domain filter", ex);
+        }
+    }
+
+    public synchronized void fetchLists() {
+        try (var conn = dataSource.getConnection();
+             HttpClient client = HttpClient.newBuilder()
+                     .followRedirects(HttpClient.Redirect.ALWAYS)
+                     .build();
+             var stmt = conn.createStatement();
+             var insertStmt = conn.prepareStatement("INSERT IGNORE INTO NSFW_DOMAINS_TMP (ID, TIER) SELECT ID, ? FROM EC_DOMAIN WHERE DOMAIN_NAME = ?")) {
+
+            stmt.execute("DROP TABLE IF EXISTS NSFW_DOMAINS_TMP");
+            stmt.execute("CREATE TABLE NSFW_DOMAINS_TMP LIKE NSFW_DOMAINS");
+
+            List<String> combinedDangerList = new ArrayList<>(10_000);
+            for (var dangerListUrl : dangerLists) {
+                combinedDangerList.addAll(fetchList(client, dangerListUrl));
+            }
+
+            for (String domain : combinedDangerList) {
+                insertStmt.setInt(1, NSFW_BLOCK_DANGER);
+                insertStmt.setString(2, domain);
+                insertStmt.execute();
+            }
+
+            List<String> combinedSmutList = new ArrayList<>(10_000);
+            for (var smutListUrl : smutLists) {
+                combinedSmutList.addAll(fetchList(client, smutListUrl));
+            }
+
+            for (String domain : combinedSmutList) {
+                insertStmt.setInt(1, NSFW_BLOCK_SMUT);
+                insertStmt.setString(2, domain);
+                insertStmt.addBatch();
+                insertStmt.execute();
+            }
+
+            stmt.execute("""
+                    DROP TABLE IF EXISTS NSFW_DOMAINS
+                    """);
+            stmt.execute("""
+                    RENAME TABLE NSFW_DOMAINS_TMP TO NSFW_DOMAINS
+                    """);
+            sync();
+        }
+        catch (SQLException ex) {
+            logger.error("Failed to fetch NSFW domain lists", ex);
+        }
+    }
+
+    public List<String> fetchList(HttpClient client, String url) {
+
+        logger.info("Fetching NSFW domain list from {}", url);
+
+        var request = HttpRequest.newBuilder()
+                .uri(java.net.URI.create(url))
+                .build();
+
+        try {
+            if (url.endsWith(".gz")) {
+                var response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());
+
+                byte[] body = response.body();
+
+                try (var reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(new ByteArrayInputStream(body))))) {
+                    return reader.lines()
+                            .filter(StringUtils::isNotEmpty)
+                            .toList();
+                } catch (Exception e) {
+                    logger.error("Error reading GZIP response from {}", url, e);
+                }
+            } else {
+                var response = client.send(request, HttpResponse.BodyHandlers.ofString());
+                if (response.statusCode() == 200) {
+
+                    return Arrays.stream(StringUtils.split(response.body(), "\n"))
+                            .filter(StringUtils::isNotEmpty)
+                            .toList();
+                } else {
+                    logger.warn("Failed to fetch list from {}: HTTP {}", url, response.statusCode());
+                }
+            }
+        }
+        catch (Exception e) {
+            logger.error("Error fetching NSFW domain list from {}", url, e);
+        }
+
+        return List.of();
+    }
+}
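
The tier argument acts as a cutoff rather than a category match; a short sketch of the resulting semantics, with a hypothetical domain id and an injected filter instance:

void illustrateTiers(NsfwDomainFilter filter) {
    int domainId = 42; // hypothetical EC_DOMAIN row id

    // NSFW_DISABLE (0) blocks nothing at all
    boolean never = filter.isBlocked(domainId, NsfwDomainFilter.NSFW_DISABLE); // always false

    // NSFW_BLOCK_SMUT (2) also blocks everything in the tier-1 danger lists
    boolean strictest = filter.isBlocked(domainId, NsfwDomainFilter.NSFW_BLOCK_SMUT);
}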
@@ -0,0 +1,30 @@
+package nu.marginalia.nsfw;
+
+import com.google.inject.AbstractModule;
+import com.google.inject.Provides;
+import jakarta.inject.Named;
+
+import java.util.List;
+
+public class NsfwFilterModule extends AbstractModule {
+
+    @Provides
+    @Named("nsfw.dangerLists")
+    public List<String> nsfwDomainLists1() {
+        return List.of(
+                "https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/cryptojacking/domains",
+                "https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/malware/domains",
+                "https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/phishing/domains"
+        );
+    }
+    @Provides
+    @Named("nsfw.smutLists")
+    public List<String> nsfwDomainLists2() {
+        return List.of(
+                "https://github.com/olbat/ut1-blacklists/raw/refs/heads/master/blacklists/adult/domains.gz",
+                "https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/gambling/domains"
+        );
+    }
+
+    public void configure() {}
+}
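
Outside the test harness further down, wiring the filter up might look roughly like this, assuming a configured HikariDataSource is at hand:

import com.google.inject.AbstractModule;
import com.google.inject.Guice;
import com.zaxxer.hikari.HikariDataSource;

public class NsfwFilterBootstrapSketch {
    public static NsfwDomainFilter create(HikariDataSource dataSource) {
        var injector = Guice.createInjector(new NsfwFilterModule(), new AbstractModule() {
            @Override
            protected void configure() {
                bind(HikariDataSource.class).toInstance(dataSource);
            }
        });

        NsfwDomainFilter filter = injector.getInstance(NsfwDomainFilter.class);
        filter.fetchLists(); // populate NSFW_DOMAINS from the UT1 blocklists
        return filter;
    }
}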
@@ -0,0 +1,108 @@
+package nu.marginalia.nsfw;
+
+import com.google.inject.AbstractModule;
+import com.google.inject.Guice;
+import com.google.inject.Provides;
+import com.zaxxer.hikari.HikariConfig;
+import com.zaxxer.hikari.HikariDataSource;
+import jakarta.inject.Named;
+import nu.marginalia.test.TestMigrationLoader;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Tag;
+import org.junit.jupiter.api.Test;
+import org.testcontainers.containers.MariaDBContainer;
+import org.testcontainers.junit.jupiter.Container;
+import org.testcontainers.junit.jupiter.Testcontainers;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+
+@Tag("slow")
+@Testcontainers
+class NsfwDomainFilterTest extends AbstractModule {
+
+    @Container
+    static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
+            .withDatabaseName("WMSA_prod")
+            .withUsername("wmsa")
+            .withPassword("wmsa")
+            .withNetworkAliases("mariadb");
+
+    static HikariDataSource dataSource;
+    static Path tempDir;
+
+    @BeforeAll
+    public static void setUpDb() throws IOException {
+        tempDir = Files.createTempDirectory(NsfwDomainFilterTest.class.getSimpleName());
+
+        System.setProperty("system.homePath", tempDir.toString());
+
+        HikariConfig config = new HikariConfig();
+        config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
+        config.setUsername("wmsa");
+        config.setPassword("wmsa");
+
+        dataSource = new HikariDataSource(config);
+
+        TestMigrationLoader.flywayMigration(dataSource);
+
+        try (var conn = dataSource.getConnection();
+             var stmt = conn.prepareStatement("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES (?, ?, 1)")
+        ) {
+            // Ensure the database is ready
+            conn.createStatement().execute("SELECT 1");
+
+            stmt.setString(1, "www.google.com");
+            stmt.setString(2, "google.com");
+            stmt.executeUpdate();
+            stmt.setString(1, "www.bing.com");
+            stmt.setString(2, "bing.com");
+            stmt.executeUpdate();
+        } catch (Exception e) {
+            throw new RuntimeException("Failed to connect to the database", e);
+        }
+    }
+
+    @Provides
+    @Named("nsfw.dangerLists")
+    public List<String> nsfwDomainLists1() {
+        return List.of(
+                "https://downloads.marginalia.nu/test/list1"
+        );
+    }
+
+    @Provides
+    @Named("nsfw.smutLists")
+    public List<String> nsfwDomainLists2() {
+        return List.of(
+                "https://downloads.marginalia.nu/test/list2.gz"
+        );
+    }
+
+    public void configure() {
+        bind(HikariDataSource.class).toInstance(dataSource);
+    }
+
+    @Test
+    public void test() {
+        var filter = Guice
+                .createInjector(this)
+                .getInstance(NsfwDomainFilter.class);
+
+        filter.fetchLists();
+
+        assertTrue(filter.isBlocked(1, NsfwDomainFilter.NSFW_BLOCK_DANGER));
+        assertTrue(filter.isBlocked(1, NsfwDomainFilter.NSFW_BLOCK_SMUT));
+        assertFalse(filter.isBlocked(2, NsfwDomainFilter.NSFW_BLOCK_DANGER));
+        assertTrue(filter.isBlocked(2, NsfwDomainFilter.NSFW_BLOCK_SMUT));
+    }
+
+}
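A note on the pattern above: the test class extends AbstractModule and passes itself to Guice, so its @Provides methods shadow the production blocklist URLs with small test fixtures, and configure() pins the Testcontainers-backed HikariDataSource. In miniature:

    // The test doubles as its own Guice module:
    var injector = Guice.createInjector(new NsfwDomainFilterTest());
    var filter = injector.getInstance(NsfwDomainFilter.class); // built against the test bindings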
@@ -2,9 +2,6 @@ package nu.marginalia.api.searchquery;
 import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
 import nu.marginalia.api.searchquery.model.query.SearchQuery;
-import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
-import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
-import nu.marginalia.index.query.limit.QueryLimits;
 import nu.marginalia.index.query.limit.SpecificationLimit;
 import nu.marginalia.index.query.limit.SpecificationLimitType;
 
@@ -27,37 +24,19 @@ public class IndexProtobufCodec {
                 .build();
     }
 
-    public static QueryLimits convertQueryLimits(RpcQueryLimits queryLimits) {
-        return new QueryLimits(
-                queryLimits.getResultsByDomain(),
-                queryLimits.getResultsTotal(),
-                queryLimits.getTimeoutMs(),
-                queryLimits.getFetchSize()
-        );
-    }
-
-    public static RpcQueryLimits convertQueryLimits(QueryLimits queryLimits) {
-        return RpcQueryLimits.newBuilder()
-                .setResultsByDomain(queryLimits.resultsByDomain())
-                .setResultsTotal(queryLimits.resultsTotal())
-                .setTimeoutMs(queryLimits.timeoutMs())
-                .setFetchSize(queryLimits.fetchSize())
-                .build();
-    }
-
     public static SearchQuery convertRpcQuery(RpcQuery query) {
-        List<SearchPhraseConstraint> phraeConstraints = new ArrayList<>();
+        List<SearchPhraseConstraint> phraseConstraints = new ArrayList<>();
 
         for (int j = 0; j < query.getPhrasesCount(); j++) {
             var coh = query.getPhrases(j);
             if (coh.getType() == RpcPhrases.TYPE.OPTIONAL) {
-                phraeConstraints.add(new SearchPhraseConstraint.Optional(List.copyOf(coh.getTermsList())));
+                phraseConstraints.add(new SearchPhraseConstraint.Optional(List.copyOf(coh.getTermsList())));
             }
             else if (coh.getType() == RpcPhrases.TYPE.MANDATORY) {
-                phraeConstraints.add(new SearchPhraseConstraint.Mandatory(List.copyOf(coh.getTermsList())));
+                phraseConstraints.add(new SearchPhraseConstraint.Mandatory(List.copyOf(coh.getTermsList())));
            }
            else if (coh.getType() == RpcPhrases.TYPE.FULL) {
-                phraeConstraints.add(new SearchPhraseConstraint.Full(List.copyOf(coh.getTermsList())));
+                phraseConstraints.add(new SearchPhraseConstraint.Full(List.copyOf(coh.getTermsList())));
            }
            else {
                throw new IllegalArgumentException("Unknown phrase constraint type: " + coh.getType());
@@ -70,7 +49,7 @@ public class IndexProtobufCodec {
                 query.getExcludeList(),
                 query.getAdviceList(),
                 query.getPriorityList(),
-                phraeConstraints
+                phraseConstraints
         );
     }
 
@@ -103,60 +82,4 @@ public class IndexProtobufCodec {
         return subqueryBuilder.build();
     }
-
-    public static ResultRankingParameters convertRankingParameterss(RpcResultRankingParameters params) {
-        if (params == null)
-            return ResultRankingParameters.sensibleDefaults();
-
-        return new ResultRankingParameters(
-                new Bm25Parameters(params.getBm25K(), params.getBm25B()),
-                params.getShortDocumentThreshold(),
-                params.getShortDocumentPenalty(),
-                params.getDomainRankBonus(),
-                params.getQualityPenalty(),
-                params.getShortSentenceThreshold(),
-                params.getShortSentencePenalty(),
-                params.getBm25Weight(),
-                params.getTcfFirstPositionWeight(),
-                params.getTcfVerbatimWeight(),
-                params.getTcfProximityWeight(),
-                ResultRankingParameters.TemporalBias.valueOf(params.getTemporalBias().getBias().name()),
-                params.getTemporalBiasWeight(),
-                params.getExportDebugData()
-        );
-    }
-
-    public static RpcResultRankingParameters convertRankingParameterss(ResultRankingParameters rankingParams,
-                                                                       RpcTemporalBias temporalBias)
-    {
-        if (rankingParams == null) {
-            rankingParams = ResultRankingParameters.sensibleDefaults();
-        }
-
-        var builder = RpcResultRankingParameters.newBuilder()
-                .setBm25B(rankingParams.bm25Params.b())
-                .setBm25K(rankingParams.bm25Params.k())
-                .setShortDocumentThreshold(rankingParams.shortDocumentThreshold)
-                .setShortDocumentPenalty(rankingParams.shortDocumentPenalty)
-                .setDomainRankBonus(rankingParams.domainRankBonus)
-                .setQualityPenalty(rankingParams.qualityPenalty)
-                .setShortSentenceThreshold(rankingParams.shortSentenceThreshold)
-                .setShortSentencePenalty(rankingParams.shortSentencePenalty)
-                .setBm25Weight(rankingParams.bm25Weight)
-                .setTcfFirstPositionWeight(rankingParams.tcfFirstPosition)
-                .setTcfProximityWeight(rankingParams.tcfProximity)
-                .setTcfVerbatimWeight(rankingParams.tcfVerbatim)
-                .setTemporalBiasWeight(rankingParams.temporalBiasWeight)
-                .setExportDebugData(rankingParams.exportDebugData);
-
-        if (temporalBias != null && temporalBias.getBias() != RpcTemporalBias.Bias.NONE) {
-            builder.setTemporalBias(temporalBias);
-        }
-        else {
-            builder.setTemporalBias(RpcTemporalBias.newBuilder()
-                    .setBias(RpcTemporalBias.Bias.valueOf(rankingParams.temporalBias.name())));
-        }
-
-        return builder.build();
-    }
-
 }
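With the convertQueryLimits round-trip helpers removed above, callers hold the generated RpcQueryLimits message directly. A sketch of the replacement idiom; the numeric values are placeholders, not defaults from the codebase:

    RpcQueryLimits limits = RpcQueryLimits.newBuilder()
            .setResultsByDomain(10)   // cap per domain
            .setResultsTotal(100)     // cap overall
            .setTimeoutMs(250)
            .setFetchSize(8192)
            .build();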
@@ -1,11 +1,8 @@
 package nu.marginalia.api.searchquery;
 
-import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
-import nu.marginalia.api.searchquery.model.query.QueryParams;
-import nu.marginalia.api.searchquery.model.query.QueryResponse;
-import nu.marginalia.api.searchquery.model.query.SearchSpecification;
+import nu.marginalia.api.searchquery.model.query.*;
 import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
-import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
+import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
 import nu.marginalia.api.searchquery.model.results.SearchResultItem;
 import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
 import nu.marginalia.api.searchquery.model.results.debug.DebugFactor;
@@ -32,12 +29,14 @@ public class QueryProtobufCodec {
         builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
         builder.setHumanQuery(request.getHumanQuery());
 
+        builder.setNsfwFilterTierValue(request.getNsfwFilterTierValue());
+
         builder.setQuality(IndexProtobufCodec.convertSpecLimit(query.specs.quality));
         builder.setYear(IndexProtobufCodec.convertSpecLimit(query.specs.year));
         builder.setSize(IndexProtobufCodec.convertSpecLimit(query.specs.size));
         builder.setRank(IndexProtobufCodec.convertSpecLimit(query.specs.rank));
 
-        builder.setQueryLimits(IndexProtobufCodec.convertQueryLimits(query.specs.queryLimits));
+        builder.setQueryLimits(query.specs.queryLimits);
 
         // Query strategy may be overridden by the query, but if not, use the one from the request
         if (query.specs.queryStrategy != null && query.specs.queryStrategy != QueryStrategy.AUTO)
@@ -45,9 +44,27 @@ public class QueryProtobufCodec {
         else
             builder.setQueryStrategy(request.getQueryStrategy());
 
-        if (query.specs.rankingParams != null) {
-            builder.setParameters(IndexProtobufCodec.convertRankingParameterss(query.specs.rankingParams, request.getTemporalBias()));
+        if (request.getTemporalBias().getBias() != RpcTemporalBias.Bias.NONE) {
+            if (query.specs.rankingParams != null) {
+                builder.setParameters(
+                        RpcResultRankingParameters.newBuilder(query.specs.rankingParams)
+                                .setTemporalBias(request.getTemporalBias())
+                                .build()
+                );
+            } else {
+                builder.setParameters(
+                        RpcResultRankingParameters.newBuilder(PrototypeRankingParameters.sensibleDefaults())
+                                .setTemporalBias(request.getTemporalBias())
+                                .build()
+                );
+            }
+        } else if (query.specs.rankingParams != null) {
+            builder.setParameters(query.specs.rankingParams);
         }
+        // else {
+        //     if we have no ranking params, we don't need to set them; the client will check and use the default values,
+        //     so we don't need to send this huge object over the wire
+        // }
 
         return builder.build();
     }
@@ -60,23 +77,20 @@ public class QueryProtobufCodec {
         builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
         builder.setHumanQuery(humanQuery);
 
+        builder.setNsfwFilterTier(RpcIndexQuery.NSFW_FILTER_TIER.DANGER);
+
         builder.setQuality(IndexProtobufCodec.convertSpecLimit(query.specs.quality));
         builder.setYear(IndexProtobufCodec.convertSpecLimit(query.specs.year));
         builder.setSize(IndexProtobufCodec.convertSpecLimit(query.specs.size));
         builder.setRank(IndexProtobufCodec.convertSpecLimit(query.specs.rank));
 
-        builder.setQueryLimits(IndexProtobufCodec.convertQueryLimits(query.specs.queryLimits));
+        builder.setQueryLimits(query.specs.queryLimits);
 
         // Query strategy may be overridden by the query, but if not, use the one from the request
         builder.setQueryStrategy(query.specs.queryStrategy.name());
 
         if (query.specs.rankingParams != null) {
-            builder.setParameters(IndexProtobufCodec.convertRankingParameterss(
-                    query.specs.rankingParams,
-                    RpcTemporalBias.newBuilder().setBias(
-                            RpcTemporalBias.Bias.NONE)
-                    .build())
-            );
+            builder.setParameters(query.specs.rankingParams);
         }
 
         return builder.build();
@@ -95,10 +109,11 @@ public class QueryProtobufCodec {
                 IndexProtobufCodec.convertSpecLimit(request.getSize()),
                 IndexProtobufCodec.convertSpecLimit(request.getRank()),
                 request.getDomainIdsList(),
-                IndexProtobufCodec.convertQueryLimits(request.getQueryLimits()),
+                request.getQueryLimits(),
                 request.getSearchSetIdentifier(),
                 QueryStrategy.valueOf(request.getQueryStrategy()),
-                ResultRankingParameters.TemporalBias.valueOf(request.getTemporalBias().getBias().name()),
+                RpcTemporalBias.Bias.valueOf(request.getTemporalBias().getBias().name()),
+                NsfwFilterTier.fromCodedValue(request.getNsfwFilterTierValue()),
                 request.getPagination().getPage()
         );
     }
@@ -294,9 +309,9 @@ public class QueryProtobufCodec {
                 IndexProtobufCodec.convertSpecLimit(specs.getYear()),
                 IndexProtobufCodec.convertSpecLimit(specs.getSize()),
                 IndexProtobufCodec.convertSpecLimit(specs.getRank()),
-                IndexProtobufCodec.convertQueryLimits(specs.getQueryLimits()),
+                specs.getQueryLimits(),
                 QueryStrategy.valueOf(specs.getQueryStrategy()),
-                IndexProtobufCodec.convertRankingParameterss(specs.getParameters())
+                specs.hasParameters() ? specs.getParameters() : null
         );
     }
 
@@ -307,19 +322,20 @@ public class QueryProtobufCodec {
                 .addAllTacitExcludes(params.tacitExcludes())
                 .addAllTacitPriority(params.tacitPriority())
                 .setHumanQuery(params.humanQuery())
-                .setQueryLimits(IndexProtobufCodec.convertQueryLimits(params.limits()))
+                .setQueryLimits(params.limits())
                 .setQuality(IndexProtobufCodec.convertSpecLimit(params.quality()))
                 .setYear(IndexProtobufCodec.convertSpecLimit(params.year()))
                 .setSize(IndexProtobufCodec.convertSpecLimit(params.size()))
                 .setRank(IndexProtobufCodec.convertSpecLimit(params.rank()))
                 .setSearchSetIdentifier(params.identifier())
                 .setQueryStrategy(params.queryStrategy().name())
+                .setNsfwFilterTierValue(params.filterTier().getCodedValue())
                 .setTemporalBias(RpcTemporalBias.newBuilder()
                         .setBias(RpcTemporalBias.Bias.valueOf(params.temporalBias().name()))
                         .build())
                 .setPagination(RpcQsQueryPagination.newBuilder()
                         .setPage(params.page())
-                        .setPageSize(Math.min(100, params.limits().resultsTotal()))
+                        .setPageSize(Math.min(100, params.limits().getResultsTotal()))
                         .build());
 
         if (params.nearDomain() != null)
@@ -0,0 +1,26 @@
+package nu.marginalia.api.searchquery.model.query;
+
+public enum NsfwFilterTier {
+    OFF(0),
+    DANGER(1),
+    PORN_AND_GAMBLING(2);
+
+    private final int codedValue; // same as ordinal() for now, but can be changed later if needed
+
+    NsfwFilterTier(int codedValue) {
+        this.codedValue = codedValue;
+    }
+
+    public static NsfwFilterTier fromCodedValue(int codedValue) {
+        for (NsfwFilterTier tier : NsfwFilterTier.values()) {
+            if (tier.codedValue == codedValue) {
+                return tier;
+            }
+        }
+        throw new IllegalArgumentException("Invalid coded value for NsfwFilterTier: " + codedValue);
+    }
+
+    public int getCodedValue() {
+        return codedValue;
+    }
+}
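Since the enum stores an explicit coded value rather than relying on ordinal(), the wire value stays stable even if constants are later reordered. A quick round-trip sketch:

    NsfwFilterTier tier = NsfwFilterTier.PORN_AND_GAMBLING;
    int wire = tier.getCodedValue();                        // 2
    assert NsfwFilterTier.fromCodedValue(wire) == tier;     // lookup is by coded value, not ordinal
    assert NsfwFilterTier.fromCodedValue(0) == NsfwFilterTier.OFF;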
@@ -1,7 +1,7 @@
 package nu.marginalia.api.searchquery.model.query;
 
-import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
-import nu.marginalia.index.query.limit.QueryLimits;
+import nu.marginalia.api.searchquery.RpcQueryLimits;
+import nu.marginalia.api.searchquery.RpcTemporalBias;
 import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.index.query.limit.SpecificationLimit;
 
@@ -21,14 +21,15 @@ public record QueryParams(
         SpecificationLimit size,
         SpecificationLimit rank,
         List<Integer> domainIds,
-        QueryLimits limits,
+        RpcQueryLimits limits,
         String identifier,
         QueryStrategy queryStrategy,
-        ResultRankingParameters.TemporalBias temporalBias,
+        RpcTemporalBias.Bias temporalBias,
+        NsfwFilterTier filterTier,
         int page
         )
 {
-    public QueryParams(String query, QueryLimits limits, String identifier) {
+    public QueryParams(String query, RpcQueryLimits limits, String identifier, NsfwFilterTier filterTier) {
         this(query, null,
                 List.of(),
                 List.of(),
@@ -42,7 +43,8 @@ public record QueryParams(
                 limits,
                 identifier,
                 QueryStrategy.AUTO,
-                ResultRankingParameters.TemporalBias.NONE,
+                RpcTemporalBias.Bias.NONE,
+                filterTier,
                 1 // page
         );
     }
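A sketch of calling the updated convenience constructor, which now takes the filter tier alongside the protobuf limits; the query text and limit values are illustrative:

    var params = new QueryParams(
            "commodore 64",
            RpcQueryLimits.newBuilder()
                    .setResultsTotal(100)
                    .setResultsByDomain(10)
                    .setTimeoutMs(200)
                    .setFetchSize(8192)
                    .build(),
            "NONE",
            NsfwFilterTier.DANGER);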
@@ -1,10 +1,11 @@
 package nu.marginalia.api.searchquery.model.query;
 
-import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
-import nu.marginalia.index.query.limit.QueryLimits;
+import nu.marginalia.api.searchquery.RpcQueryLimits;
+import nu.marginalia.api.searchquery.RpcResultRankingParameters;
 import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.index.query.limit.SpecificationLimit;
 
+import javax.annotation.Nullable;
 import java.util.List;
 
 public class SearchSpecification {
@@ -24,11 +25,12 @@ public class SearchSpecification {
     public SpecificationLimit size;
     public SpecificationLimit rank;
 
-    public final QueryLimits queryLimits;
+    public final RpcQueryLimits queryLimits;
 
     public final QueryStrategy queryStrategy;
 
-    public final ResultRankingParameters rankingParams;
+    @Nullable
+    public final RpcResultRankingParameters rankingParams;
 
     public SearchSpecification(SearchQuery query,
                                List<Integer> domains,
@@ -38,9 +40,9 @@ public class SearchSpecification {
                                SpecificationLimit year,
                                SpecificationLimit size,
                                SpecificationLimit rank,
-                               QueryLimits queryLimits,
+                               RpcQueryLimits queryLimits,
                                QueryStrategy queryStrategy,
-                               ResultRankingParameters rankingParams)
+                               @Nullable RpcResultRankingParameters rankingParams)
     {
         this.query = query;
         this.domains = domains;
@@ -91,7 +93,7 @@ public class SearchSpecification {
         return this.rank;
     }
 
-    public QueryLimits getQueryLimits() {
+    public RpcQueryLimits getQueryLimits() {
         return this.queryLimits;
     }
 
@@ -99,7 +101,7 @@ public class SearchSpecification {
         return this.queryStrategy;
     }
 
-    public ResultRankingParameters getRankingParams() {
+    public RpcResultRankingParameters getRankingParams() {
         return this.rankingParams;
     }
 
@@ -120,9 +122,9 @@ public class SearchSpecification {
         private boolean size$set;
         private SpecificationLimit rank$value;
         private boolean rank$set;
-        private QueryLimits queryLimits;
+        private RpcQueryLimits queryLimits;
         private QueryStrategy queryStrategy;
-        private ResultRankingParameters rankingParams;
+        private RpcResultRankingParameters rankingParams;
 
         SearchSpecificationBuilder() {
         }
@@ -171,7 +173,7 @@ public class SearchSpecification {
             return this;
         }
 
-        public SearchSpecificationBuilder queryLimits(QueryLimits queryLimits) {
+        public SearchSpecificationBuilder queryLimits(RpcQueryLimits queryLimits) {
             this.queryLimits = queryLimits;
             return this;
         }
@@ -181,7 +183,7 @@ public class SearchSpecification {
             return this;
         }
 
-        public SearchSpecificationBuilder rankingParams(ResultRankingParameters rankingParams) {
+        public SearchSpecificationBuilder rankingParams(RpcResultRankingParameters rankingParams) {
             this.rankingParams = rankingParams;
             return this;
         }
@@ -1,6 +1,7 @@
 package nu.marginalia.api.searchquery.model.results;
 
 import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
+import nu.marginalia.model.DocumentFormat;
 import nu.marginalia.model.EdgeUrl;
 import org.jetbrains.annotations.NotNull;
 
@@ -161,4 +162,14 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
     public String toString() {
         return "DecoratedSearchResultItem(rawIndexResult=" + this.getRawIndexResult() + ", url=" + this.getUrl() + ", title=" + this.getTitle() + ", description=" + this.getDescription() + ", urlQuality=" + this.getUrlQuality() + ", format=" + this.getFormat() + ", features=" + this.getFeatures() + ", pubYear=" + this.getPubYear() + ", dataHash=" + this.getDataHash() + ", wordsTotal=" + this.getWordsTotal() + ", bestPositions=" + this.getBestPositions() + ", rankingScore=" + this.getRankingScore() + ", resultsFromDomain=" + this.getResultsFromDomain() + ", rankingDetails=" + this.getRankingDetails() + ")";
     }
 
+    public String getShortFormat() {
+        try {
+            var df = DocumentFormat.valueOf(format);
+            return df.shortFormat;
+        }
+        catch (IllegalArgumentException e) {
+            return DocumentFormat.UNKNOWN.shortFormat;
+        }
+    }
 }
@@ -0,0 +1,33 @@
+package nu.marginalia.api.searchquery.model.results;
+
+import nu.marginalia.api.searchquery.RpcResultRankingParameters;
+import nu.marginalia.api.searchquery.RpcTemporalBias;
+
+public class PrototypeRankingParameters {
+
+    /** These are the default ranking parameters that are used when no parameters are specified. */
+
+    private static final RpcResultRankingParameters _sensibleDefaults = RpcResultRankingParameters.newBuilder()
+            .setBm25B(0.5)
+            .setBm25K(1.2)
+            .setShortDocumentThreshold(2000)
+            .setShortDocumentPenalty(2.)
+            .setDomainRankBonus(1 / 100.)
+            .setQualityPenalty(1 / 15.)
+            .setShortSentenceThreshold(2)
+            .setShortSentencePenalty(5)
+            .setBm25Weight(1.)
+            .setTcfVerbatimWeight(1.)
+            .setTcfProximityWeight(1.)
+            .setTcfFirstPositionWeight(5)
+            .setTemporalBias(RpcTemporalBias.newBuilder().setBias(RpcTemporalBias.Bias.NONE))
+            .setTemporalBiasWeight(5.0)
+            .setExportDebugData(false)
+            .setDisablePenalties(false)
+            .build();
+
+    public static RpcResultRankingParameters sensibleDefaults() {
+        return _sensibleDefaults;
+    }
+
+}
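Because the defaults are now a single immutable protobuf message, variants are derived by copying, the same newBuilder(prototype) idiom QueryProtobufCodec uses above. A sketch, with illustrative overrides:

    RpcResultRankingParameters tweaked =
            RpcResultRankingParameters.newBuilder(PrototypeRankingParameters.sensibleDefaults())
                    .setBm25Weight(2.0)        // illustrative override, not a project default
                    .setExportDebugData(true)
                    .build();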
@@ -1,12 +1,13 @@
 package nu.marginalia.api.searchquery.model.results;
 
+import nu.marginalia.api.searchquery.RpcResultRankingParameters;
 import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
 
 import java.util.BitSet;
 
 public class ResultRankingContext {
     private final int docCount;
-    public final ResultRankingParameters params;
+    public final RpcResultRankingParameters params;
 
 
     public final BitSet regularMask;
@@ -21,7 +22,7 @@ public class ResultRankingContext {
     public final CqDataInt priorityCounts;
 
     public ResultRankingContext(int docCount,
-                                ResultRankingParameters params,
+                                RpcResultRankingParameters params,
                                 BitSet ngramsMask,
                                 BitSet regularMask,
                                 CqDataInt fullCounts,
@@ -1,278 +0,0 @@
-package nu.marginalia.api.searchquery.model.results;
-
-import java.util.Objects;
-
-public class ResultRankingParameters {
-
-    /**
-     * Tuning for BM25 when applied to full document matches
-     */
-    public final Bm25Parameters bm25Params;
-
-    /**
-     * Documents below this length are penalized
-     */
-    public int shortDocumentThreshold;
-
-    public double shortDocumentPenalty;
-
-
-    /**
-     * Scaling factor associated with domain rank (unscaled rank value is 0-255; high is good)
-     */
-    public double domainRankBonus;
-
-    /**
-     * Scaling factor associated with document quality (unscaled rank value is 0-15; high is bad)
-     */
-    public double qualityPenalty;
-
-    /**
-     * Average sentence length values below this threshold are penalized, range [0-4), 2 or 3 is probably what you want
-     */
-    public int shortSentenceThreshold;
-
-    /**
-     * Magnitude of penalty for documents with low average sentence length
-     */
-    public double shortSentencePenalty;
-
-    public double bm25Weight;
-    public double tcfFirstPosition;
-    public double tcfVerbatim;
-    public double tcfProximity;
-
-    public TemporalBias temporalBias;
-    public double temporalBiasWeight;
-
-    public boolean exportDebugData;
-
-    public ResultRankingParameters(Bm25Parameters bm25Params, int shortDocumentThreshold, double shortDocumentPenalty, double domainRankBonus, double qualityPenalty, int shortSentenceThreshold, double shortSentencePenalty, double bm25Weight, double tcfFirstPosition, double tcfVerbatim, double tcfProximity, TemporalBias temporalBias, double temporalBiasWeight, boolean exportDebugData) {
-        this.bm25Params = bm25Params;
-        this.shortDocumentThreshold = shortDocumentThreshold;
-        this.shortDocumentPenalty = shortDocumentPenalty;
-        this.domainRankBonus = domainRankBonus;
-        this.qualityPenalty = qualityPenalty;
-        this.shortSentenceThreshold = shortSentenceThreshold;
-        this.shortSentencePenalty = shortSentencePenalty;
-        this.bm25Weight = bm25Weight;
-        this.tcfFirstPosition = tcfFirstPosition;
-        this.tcfVerbatim = tcfVerbatim;
-        this.tcfProximity = tcfProximity;
-        this.temporalBias = temporalBias;
-        this.temporalBiasWeight = temporalBiasWeight;
-        this.exportDebugData = exportDebugData;
-    }
-
-    public static ResultRankingParameters sensibleDefaults() {
-        return builder()
-                .bm25Params(new Bm25Parameters(1.2, 0.5))
-                .shortDocumentThreshold(2000)
-                .shortDocumentPenalty(2.)
-                .domainRankBonus(1 / 100.)
-                .qualityPenalty(1 / 15.)
-                .shortSentenceThreshold(2)
-                .shortSentencePenalty(5)
-                .bm25Weight(1.)
-                .tcfVerbatim(1.)
-                .tcfProximity(1.)
-                .tcfFirstPosition(5)
-                .temporalBias(TemporalBias.NONE)
-                .temporalBiasWeight(5.0)
-                .exportDebugData(false)
-                .build();
-    }
-
-    public static ResultRankingParametersBuilder builder() {
-        return new ResultRankingParametersBuilder();
-    }
-
-    public Bm25Parameters getBm25Params() {
-        return this.bm25Params;
-    }
-
-    public int getShortDocumentThreshold() {
-        return this.shortDocumentThreshold;
-    }
-
-    public double getShortDocumentPenalty() {
-        return this.shortDocumentPenalty;
-    }
-
-    public double getDomainRankBonus() {
-        return this.domainRankBonus;
-    }
-
-    public double getQualityPenalty() {
-        return this.qualityPenalty;
-    }
-
-    public int getShortSentenceThreshold() {
-        return this.shortSentenceThreshold;
-    }
-
-    public double getShortSentencePenalty() {
-        return this.shortSentencePenalty;
-    }
-
-    public double getBm25Weight() {
-        return this.bm25Weight;
-    }
-
-    public double getTcfFirstPosition() {
-        return this.tcfFirstPosition;
-    }
-
-    public double getTcfVerbatim() {
-        return this.tcfVerbatim;
-    }
-
-    public double getTcfProximity() {
-        return this.tcfProximity;
-    }
-
-    public TemporalBias getTemporalBias() {
-        return this.temporalBias;
-    }
-
-    public double getTemporalBiasWeight() {
-        return this.temporalBiasWeight;
-    }
-
-    public boolean isExportDebugData() {
-        return this.exportDebugData;
-    }
-
-    @Override
-    public final boolean equals(Object o) {
-        if (this == o) return true;
-        if (!(o instanceof ResultRankingParameters that)) return false;
-
-        return shortDocumentThreshold == that.shortDocumentThreshold && Double.compare(shortDocumentPenalty, that.shortDocumentPenalty) == 0 && Double.compare(domainRankBonus, that.domainRankBonus) == 0 && Double.compare(qualityPenalty, that.qualityPenalty) == 0 && shortSentenceThreshold == that.shortSentenceThreshold && Double.compare(shortSentencePenalty, that.shortSentencePenalty) == 0 && Double.compare(bm25Weight, that.bm25Weight) == 0 && Double.compare(tcfFirstPosition, that.tcfFirstPosition) == 0 && Double.compare(tcfVerbatim, that.tcfVerbatim) == 0 && Double.compare(tcfProximity, that.tcfProximity) == 0 && Double.compare(temporalBiasWeight, that.temporalBiasWeight) == 0 && exportDebugData == that.exportDebugData && Objects.equals(bm25Params, that.bm25Params) && temporalBias == that.temporalBias;
-    }
-
-    @Override
-    public int hashCode() {
-        int result = Objects.hashCode(bm25Params);
-        result = 31 * result + shortDocumentThreshold;
-        result = 31 * result + Double.hashCode(shortDocumentPenalty);
-        result = 31 * result + Double.hashCode(domainRankBonus);
-        result = 31 * result + Double.hashCode(qualityPenalty);
-        result = 31 * result + shortSentenceThreshold;
-        result = 31 * result + Double.hashCode(shortSentencePenalty);
-        result = 31 * result + Double.hashCode(bm25Weight);
-        result = 31 * result + Double.hashCode(tcfFirstPosition);
-        result = 31 * result + Double.hashCode(tcfVerbatim);
-        result = 31 * result + Double.hashCode(tcfProximity);
-        result = 31 * result + Objects.hashCode(temporalBias);
-        result = 31 * result + Double.hashCode(temporalBiasWeight);
-        result = 31 * result + Boolean.hashCode(exportDebugData);
-        return result;
-    }
-
-    public String toString() {
-        return "ResultRankingParameters(bm25Params=" + this.getBm25Params() + ", shortDocumentThreshold=" + this.getShortDocumentThreshold() + ", shortDocumentPenalty=" + this.getShortDocumentPenalty() + ", domainRankBonus=" + this.getDomainRankBonus() + ", qualityPenalty=" + this.getQualityPenalty() + ", shortSentenceThreshold=" + this.getShortSentenceThreshold() + ", shortSentencePenalty=" + this.getShortSentencePenalty() + ", bm25Weight=" + this.getBm25Weight() + ", tcfFirstPosition=" + this.getTcfFirstPosition() + ", tcfVerbatim=" + this.getTcfVerbatim() + ", tcfProximity=" + this.getTcfProximity() + ", temporalBias=" + this.getTemporalBias() + ", temporalBiasWeight=" + this.getTemporalBiasWeight() + ", exportDebugData=" + this.isExportDebugData() + ")";
-    }
-
-    public enum TemporalBias {
-        RECENT, OLD, NONE
-    }
-
-    public static class ResultRankingParametersBuilder {
-        private Bm25Parameters bm25Params;
-        private int shortDocumentThreshold;
-        private double shortDocumentPenalty;
-        private double domainRankBonus;
-        private double qualityPenalty;
-        private int shortSentenceThreshold;
-        private double shortSentencePenalty;
-        private double bm25Weight;
-        private double tcfFirstPosition;
-        private double tcfVerbatim;
-        private double tcfProximity;
-        private TemporalBias temporalBias;
-        private double temporalBiasWeight;
-        private boolean exportDebugData;
-
-        ResultRankingParametersBuilder() {
-        }
-
-        public ResultRankingParametersBuilder bm25Params(Bm25Parameters bm25Params) {
-            this.bm25Params = bm25Params;
-            return this;
-        }
-
-        public ResultRankingParametersBuilder shortDocumentThreshold(int shortDocumentThreshold) {
-            this.shortDocumentThreshold = shortDocumentThreshold;
-            return this;
-        }
-
-        public ResultRankingParametersBuilder shortDocumentPenalty(double shortDocumentPenalty) {
-            this.shortDocumentPenalty = shortDocumentPenalty;
-            return this;
-        }
-
-        public ResultRankingParametersBuilder domainRankBonus(double domainRankBonus) {
-            this.domainRankBonus = domainRankBonus;
-            return this;
-        }
-
-        public ResultRankingParametersBuilder qualityPenalty(double qualityPenalty) {
-            this.qualityPenalty = qualityPenalty;
-            return this;
-        }
-
-        public ResultRankingParametersBuilder shortSentenceThreshold(int shortSentenceThreshold) {
-            this.shortSentenceThreshold = shortSentenceThreshold;
-            return this;
-        }
-
-        public ResultRankingParametersBuilder shortSentencePenalty(double shortSentencePenalty) {
-            this.shortSentencePenalty = shortSentencePenalty;
-            return this;
-        }
-
-        public ResultRankingParametersBuilder bm25Weight(double bm25Weight) {
-            this.bm25Weight = bm25Weight;
-            return this;
-        }
-
-        public ResultRankingParametersBuilder tcfFirstPosition(double tcfFirstPosition) {
-            this.tcfFirstPosition = tcfFirstPosition;
-            return this;
-        }
-
-        public ResultRankingParametersBuilder tcfVerbatim(double tcfVerbatim) {
-            this.tcfVerbatim = tcfVerbatim;
-            return this;
-        }
-
-        public ResultRankingParametersBuilder tcfProximity(double tcfProximity) {
-            this.tcfProximity = tcfProximity;
-            return this;
-        }
-
-        public ResultRankingParametersBuilder temporalBias(TemporalBias temporalBias) {
-            this.temporalBias = temporalBias;
-            return this;
-        }
-
-        public ResultRankingParametersBuilder temporalBiasWeight(double temporalBiasWeight) {
-            this.temporalBiasWeight = temporalBiasWeight;
-            return this;
-        }
-
-        public ResultRankingParametersBuilder exportDebugData(boolean exportDebugData) {
-            this.exportDebugData = exportDebugData;
-            return this;
-        }
-
-        public ResultRankingParameters build() {
-            return new ResultRankingParameters(this.bm25Params, this.shortDocumentThreshold, this.shortDocumentPenalty, this.domainRankBonus, this.qualityPenalty, this.shortSentenceThreshold, this.shortSentencePenalty, this.bm25Weight, this.tcfFirstPosition, this.tcfVerbatim, this.tcfProximity, this.temporalBias, this.temporalBiasWeight, this.exportDebugData);
-        }
-
-        public String toString() {
-            return "ResultRankingParameters.ResultRankingParametersBuilder(bm25Params=" + this.bm25Params + ", shortDocumentThreshold=" + this.shortDocumentThreshold + ", shortDocumentPenalty=" + this.shortDocumentPenalty + ", domainRankBonus=" + this.domainRankBonus + ", qualityPenalty=" + this.qualityPenalty + ", shortSentenceThreshold=" + this.shortSentenceThreshold + ", shortSentencePenalty=" + this.shortSentencePenalty + ", bm25Weight=" + this.bm25Weight + ", tcfFirstPosition=" + this.tcfFirstPosition + ", tcfVerbatim=" + this.tcfVerbatim + ", tcfProximity=" + this.tcfProximity + ", temporalBias=" + this.temporalBias + ", temporalBiasWeight=" + this.temporalBiasWeight + ", exportDebugData=" + this.exportDebugData + ")";
-        }
-    }
-}
@@ -32,6 +32,14 @@ message RpcQsQuery {
     RpcTemporalBias temporalBias = 16;
 
     RpcQsQueryPagination pagination = 17;
+
+    NSFW_FILTER_TIER nsfwFilterTier = 18;
+
+    enum NSFW_FILTER_TIER {
+        NONE = 0;
+        DANGER = 1;
+        PORN_AND_GAMBLING = 2;
+    };
 }
 
 /* Query service query response */
@@ -78,8 +86,17 @@ message RpcIndexQuery {
     RpcQueryLimits queryLimits = 10;
     string queryStrategy = 11;    // Named query configuration
     RpcResultRankingParameters parameters = 12;
+
+    NSFW_FILTER_TIER nsfwFilterTier = 13;
+
+    enum NSFW_FILTER_TIER {
+        NONE = 0;
+        DANGER = 1;
+        PORN_AND_GAMBLING = 2;
+    };
 }
 
 
 /* A tagged union encoding some limit on a field */
 message RpcSpecLimit {
     int32 value = 1;
@@ -162,6 +179,7 @@ message RpcResultRankingParameters {
     double temporalBiasWeight = 17;
 
     bool exportDebugData = 18;
+    bool disablePenalties = 19;
 
 }
 
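A sketch of a client setting the new field from Java; when left unset, protobuf's zero-value default means the tier reads as NONE, so older clients keep the unfiltered behavior:

    RpcQsQuery query = RpcQsQuery.newBuilder()
            .setHumanQuery("example query") // illustrative query text
            .setNsfwFilterTier(RpcQsQuery.NSFW_FILTER_TIER.PORN_AND_GAMBLING)
            .build();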
@@ -3,8 +3,6 @@ package nu.marginalia.index.client;
 import nu.marginalia.api.searchquery.IndexProtobufCodec;
 import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
 import nu.marginalia.api.searchquery.model.query.SearchQuery;
-import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
-import nu.marginalia.index.query.limit.QueryLimits;
 import nu.marginalia.index.query.limit.SpecificationLimit;
 import org.junit.jupiter.api.Test;
 
@@ -22,18 +20,6 @@ class IndexProtobufCodecTest {
         verifyIsIdentityTransformation(SpecificationLimit.lessThan(1), l -> IndexProtobufCodec.convertSpecLimit(IndexProtobufCodec.convertSpecLimit(l)));
     }
 
-    @Test
-    public void testRankingParameters() {
-        verifyIsIdentityTransformation(ResultRankingParameters.sensibleDefaults(),
-                p -> IndexProtobufCodec.convertRankingParameterss(IndexProtobufCodec.convertRankingParameterss(p, null)));
-    }
-
-    @Test
-    public void testQueryLimits() {
-        verifyIsIdentityTransformation(new QueryLimits(1,2,3,4),
-                l -> IndexProtobufCodec.convertQueryLimits(IndexProtobufCodec.convertQueryLimits(l))
-        );
-    }
     @Test
     public void testSubqery() {
         verifyIsIdentityTransformation(new SearchQuery(
@@ -19,6 +19,7 @@ dependencies {
     implementation project(':code:common:model')
     implementation project(':code:common:service')
 
+    implementation project(':code:functions:nsfw-domain-filter')
     implementation project(':code:functions:search-query:api')
 
     implementation project(':code:index:query')
@@ -2,8 +2,9 @@ package nu.marginalia.functions.searchquery;
 
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
+import nu.marginalia.api.searchquery.RpcQueryLimits;
+import nu.marginalia.api.searchquery.RpcResultRankingParameters;
 import nu.marginalia.api.searchquery.model.query.*;
-import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
 import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
 import nu.marginalia.functions.searchquery.query_parser.QueryParser;
 import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
@@ -36,7 +37,7 @@ public class QueryFactory {
 
 
     public ProcessedQuery createQuery(QueryParams params,
-                                      @Nullable ResultRankingParameters rankingParams) {
+                                      @Nullable RpcResultRankingParameters rankingParams) {
         final var query = params.humanQuery();
 
         if (query.length() > 1000) {
@@ -132,7 +133,9 @@ public class QueryFactory {
         var limits = params.limits();
         // Disable limits on number of results per domain if we're searching with a site:-type term
         if (domain != null) {
-            limits = limits.forSingleDomain();
+            limits = RpcQueryLimits.newBuilder(limits)
+                    .setResultsByDomain(limits.getResultsTotal())
+                    .build();
         }
 
         var expansion = queryExpansion.expandQuery(queryBuilder.searchTermsInclude);
@@ -9,8 +9,9 @@ import nu.marginalia.api.searchquery.*;
 import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
 import nu.marginalia.api.searchquery.model.query.QueryParams;
 import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
-import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
+import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
 import nu.marginalia.index.api.IndexClient;
+import nu.marginalia.nsfw.NsfwDomainFilter;
 import nu.marginalia.service.server.DiscoverableService;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -34,13 +35,16 @@ public class QueryGRPCService
 
 
     private final QueryFactory queryFactory;
+    private final NsfwDomainFilter nsfwDomainFilter;
    private final IndexClient indexClient;
 
     @Inject
     public QueryGRPCService(QueryFactory queryFactory,
+                            NsfwDomainFilter nsfwDomainFilter,
                             IndexClient indexClient)
     {
         this.queryFactory = queryFactory;
+        this.nsfwDomainFilter = nsfwDomainFilter;
         this.indexClient = indexClient;
     }
 
@@ -55,7 +59,7 @@ public class QueryGRPCService
                 .time(() -> {
 
             var params = QueryProtobufCodec.convertRequest(request);
-            var query = queryFactory.createQuery(params, ResultRankingParameters.sensibleDefaults());
+            var query = queryFactory.createQuery(params, PrototypeRankingParameters.sensibleDefaults());
 
             var indexRequest = QueryProtobufCodec.convertQuery(request, query);
 
@@ -102,7 +106,7 @@ public class QueryGRPCService
             String originalQuery,
             QueryParams params,
             IndexClient.Pagination pagination,
-            ResultRankingParameters rankingParameters) {
+            RpcResultRankingParameters rankingParameters) {
 
         var query = queryFactory.createQuery(params, rankingParameters);
         IndexClient.AggregateQueryResponse response = indexClient.executeQueries(QueryProtobufCodec.convertQuery(originalQuery, query), pagination);
@@ -134,6 +134,10 @@ public class QueryExpansion {
             if (scoreCombo > scoreA + scoreB || scoreCombo > 1000) {
                 graph.addVariantForSpan(prev, qw, joinedWord);
             }
+            else if (StringUtils.isAlpha(prev.word()) && StringUtils.isNumeric(qw.word())) { // join e.g. trs 80 to trs80 and trs-80
+                graph.addVariantForSpan(prev, qw, prev.word() + qw.word());
+                graph.addVariantForSpan(prev, qw, prev.word() + "-" + qw.word());
+            }
         }
 
         prev = qw;
@@ -233,9 +233,19 @@ public class QueryParser {
             entity.replace(new QueryToken.RankTerm(limit, str));
         } else if (str.startsWith("qs=")) {
             entity.replace(new QueryToken.QsTerm(str.substring(3)));
-        } else if (str.contains(":")) {
+        } else if (str.startsWith("site:")
+                || str.startsWith("format:")
+                || str.startsWith("file:")
+                || str.startsWith("tld:")
+                || str.startsWith("ip:")
+                || str.startsWith("as:")
+                || str.startsWith("asn:")
+                || str.startsWith("generator:")
+        )
+        {
             entity.replace(new QueryToken.AdviceTerm(str, t.displayStr()));
         }
     }
 
     private static SpecificationLimit parseSpecificationLimit(String str) {
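The effect of the narrowed condition above, sketched as input-to-token mappings (the inputs are hypothetical):

    // "site:example.com"        -> AdviceTerm (recognized prefix)
    // "generator:hugo"          -> AdviceTerm (recognized prefix)
    // "std::vector::push_back"  -> plain search token; previously mis-parsed as an
    //                              advice term because it merely contains ':'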
@@ -1,12 +1,13 @@
|
|||||||
package nu.marginalia.query.svc;
|
package nu.marginalia.query.svc;
|
||||||
|
|
||||||
import nu.marginalia.WmsaHome;
|
import nu.marginalia.WmsaHome;
|
||||||
|
import nu.marginalia.api.searchquery.RpcQueryLimits;
|
||||||
|
import nu.marginalia.api.searchquery.RpcTemporalBias;
|
||||||
|
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
|
||||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
|
||||||
import nu.marginalia.functions.searchquery.QueryFactory;
|
import nu.marginalia.functions.searchquery.QueryFactory;
|
||||||
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
|
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
|
||||||
import nu.marginalia.index.query.limit.QueryLimits;
|
|
||||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||||
import nu.marginalia.index.query.limit.SpecificationLimitType;
|
import nu.marginalia.index.query.limit.SpecificationLimitType;
|
||||||
```diff
@@ -49,10 +50,16 @@ public class QueryFactoryTest {
                 SpecificationLimit.none(),
                 SpecificationLimit.none(),
                 null,
-                new QueryLimits(100, 100, 100, 100),
+                RpcQueryLimits.newBuilder()
+                        .setResultsTotal(100)
+                        .setResultsByDomain(100)
+                        .setTimeoutMs(100)
+                        .setFetchSize(100)
+                        .build(),
                 "NONE",
                 QueryStrategy.AUTO,
-                ResultRankingParameters.TemporalBias.NONE,
+                RpcTemporalBias.Bias.NONE,
+                NsfwFilterTier.OFF,
                 0), null).specs;
     }
 
```
```diff
@@ -208,6 +215,24 @@ public class QueryFactoryTest {
         System.out.println(subquery);
     }
 
+
+    @Test
+    public void testContractionWordNum() {
+        var subquery = parseAndGetSpecs("glove 80");
+
+        Assertions.assertTrue(subquery.query.compiledQuery.contains(" glove "));
+        Assertions.assertTrue(subquery.query.compiledQuery.contains(" 80 "));
+        Assertions.assertTrue(subquery.query.compiledQuery.contains(" glove-80 "));
+        Assertions.assertTrue(subquery.query.compiledQuery.contains(" glove80 "));
+    }
+
+
+    @Test
+    public void testCplusPlus() {
+        var subquery = parseAndGetSpecs("std::vector::push_back vector");
+        System.out.println(subquery);
+    }
+
     @Test
     public void testQuotedApostrophe() {
         var subquery = parseAndGetSpecs("\"bob's cars\"");
```
```diff
@@ -17,6 +17,7 @@ dependencies {
     implementation project(':code:common:service')
     implementation project(':code:common:db')
     implementation project(':code:libraries:message-queue')
+    implementation project(':code:functions:nsfw-domain-filter')
     implementation project(':code:functions:search-query:api')
 
     implementation libs.bundles.slf4j
```
```diff
@@ -2,11 +2,13 @@ package nu.marginalia.index.api;
 
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
+import io.prometheus.client.Counter;
 import nu.marginalia.api.searchquery.IndexApiGrpc;
 import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
 import nu.marginalia.api.searchquery.RpcIndexQuery;
 import nu.marginalia.db.DomainBlacklistImpl;
 import nu.marginalia.model.id.UrlIdCodec;
+import nu.marginalia.nsfw.NsfwDomainFilter;
 import nu.marginalia.service.client.GrpcChannelPoolFactory;
 import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
 import nu.marginalia.service.discovery.property.ServiceKey;
```
```diff
@@ -16,27 +18,38 @@ import org.slf4j.LoggerFactory;
 
 import java.util.ArrayList;
 import java.util.Comparator;
-import java.util.Iterator;
 import java.util.List;
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
-import static java.lang.Math.clamp;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.function.Consumer;
 
 @Singleton
 public class IndexClient {
     private static final Logger logger = LoggerFactory.getLogger(IndexClient.class);
     private final GrpcMultiNodeChannelPool<IndexApiGrpc.IndexApiBlockingStub> channelPool;
     private final DomainBlacklistImpl blacklist;
-    private static final ExecutorService executor = Executors.newVirtualThreadPerTaskExecutor();
+    private final NsfwDomainFilter nsfwDomainFilter;
+
+    Counter wmsa_index_query_count = Counter.build()
+            .name("wmsa_nsfw_filter_result_count")
+            .labelNames("tier")
+            .help("Count of results filtered by NSFW tier")
+            .register();
+
+    private static final ExecutorService executor = Executors.newCachedThreadPool();
 
     @Inject
-    public IndexClient(GrpcChannelPoolFactory channelPoolFactory, DomainBlacklistImpl blacklist) {
+    public IndexClient(GrpcChannelPoolFactory channelPoolFactory,
+                       DomainBlacklistImpl blacklist,
+                       NsfwDomainFilter nsfwDomainFilter
+    ) {
         this.channelPool = channelPoolFactory.createMulti(
                 ServiceKey.forGrpcApi(IndexApiGrpc.class, ServicePartition.multi()),
                 IndexApiGrpc::newBlockingStub);
         this.blacklist = blacklist;
+        this.nsfwDomainFilter = nsfwDomainFilter;
     }
 
     private static final Comparator<RpcDecoratedResultItem> comparator =
```
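The new Prometheus counter tracks how many results each NSFW tier filters out (note the registered metric name differs from the Java field name). If scraped, it would be exposed roughly like this, with illustrative values:

```
# HELP wmsa_nsfw_filter_result_count Count of results filtered by NSFW tier
# TYPE wmsa_nsfw_filter_result_count counter
wmsa_nsfw_filter_result_count{tier="NSFW"} 42.0
```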
```diff
@@ -51,44 +64,56 @@ public class IndexClient {
 
     /** Execute a query on the index partitions and return the combined results. */
     public AggregateQueryResponse executeQueries(RpcIndexQuery indexRequest, Pagination pagination) {
-        List<CompletableFuture<Iterator<RpcDecoratedResultItem>>> futures =
-                channelPool.call(IndexApiGrpc.IndexApiBlockingStub::query)
-                        .async(executor)
-                        .runEach(indexRequest);
 
         final int requestedMaxResults = indexRequest.getQueryLimits().getResultsTotal();
-        final int resultsUpperBound = requestedMaxResults * channelPool.getNumNodes();
+        int filterTier = indexRequest.getNsfwFilterTierValue();
+        AtomicInteger totalNumResults = new AtomicInteger(0);
 
-        List<RpcDecoratedResultItem> results = new ArrayList<>(resultsUpperBound);
+        List<RpcDecoratedResultItem> results =
+                channelPool.call(IndexApiGrpc.IndexApiBlockingStub::query)
+                        .async(executor)
+                        .runEach(indexRequest)
+                        .stream()
+                        .map(future -> future.thenApply(iterator -> {
+                            List<RpcDecoratedResultItem> ret = new ArrayList<>(requestedMaxResults);
+                            iterator.forEachRemaining(ret::add);
+                            totalNumResults.addAndGet(ret.size());
+                            return ret;
+                        }))
+                        .mapMulti((CompletableFuture<List<RpcDecoratedResultItem>> fut, Consumer<List<RpcDecoratedResultItem>> c) -> {
+                            try {
+                                c.accept(fut.join());
+                            } catch (Exception e) {
+                                logger.error("Error while fetching results", e);
+                            }
+                        })
+                        .flatMap(List::stream)
+                        .filter(item -> !isBlacklisted(item, filterTier))
+                        .sorted(comparator)
+                        .skip(Math.max(0, (pagination.page - 1) * pagination.pageSize))
+                        .limit(pagination.pageSize)
+                        .toList();
 
-        for (var future : futures) {
-            try {
-                future.get().forEachRemaining(results::add);
-            }
-            catch (Exception e) {
-                logger.error("Downstream exception", e);
-            }
-        }
-
-        // Sort the results by ranking score and remove blacklisted domains
-        results.sort(comparator);
-        results.removeIf(this::isBlacklisted);
-
-        int numReceivedResults = results.size();
-
-        // pagination is typically 1-indexed, so we need to adjust the start and end indices
-        int indexStart = (pagination.page - 1) * pagination.pageSize;
-        int indexEnd = (pagination.page) * pagination.pageSize;
-
-        results = results.subList(
-                clamp(indexStart, 0, Math.max(0, results.size() - 1)), // from is inclusive, so subtract 1 from size()
-                clamp(indexEnd, 0, results.size()));
-
-        return new AggregateQueryResponse(results, pagination.page(), numReceivedResults);
+        return new AggregateQueryResponse(results, pagination.page(), totalNumResults.get());
     }
 
-    private boolean isBlacklisted(RpcDecoratedResultItem item) {
-        return blacklist.isBlacklisted(UrlIdCodec.getDomainId(item.getRawItem().getCombinedId()));
+    static String[] tierNames = {
+            "OFF",
+            "DANGER",
+            "NSFW"
+    };
+
+    private boolean isBlacklisted(RpcDecoratedResultItem item, int filterTier) {
+        int domainId = UrlIdCodec.getDomainId(item.getRawItem().getCombinedId());
+
+        if (blacklist.isBlacklisted(domainId)) {
+            return true;
+        }
+        if (nsfwDomainFilter.isBlocked(domainId, filterTier)) {
+            wmsa_index_query_count.labels(tierNames[filterTier]).inc();
+            return true;
+        }
+        return false;
     }
 
 }
```
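The rewrite replaces the collect-then-subList pagination with a single stream pipeline: each partition's future is joined (failures are logged and skipped), results are filtered by blacklist and NSFW tier, sorted, then paginated with skip/limit, where pages are 1-indexed. A minimal sketch of just the skip/limit arithmetic, with made-up example data rather than the real result type:

```java
import java.util.List;
import java.util.stream.Stream;

class PaginationExample {
    public static void main(String[] args) {
        int page = 2, pageSize = 3; // pages are 1-indexed
        List<Integer> pageItems = Stream.of(1, 2, 3, 4, 5, 6, 7, 8)
                .skip(Math.max(0, (page - 1) * (long) pageSize)) // skip everything on page 1
                .limit(pageSize)                                 // keep one page worth
                .toList();
        System.out.println(pageItems); // prints [4, 5, 6]
    }
}
```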
```diff
@@ -84,7 +84,7 @@ public class ForwardIndexConverter {
 
         LongArray docFileData = LongArrayFactory.mmapForWritingConfined(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size());
 
-        ByteBuffer workArea = ByteBuffer.allocate(65536);
+        ByteBuffer workArea = ByteBuffer.allocate(1024*1024*100);
         for (var instance : journal.pages()) {
             try (var slopTable = new SlopTable(instance.baseDir(), instance.page()))
             {
```
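For scale: the conversion scratch buffer grows from 65536 bytes (64 KiB) to 1024*1024*100 bytes (100 MiB), allocated once per conversion run.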
```diff
@@ -10,12 +10,12 @@ import it.unimi.dsi.fastutil.longs.LongArrayList;
 import nu.marginalia.api.searchquery.IndexApiGrpc;
 import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
 import nu.marginalia.api.searchquery.RpcIndexQuery;
+import nu.marginalia.api.searchquery.RpcResultRankingParameters;
 import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
 import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
 import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
 import nu.marginalia.api.searchquery.model.query.SearchSpecification;
 import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
-import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
 import nu.marginalia.array.page.LongQueryBuffer;
 import nu.marginalia.index.index.StatefulIndex;
 import nu.marginalia.index.model.SearchParameters;
```
```diff
@@ -211,7 +211,7 @@ public class IndexGrpcService
     /** This class is responsible for ranking the results and adding the best results to the
      * resultHeap, which depending on the state of the indexLookup threads may or may not block
      */
-    private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams,
+    private ResultRankingContext createRankingContext(RpcResultRankingParameters rankingParams,
                                                       CompiledQuery<String> compiledQuery,
                                                       CompiledQueryLong compiledQueryIds)
     {
```
```diff
@@ -2,12 +2,13 @@ package nu.marginalia.index.model;
 
 import nu.marginalia.api.searchquery.IndexProtobufCodec;
 import nu.marginalia.api.searchquery.RpcIndexQuery;
+import nu.marginalia.api.searchquery.RpcResultRankingParameters;
 import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
 import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
 import nu.marginalia.api.searchquery.model.compiled.CompiledQueryParser;
-import nu.marginalia.api.searchquery.model.query.SearchSpecification;
 import nu.marginalia.api.searchquery.model.query.SearchQuery;
-import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
+import nu.marginalia.api.searchquery.model.query.SearchSpecification;
+import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
 import nu.marginalia.index.query.IndexSearchBudget;
 import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.index.searchset.SearchSet;
```
```diff
@@ -23,7 +24,7 @@ public class SearchParameters {
     public final IndexSearchBudget budget;
     public final SearchQuery query;
     public final QueryParams queryParams;
-    public final ResultRankingParameters rankingParams;
+    public final RpcResultRankingParameters rankingParams;
 
     public final int limitByDomain;
     public final int limitTotal;
```
```diff
@@ -41,11 +42,11 @@ public class SearchParameters {
     public SearchParameters(SearchSpecification specsSet, SearchSet searchSet) {
         var limits = specsSet.queryLimits;
 
-        this.fetchSize = limits.fetchSize();
-        this.budget = new IndexSearchBudget(limits.timeoutMs());
+        this.fetchSize = limits.getFetchSize();
+        this.budget = new IndexSearchBudget(limits.getTimeoutMs());
         this.query = specsSet.query;
-        this.limitByDomain = limits.resultsByDomain();
-        this.limitTotal = limits.resultsTotal();
+        this.limitByDomain = limits.getResultsByDomain();
+        this.limitTotal = limits.getResultsTotal();
 
         queryParams = new QueryParams(
                 specsSet.quality,
```
```diff
@@ -62,17 +63,17 @@ public class SearchParameters {
     }
 
     public SearchParameters(RpcIndexQuery request, SearchSet searchSet) {
-        var limits = IndexProtobufCodec.convertQueryLimits(request.getQueryLimits());
+        var limits = request.getQueryLimits();
 
-        this.fetchSize = limits.fetchSize();
+        this.fetchSize = limits.getFetchSize();
 
         // The time budget is halved because this is the point when we start to
         // wrap up the search and return the results.
-        this.budget = new IndexSearchBudget(limits.timeoutMs() / 2);
+        this.budget = new IndexSearchBudget(limits.getTimeoutMs() / 2);
         this.query = IndexProtobufCodec.convertRpcQuery(request.getQuery());
 
-        this.limitByDomain = limits.resultsByDomain();
-        this.limitTotal = limits.resultsTotal();
+        this.limitByDomain = limits.getResultsByDomain();
+        this.limitTotal = limits.getResultsTotal();
 
         queryParams = new QueryParams(
                 convertSpecLimit(request.getQuality()),
```
```diff
@@ -85,7 +86,7 @@ public class SearchParameters {
         compiledQuery = CompiledQueryParser.parse(this.query.compiledQuery);
         compiledQueryIds = compiledQuery.mapToLong(SearchTermsUtil::getWordId);
 
-        rankingParams = IndexProtobufCodec.convertRankingParameterss(request.getParameters());
+        rankingParams = request.hasParameters() ? request.getParameters() : PrototypeRankingParameters.sensibleDefaults();
     }
 
 
```
```diff
@@ -2,7 +2,6 @@ package nu.marginalia.index.results;
 
 import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
 import nu.marginalia.api.searchquery.model.compiled.CqExpression;
-import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
 import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
 
 import java.util.BitSet;
```
```diff
@@ -24,14 +23,14 @@ public class Bm25GraphVisitor implements CqExpression.DoubleVisitor {
 
     private final BitSet mask;
 
-    public Bm25GraphVisitor(Bm25Parameters bm25Parameters,
+    public Bm25GraphVisitor(double k1, double b,
                             float[] counts,
                             int length,
                             ResultRankingContext ctx) {
         this.length = length;
 
-        this.k1 = bm25Parameters.k();
-        this.b = bm25Parameters.b();
+        this.k1 = k1;
+        this.b = b;
 
         this.docCount = ctx.termFreqDocCount();
         this.counts = counts;
```
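The visitor now takes k1 and b directly instead of a Bm25Parameters record. For reference, in the textbook BM25 weighting k1 controls term-frequency saturation and b controls document-length normalization; the sketch below shows the standard formula, not a copy of the codebase's implementation:

```java
class Bm25Example {
    // Classic BM25 term weight: idf times a saturated term frequency.
    static double bm25(double k1, double b, double tf, double docLen, double avgDocLen, double idf) {
        return idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * docLen / avgDocLen));
    }

    public static void main(String[] args) {
        // With b = 0 the document-length term vanishes entirely, which is the
        // trick the priority-term scoring in TermFlagsGraphVisitor uses below.
        System.out.println(bm25(1.2, 0.0, 3, 500, 1000, 2.0));
    }
}
```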
```diff
@@ -0,0 +1,119 @@
+package nu.marginalia.index.results;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import gnu.trove.map.hash.TIntDoubleHashMap;
+import nu.marginalia.WmsaHome;
+import nu.marginalia.db.DbDomainQueries;
+import nu.marginalia.model.EdgeDomain;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+import java.util.OptionalInt;
+import java.util.concurrent.TimeUnit;
+
+@Singleton
+public class DomainRankingOverrides {
+    private final DbDomainQueries domainQueries;
+
+    private volatile TIntDoubleHashMap rankingFactors = new TIntDoubleHashMap(100, 0.75f, -1, 1.);
+
+    private static final Logger logger = LoggerFactory.getLogger(DomainRankingOverrides.class);
+
+    private final Path overrideFilePath;
+
+    @Inject
+    public DomainRankingOverrides(DbDomainQueries domainQueries) {
+        this.domainQueries = domainQueries;
+
+        overrideFilePath = WmsaHome.getDataPath().resolve("domain-ranking-factors.txt");
+
+        Thread.ofPlatform().start(this::updateRunner);
+    }
+
+    // for test access
+    public DomainRankingOverrides(DbDomainQueries domainQueries, Path overrideFilePath)
+    {
+        this.domainQueries = domainQueries;
+        this.overrideFilePath = overrideFilePath;
+    }
+
+
+    public double getRankingFactor(int domainId) {
+        return rankingFactors.get(domainId);
+    }
+
+    private void updateRunner() {
+        for (;;) {
+            reloadFile();
+
+            try {
+                TimeUnit.MINUTES.sleep(5);
+            } catch (InterruptedException ex) {
+                logger.warn("Thread interrupted", ex);
+                break;
+            }
+        }
+    }
+
+    void reloadFile() {
+        if (!Files.exists(overrideFilePath)) {
+            return;
+        }
+
+        try {
+            List<String> lines = Files.readAllLines(overrideFilePath);
+
+            double factor = 1.;
+
+            var newRankingFactors = new TIntDoubleHashMap(lines.size(), 0.75f, -1, 1.);
+
+            for (var line : lines) {
+                if (line.isBlank()) continue;
+                if (line.startsWith("#")) continue;
+
+                String[] parts = line.split("\\s+");
+                if (parts.length != 2) {
+                    logger.warn("Unrecognized format for domain overrides file: {}", line);
+                    continue;
+                }
+
+                try {
+                    switch (parts[0]) {
+                        case "value" -> {
+                            // error handle me
+                            factor = Double.parseDouble(parts[1]);
+                            if (factor < 0) {
+                                logger.error("Negative values are not permitted, found {}", factor);
+                                factor = 1;
+                            }
+                        }
+                        case "domain" -> {
+                            // error handle
+                            OptionalInt domainId = domainQueries.tryGetDomainId(new EdgeDomain(parts[1]));
+                            if (domainId.isPresent()) {
+                                newRankingFactors.put(domainId.getAsInt(), factor);
+                            }
+                            else {
+                                logger.warn("Unrecognized domain id {}", parts[1]);
+                            }
+                        }
+                        default -> {
+                            logger.warn("Unrecognized format {}", line);
+                        }
+                    }
+                } catch (Exception ex) {
+                    logger.warn("Error in parsing domain overrides file: {} ({})", line, ex.getClass().getSimpleName());
+                }
+            }
+
+            rankingFactors = newRankingFactors;
+        } catch (IOException ex) {
+            logger.error("Failed to read " + overrideFilePath, ex);
+        }
+    }
+}
```
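The reload loop parses a simple two-token line format: a "value" line sets the current multiplier, and each subsequent "domain" line applies it. An illustrative domain-ranking-factors.txt, with hypothetical domains:

```
# domain-ranking-factors.txt
# "value" sets the factor applied to every "domain" line that follows
value 1.25
domain www.example.com
domain blog.example.org

value 0.5
domain spam.example.net
```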
```diff
@@ -40,13 +40,16 @@ public class IndexResultRankingService {
 
     private final DocumentDbReader documentDbReader;
     private final StatefulIndex statefulIndex;
+    private final DomainRankingOverrides domainRankingOverrides;
 
     @Inject
     public IndexResultRankingService(DocumentDbReader documentDbReader,
-                                     StatefulIndex statefulIndex)
+                                     StatefulIndex statefulIndex,
+                                     DomainRankingOverrides domainRankingOverrides)
     {
         this.documentDbReader = documentDbReader;
         this.statefulIndex = statefulIndex;
+        this.domainRankingOverrides = domainRankingOverrides;
     }
 
     public List<SearchResultItem> rankResults(SearchParameters params,
```
```diff
@@ -57,7 +60,7 @@ public class IndexResultRankingService {
         if (resultIds.isEmpty())
             return List.of();
 
-        IndexResultScoreCalculator resultRanker = new IndexResultScoreCalculator(statefulIndex, rankingContext, params);
+        IndexResultScoreCalculator resultRanker = new IndexResultScoreCalculator(statefulIndex, domainRankingOverrides, rankingContext, params);
 
         List<SearchResultItem> results = new ArrayList<>(resultIds.size());
 
```
```diff
@@ -156,7 +159,7 @@ public class IndexResultRankingService {
         // for the selected results, as this would be comically expensive to do for all the results we
         // discard along the way
 
-        if (params.rankingParams.exportDebugData) {
+        if (params.rankingParams.getExportDebugData()) {
             var combinedIdsList = new LongArrayList(resultsList.size());
             for (var item : resultsList) {
                 combinedIdsList.add(item.combinedId);
```
```diff
@@ -2,10 +2,11 @@ package nu.marginalia.index.results;
 
 import it.unimi.dsi.fastutil.ints.IntIterator;
 import it.unimi.dsi.fastutil.ints.IntList;
+import nu.marginalia.api.searchquery.RpcResultRankingParameters;
+import nu.marginalia.api.searchquery.RpcTemporalBias;
 import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
 import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
 import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
-import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
 import nu.marginalia.api.searchquery.model.results.SearchResultItem;
 import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
 import nu.marginalia.index.forward.spans.DocumentSpans;
```
```diff
@@ -40,14 +41,17 @@ public class IndexResultScoreCalculator {
     private final CombinedIndexReader index;
     private final QueryParams queryParams;
 
+    private final DomainRankingOverrides domainRankingOverrides;
     private final ResultRankingContext rankingContext;
     private final CompiledQuery<String> compiledQuery;
 
     public IndexResultScoreCalculator(StatefulIndex statefulIndex,
+                                      DomainRankingOverrides domainRankingOverrides,
                                       ResultRankingContext rankingContext,
                                       SearchParameters params)
     {
         this.index = statefulIndex.get();
+        this.domainRankingOverrides = domainRankingOverrides;
         this.rankingContext = rankingContext;
 
         this.queryParams = params.queryParams;
```
```diff
@@ -116,20 +120,20 @@ public class IndexResultScoreCalculator {
 
         float proximitiyFac = getProximitiyFac(decodedPositions, searchTerms.phraseConstraints, verbatimMatches, unorderedMatches, spans);
 
-        double score_firstPosition = params.tcfFirstPosition * (1.0 / Math.sqrt(unorderedMatches.firstPosition));
-        double score_verbatim = params.tcfVerbatim * verbatimMatches.getScore();
-        double score_proximity = params.tcfProximity * proximitiyFac;
-        double score_bM25 = params.bm25Weight
-                * wordFlagsQuery.root.visit(new Bm25GraphVisitor(params.bm25Params, unorderedMatches.getWeightedCounts(), docSize, rankingContext))
+        double score_firstPosition = params.getTcfFirstPositionWeight() * (1.0 / Math.sqrt(unorderedMatches.firstPosition));
+        double score_verbatim = params.getTcfVerbatimWeight() * verbatimMatches.getScore();
+        double score_proximity = params.getTcfProximityWeight() * proximitiyFac;
+        double score_bM25 = params.getBm25Weight()
+                * wordFlagsQuery.root.visit(new Bm25GraphVisitor(params.getBm25K(), params.getBm25B(), unorderedMatches.getWeightedCounts(), docSize, rankingContext))
                 / (Math.sqrt(unorderedMatches.searchableKeywordCount + 1));
-        double score_bFlags = params.bm25Weight
-                * wordFlagsQuery.root.visit(new TermFlagsGraphVisitor(params.bm25Params, wordFlagsQuery.data, unorderedMatches.getWeightedCounts(), rankingContext))
+        double score_bFlags = params.getBm25Weight()
+                * wordFlagsQuery.root.visit(new TermFlagsGraphVisitor(params.getBm25K(), wordFlagsQuery.data, unorderedMatches.getWeightedCounts(), rankingContext))
                 / (Math.sqrt(unorderedMatches.searchableKeywordCount + 1));
 
+        double rankingAdjustment = domainRankingOverrides.getRankingFactor(UrlIdCodec.getDomainId(combinedId));
 
         double score = normalize(
-                score_firstPosition + score_proximity + score_verbatim
-                        + score_bM25
-                        + score_bFlags,
+                rankingAdjustment * (score_firstPosition + score_proximity + score_verbatim + score_bM25 + score_bFlags),
                 -Math.min(0, documentBonus) // The magnitude of documentBonus, if it is negative; otherwise 0
         );
 
```
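The override factor scales the combined positive relevance signals multiplicatively before normalization; a domain absent from the overrides file gets the map's no-entry default of 1.0 and ranks exactly as before. Illustrative arithmetic with made-up numbers:

```java
class RankingAdjustmentExample {
    public static void main(String[] args) {
        double signals = 2.0;  // score_firstPosition + ... + score_bFlags
        double factor = 1.25;  // from domain-ranking-factors.txt; 1.0 if absent
        System.out.println(factor * signals); // 2.5, which is then fed to normalize(...)
    }
}
```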
```diff
@@ -245,9 +249,13 @@ public class IndexResultScoreCalculator {
     private double calculateDocumentBonus(long documentMetadata,
                                           int features,
                                           int length,
-                                          ResultRankingParameters rankingParams,
+                                          RpcResultRankingParameters rankingParams,
                                           @Nullable DebugRankingFactors debugRankingFactors) {
 
+        if (rankingParams.getDisablePenalties()) {
+            return 0.;
+        }
+
         int rank = DocumentMetadata.decodeRank(documentMetadata);
         int asl = DocumentMetadata.decodeAvgSentenceLength(documentMetadata);
         int quality = DocumentMetadata.decodeQuality(documentMetadata);
```
```diff
@@ -256,18 +264,18 @@ public class IndexResultScoreCalculator {
         int topology = DocumentMetadata.decodeTopology(documentMetadata);
         int year = DocumentMetadata.decodeYear(documentMetadata);
 
-        double averageSentenceLengthPenalty = (asl >= rankingParams.shortSentenceThreshold ? 0 : -rankingParams.shortSentencePenalty);
+        double averageSentenceLengthPenalty = (asl >= rankingParams.getShortSentenceThreshold() ? 0 : -rankingParams.getShortSentencePenalty());
 
         final double qualityPenalty = calculateQualityPenalty(size, quality, rankingParams);
-        final double rankingBonus = (255. - rank) * rankingParams.domainRankBonus;
+        final double rankingBonus = (255. - rank) * rankingParams.getDomainRankBonus();
         final double topologyBonus = Math.log(1 + topology);
-        final double documentLengthPenalty = length > rankingParams.shortDocumentThreshold ? 0 : -rankingParams.shortDocumentPenalty;
+        final double documentLengthPenalty = length > rankingParams.getShortDocumentThreshold() ? 0 : -rankingParams.getShortDocumentPenalty();
         final double temporalBias;
 
-        if (rankingParams.temporalBias == ResultRankingParameters.TemporalBias.RECENT) {
-            temporalBias = - Math.abs(year - PubDate.MAX_YEAR) * rankingParams.temporalBiasWeight;
-        } else if (rankingParams.temporalBias == ResultRankingParameters.TemporalBias.OLD) {
-            temporalBias = - Math.abs(year - PubDate.MIN_YEAR) * rankingParams.temporalBiasWeight;
+        if (rankingParams.getTemporalBias().getBias() == RpcTemporalBias.Bias.RECENT) {
+            temporalBias = - Math.abs(year - PubDate.MAX_YEAR) * rankingParams.getTemporalBiasWeight();
+        } else if (rankingParams.getTemporalBias().getBias() == RpcTemporalBias.Bias.OLD) {
+            temporalBias = - Math.abs(year - PubDate.MIN_YEAR) * rankingParams.getTemporalBiasWeight();
         } else {
             temporalBias = 0;
         }
```
```diff
@@ -506,14 +514,14 @@ public class IndexResultScoreCalculator {
     }
 
 
-    private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) {
+    private double calculateQualityPenalty(int size, int quality, RpcResultRankingParameters rankingParams) {
         if (size < 400) {
             if (quality < 5)
                 return 0;
-            return -quality * rankingParams.qualityPenalty;
+            return -quality * rankingParams.getQualityPenalty();
         }
         else {
-            return -quality * rankingParams.qualityPenalty * 20;
+            return -quality * rankingParams.getQualityPenalty() * 20;
         }
     }
 
```
```diff
@@ -575,3 +583,4 @@ public class IndexResultScoreCalculator {
     }
 
 }
+
```
```diff
@@ -3,7 +3,6 @@ package nu.marginalia.index.results;
 import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
 import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
 import nu.marginalia.api.searchquery.model.compiled.CqExpression;
-import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
 import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
 import nu.marginalia.model.idx.WordFlags;
 
```
```diff
@@ -15,15 +14,14 @@ public class TermFlagsGraphVisitor implements CqExpression.DoubleVisitor {
     private final CqDataLong wordMetaData;
     private final CqDataInt frequencies;
     private final float[] counts;
-    private final Bm25Parameters bm25Parameters;
+    private final double k1;
 
     private final int docCount;
 
-    public TermFlagsGraphVisitor(Bm25Parameters bm25Parameters,
+    public TermFlagsGraphVisitor(double k1,
                                  CqDataLong wordMetaData,
                                  float[] counts,
                                  ResultRankingContext ctx) {
-        this.bm25Parameters = bm25Parameters;
+        this.k1 = k1;
         this.counts = counts;
         this.docCount = ctx.termFreqDocCount();
         this.wordMetaData = wordMetaData;
```
```diff
@@ -55,7 +53,7 @@ public class TermFlagsGraphVisitor implements CqExpression.DoubleVisitor {
         int freq = frequencies.get(idx);
 
         // note we override b to zero for priority terms as they are independent of document length
-        return invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0);
+        return invFreq(docCount, freq) * f(k1, 0, count, 0);
     }
 
     private double evaluatePriorityScore(int idx) {
```
```diff
@@ -1,7 +0,0 @@
-package nu.marginalia.index.query.limit;
-
-public record QueryLimits(int resultsByDomain, int resultsTotal, int timeoutMs, int fetchSize) {
-    public QueryLimits forSingleDomain() {
-        return new QueryLimits(resultsTotal, resultsTotal, timeoutMs, fetchSize);
-    }
-}
```
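The deleted QueryLimits record is superseded by the protobuf-generated RpcQueryLimits message; call sites move from the positional constructor to named builder setters, as the test changes below show. A sketch of the mapping, keeping the record's component order (resultsByDomain, resultsTotal, timeoutMs, fetchSize):

```java
import nu.marginalia.api.searchquery.RpcQueryLimits;

class QueryLimitsMigrationExample {
    static RpcQueryLimits limits() {
        // before: new QueryLimits(10, 10, Integer.MAX_VALUE, 4000)
        return RpcQueryLimits.newBuilder()
                .setResultsByDomain(10)
                .setResultsTotal(10)
                .setTimeoutMs(Integer.MAX_VALUE)
                .setFetchSize(4000)
                .build();
    }
}
```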
```diff
@@ -4,10 +4,11 @@ import com.google.inject.Guice;
 import com.google.inject.Inject;
 import nu.marginalia.IndexLocations;
 import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
+import nu.marginalia.api.searchquery.RpcQueryLimits;
 import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
 import nu.marginalia.api.searchquery.model.query.SearchQuery;
 import nu.marginalia.api.searchquery.model.query.SearchSpecification;
-import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
+import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
 import nu.marginalia.index.construction.DocIdRewriter;
 import nu.marginalia.index.construction.full.FullIndexConstructor;
 import nu.marginalia.index.construction.prio.PrioIndexConstructor;
```
```diff
@@ -17,7 +18,6 @@ import nu.marginalia.index.forward.construction.ForwardIndexConverter;
 import nu.marginalia.index.index.StatefulIndex;
 import nu.marginalia.index.journal.IndexJournal;
 import nu.marginalia.index.journal.IndexJournalSlopWriter;
-import nu.marginalia.index.query.limit.QueryLimits;
 import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.index.query.limit.SpecificationLimit;
 import nu.marginalia.linkdb.docs.DocumentDbReader;
```
```diff
@@ -115,9 +115,16 @@ public class IndexQueryServiceIntegrationSmokeTest {
 
         var rsp = queryService.justQuery(
                 SearchSpecification.builder()
-                        .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
+                        .queryLimits(
+                                RpcQueryLimits.newBuilder()
+                                        .setResultsByDomain(10)
+                                        .setResultsTotal(10)
+                                        .setTimeoutMs(Integer.MAX_VALUE)
+                                        .setFetchSize(4000)
+                                        .build()
+                        )
                         .queryStrategy(QueryStrategy.SENTENCE)
-                        .rankingParams(ResultRankingParameters.sensibleDefaults())
+                        .rankingParams(PrototypeRankingParameters.sensibleDefaults())
                         .domains(new ArrayList<>())
                         .searchSetIdentifier("NONE")
                         .query(
```
```diff
@@ -171,9 +178,16 @@ public class IndexQueryServiceIntegrationSmokeTest {
 
         var rsp = queryService.justQuery(
                 SearchSpecification.builder()
-                        .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
+                        .queryLimits(
+                                RpcQueryLimits.newBuilder()
+                                        .setResultsByDomain(10)
+                                        .setResultsTotal(10)
+                                        .setTimeoutMs(Integer.MAX_VALUE)
+                                        .setFetchSize(4000)
+                                        .build()
+                        )
                         .queryStrategy(QueryStrategy.SENTENCE)
-                        .rankingParams(ResultRankingParameters.sensibleDefaults())
+                        .rankingParams(PrototypeRankingParameters.sensibleDefaults())
                         .domains(new ArrayList<>())
                         .searchSetIdentifier("NONE")
                         .query(
```
```diff
@@ -225,8 +239,15 @@ public class IndexQueryServiceIntegrationSmokeTest {
 
         var rsp = queryService.justQuery(
                 SearchSpecification.builder()
-                        .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
-                        .rankingParams(ResultRankingParameters.sensibleDefaults())
+                        .queryLimits(
+                                RpcQueryLimits.newBuilder()
+                                        .setResultsByDomain(10)
+                                        .setResultsTotal(10)
+                                        .setTimeoutMs(Integer.MAX_VALUE)
+                                        .setFetchSize(4000)
+                                        .build()
+                        )
+                        .rankingParams(PrototypeRankingParameters.sensibleDefaults())
                         .queryStrategy(QueryStrategy.SENTENCE)
                         .domains(List.of(2))
                         .query(
```
```diff
@@ -282,11 +303,18 @@ public class IndexQueryServiceIntegrationSmokeTest {
 
         var rsp = queryService.justQuery(
                 SearchSpecification.builder()
-                        .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
+                        .queryLimits(
+                                RpcQueryLimits.newBuilder()
+                                        .setResultsByDomain(10)
+                                        .setResultsTotal(10)
+                                        .setTimeoutMs(Integer.MAX_VALUE)
+                                        .setFetchSize(4000)
+                                        .build()
+                        )
                         .year(SpecificationLimit.equals(1998))
                         .queryStrategy(QueryStrategy.SENTENCE)
                         .searchSetIdentifier("NONE")
-                        .rankingParams(ResultRankingParameters.sensibleDefaults())
+                        .rankingParams(PrototypeRankingParameters.sensibleDefaults())
                         .query(
                                 SearchQuery.builder()
                                         .compiledQuery("4")
```
Some files were not shown because too many files have changed in this diff.