mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
462 Commits
deploy-021
...
export-dom
Author | SHA1 | Date | |
---|---|---|---|
|
3406b3c1e9 | ||
|
f2eaf5188d | ||
|
adc815e282 | ||
|
ca8455e049 | ||
|
4ea724d2cb | ||
|
40600e7297 | ||
|
7795742538 | ||
|
82d33ce69b | ||
|
e49cc5c244 | ||
|
0af389ad93 | ||
|
48791f56bd | ||
|
be83726427 | ||
|
708caa8791 | ||
|
32394f42b9 | ||
|
b8e3445ce0 | ||
|
17a78a7b7e | ||
|
5a75dd8093 | ||
|
a9713347a0 | ||
|
4694d36ed2 | ||
|
70bdd1f51e | ||
|
187b4828e6 | ||
|
93fc14dc94 | ||
|
fbfea8539b | ||
|
0929d77247 | ||
|
db8f8c1f55 | ||
|
dcb2723386 | ||
|
00c1f495f6 | ||
|
73a923983a | ||
|
e9ed0c5669 | ||
|
5b2bec6144 | ||
|
f26bb8e2b1 | ||
|
4455495dc6 | ||
|
b84d17aa51 | ||
|
9d008390ae | ||
|
a40c2a8146 | ||
|
a3416bf48e | ||
|
ee2461d9fc | ||
|
54c91a84e3 | ||
|
a6371fc54c | ||
|
8faa9a572d | ||
|
fdce940263 | ||
|
af8a13a7fb | ||
|
9e332de6b4 | ||
|
d457bb5d44 | ||
|
c661ebb619 | ||
|
53e744398a | ||
|
1d71baf3e5 | ||
|
bb5fc0f348 | ||
|
c8f112d040 | ||
|
ae31bc8498 | ||
|
da5046c3bf | ||
|
f67257baf2 | ||
|
924fb05661 | ||
|
c231a82062 | ||
|
2c1082d7f0 | ||
|
06947bd026 | ||
|
519aebd7c6 | ||
|
42cc27586e | ||
|
360881fafd | ||
|
4c6fdf6ebe | ||
|
554de21f68 | ||
|
00194acbfe | ||
|
97dabcefaa | ||
|
cc790644d4 | ||
|
8f893ee6c0 | ||
|
938721b793 | ||
|
f68bcefc75 | ||
|
0cfd759f85 | ||
|
b53002200c | ||
|
78246b9a63 | ||
|
b552e79927 | ||
|
bffc159486 | ||
|
1432fc87d7 | ||
|
edd453531e | ||
|
096496ada1 | ||
|
8ca6209260 | ||
|
673c65d3c9 | ||
|
acb9ec7b15 | ||
|
47079e05db | ||
|
c93056e77f | ||
|
6f7530e807 | ||
|
87ce4a1b52 | ||
|
52194cbe7a | ||
|
fd1ac03c78 | ||
|
5e5b86efb4 | ||
|
f332ec6191 | ||
|
c25c1af437 | ||
|
eb0c911b45 | ||
|
1979870ce4 | ||
|
0ba2ea38e1 | ||
|
d6cfbceeea | ||
|
e369d200cc | ||
|
946d64c8da | ||
|
42f043a60f | ||
|
b46f2e1407 | ||
|
18aa1b9764 | ||
|
2f3950e0d5 | ||
|
61d803869e | ||
|
df6434d177 | ||
|
59519ed7c4 | ||
|
874fc2d250 | ||
|
69e8ec0eef | ||
|
a7eb5f54e6 | ||
|
b29ba3e228 | ||
|
5fa5029c60 | ||
|
4257f60f00 | ||
|
ce221d3a0e | ||
|
f0741142a3 | ||
|
0899e4d895 | ||
|
bbf7c5a1cb | ||
|
686a40e69b | ||
|
8af254f44f | ||
|
2c21bd9287 | ||
|
f9645e2f00 | ||
|
81e311b558 | ||
|
507c09146a | ||
|
f682425594 | ||
|
de67006c4f | ||
|
eea32bb7b4 | ||
|
e976940a4e | ||
|
b564b33028 | ||
|
1cca16a58e | ||
|
70b4ed6d81 | ||
|
45dc6412c1 | ||
|
b3b95edcb5 | ||
|
338d300e1a | ||
|
fa685bf1f4 | ||
|
d79a3e2b2a | ||
|
854382b2be | ||
|
8710adbc2a | ||
|
acdf7b4785 | ||
|
b5d27c1406 | ||
|
55eb7dc116 | ||
|
f0e8bc8baf | ||
|
91a6ad2337 | ||
|
9a182b9ddb | ||
|
fefbcf15ce | ||
|
9a789bf62d | ||
|
0525303b68 | ||
|
6953d65de5 | ||
|
a7a18ced2e | ||
|
7c94c941b2 | ||
|
ea99b62356 | ||
|
3dc21d34d8 | ||
|
51912e0176 | ||
|
de1b4d5372 | ||
|
50ac926060 | ||
|
d711ee75b5 | ||
|
291ff0c4de | ||
|
2fd2710355 | ||
|
e3b957063d | ||
|
aee262e5f6 | ||
|
4a98a3c711 | ||
|
68f52ca350 | ||
|
2a2d951c2f | ||
|
379a1be074 | ||
|
827aadafcd | ||
|
aa7679d6ce | ||
|
6fe6de766d | ||
|
4245ac4c07 | ||
|
1c49a0f5ad | ||
|
9a6e5f646d | ||
|
fa92994a31 | ||
|
bc49406881 | ||
|
90325be447 | ||
|
dc89587af3 | ||
|
7b552afd6b | ||
|
73557edc67 | ||
|
83919e448a | ||
|
6f5b75b84d | ||
|
db315e2813 | ||
|
e9977e08b7 | ||
|
1df3757e5f | ||
|
ca283f9684 | ||
|
85360e61b2 | ||
|
e2ccff21bc | ||
|
c5b5b0c699 | ||
|
9a65946e22 | ||
|
1d2ab21e27 | ||
|
0610cc19ad | ||
|
a676306a7f | ||
|
8d68cd14fb | ||
|
4773c5a52b | ||
|
74bd562ae4 | ||
|
c9751287b0 | ||
|
5da24e3fc4 | ||
|
20a4e86eec | ||
|
477a184948 | ||
|
8940ce99db | ||
|
0ac0fa4dca | ||
|
942f15ef14 | ||
|
f668f33d5b | ||
|
6789975cd2 | ||
|
c3ba608776 | ||
|
733d2687fe | ||
|
f6daac8ed0 | ||
|
c2eeee4a06 | ||
|
3b0c701df4 | ||
|
c6fb2db43b | ||
|
9bc8fe05ae | ||
|
440ffcf6f8 | ||
|
b07709cc72 | ||
|
9a6acdcbe0 | ||
|
23b9b0bf1b | ||
|
749c8ed954 | ||
|
9f4b6939ca | ||
|
1d08e44e8d | ||
|
fc2e156e78 | ||
|
5e68a89e9f | ||
|
d380661307 | ||
|
cccdf5c329 | ||
|
f085b4ea12 | ||
|
e208f7d3ba | ||
|
b577085cb2 | ||
|
b9240476f6 | ||
|
8f50f86d0b | ||
|
e3b7ead7a9 | ||
|
9a845ba604 | ||
|
b9381f1603 | ||
|
6a60127267 | ||
|
e8ffcfbb19 | ||
|
caf0850f81 | ||
|
62e3bb675e | ||
|
4dc3e7da7a | ||
|
92b09883ec | ||
|
87082b4ef8 | ||
|
84d3f6087f | ||
|
f93ba371a5 | ||
|
5eec27c68d | ||
|
ab01576f91 | ||
|
054e5ccf44 | ||
|
4351ea5128 | ||
|
49cfa3a5e9 | ||
|
683854b23f | ||
|
e880fa8945 | ||
|
2482dc572e | ||
|
4589f11898 | ||
|
e43b6e610b | ||
|
4772117a1f | ||
|
3fc7ea521c | ||
|
4372f5af03 | ||
|
4ad89b6c75 | ||
|
ad0519e031 | ||
|
596ece1230 | ||
|
07b6e1585b | ||
|
cb5e2778eb | ||
|
8f5ea7896c | ||
|
76c398e0b1 | ||
|
4a94f04a8d | ||
|
df72f670d4 | ||
|
eaa22c2f5a | ||
|
7be173aeca | ||
|
36685bdca7 | ||
|
ad04057609 | ||
|
eb76ae22e2 | ||
|
4b858ab341 | ||
|
c6e3c8aa3b | ||
|
9128d3907c | ||
|
4ef16d13d4 | ||
|
838a5626ec | ||
|
6b426209c7 | ||
|
452b5731d9 | ||
|
c91cf49630 | ||
|
8503030f18 | ||
|
744f7d3ef7 | ||
|
215e12afe9 | ||
|
2716bce918 | ||
|
caf2e6fbb7 | ||
|
233f0acfb1 | ||
|
e3a4ff02e9 | ||
|
c786283ae1 | ||
|
a3f65ac0e0 | ||
|
aba1a32af0 | ||
|
c9c442345b | ||
|
2e126ba30e | ||
|
2087985f49 | ||
|
2b13ebd18b | ||
|
6d92c125fe | ||
|
f638cfa39a | ||
|
89447c12af | ||
|
c71fc46f04 | ||
|
f96874d828 | ||
|
583a84d5a0 | ||
|
f65b946448 | ||
|
3682815855 | ||
|
3a94357660 | ||
|
673b0d3de1 | ||
|
ea942bc664 | ||
|
7ed5083c54 | ||
|
08bb2c097b | ||
|
495fb325be | ||
|
05c25bbaec | ||
|
2a028b84f3 | ||
|
a091a23623 | ||
|
e8897acb45 | ||
|
b89ffcf2be | ||
|
dbcc9055b0 | ||
|
d9740557f4 | ||
|
0d6cd015fd | ||
|
c6034efcc8 | ||
|
76068014ad | ||
|
1c3ed67127 | ||
|
fc0cb6bd9a | ||
|
c2601bac78 | ||
|
f5641b72e9 | ||
|
36efe2e219 | ||
|
983fe3829e | ||
|
668c87aa86 | ||
|
9d3f9adb05 | ||
|
a43a1773f1 | ||
|
1e7a3a3c4f | ||
|
62b696b1c3 | ||
|
f1a900f383 | ||
|
700364b86d | ||
|
7e725ddaed | ||
|
120209e138 | ||
|
a771a5b6ce | ||
|
dac5b54128 | ||
|
6cfb143c15 | ||
|
23c818281b | ||
|
8aad253cf6 | ||
|
556d7af9dc | ||
|
b7a5219ed3 | ||
|
a23ec521fe | ||
|
fff3babc6d | ||
|
b2bfb8217c | ||
|
3b2ac414dc | ||
|
0ba6515a01 | ||
|
16c6b0f151 | ||
|
e998692900 | ||
|
eeb1695a87 | ||
|
a0ab910940 | ||
|
b9f31048d7 | ||
|
12c304289a | ||
|
6ee01dabea | ||
|
1b80e282a7 | ||
|
a65d18f1d1 | ||
|
90a1ff220b | ||
|
d6c7092335 | ||
|
b716333856 | ||
|
b504b8482c | ||
|
80da1e9ad1 | ||
|
d3f744a441 | ||
|
60fb539875 | ||
|
7f5094fedf | ||
|
45066636a5 | ||
|
e2d6898c51 | ||
|
58ef767b94 | ||
|
f9f268c67a | ||
|
f44c2bdee9 | ||
|
6fdf477c18 | ||
|
6b6e455e3f | ||
|
a3a126540c | ||
|
842b19da40 | ||
|
2a30e93bf0 | ||
|
3d998f12c0 | ||
|
cbccc2ac23 | ||
|
2cfc23f9b7 | ||
|
88fe394cdb | ||
|
f30fcebd4f | ||
|
5d885927b4 | ||
|
7622c8358e | ||
|
69ed9aef47 | ||
|
4c78c223da | ||
|
71b9935dd6 | ||
|
ad38f2fd83 | ||
|
9c47388846 | ||
|
d9ab10e33f | ||
|
e13ea7f42b | ||
|
f38daeb036 | ||
|
6e214293e5 | ||
|
52582a6d7d | ||
|
ec0e39ad32 | ||
|
6a15aee4b0 | ||
|
bd5111e8a2 | ||
|
1ecbeb0272 | ||
|
b91354925d | ||
|
3f85c9c154 | ||
|
390f053406 | ||
|
89e03d6914 | ||
|
14e0bc9f26 | ||
|
7065b46c6f | ||
|
0372190c90 | ||
|
ceaf32fb90 | ||
|
b03c43224c | ||
|
9b4ce9e9eb | ||
|
81ac02a695 | ||
|
47f624fb3b | ||
|
b57db01415 | ||
|
ce7d522608 | ||
|
18649b6ee9 | ||
|
f6417aef1a | ||
|
2aa7e376b0 | ||
|
f33bc44860 | ||
|
a2826efd44 | ||
|
c866f19cbb | ||
|
518278493b | ||
|
1ac0bab0b8 | ||
|
08b45ed10a | ||
|
f2cfb91973 | ||
|
2f79524eb3 | ||
|
3b00142c96 | ||
|
294ab19177 | ||
|
6f1659ecb2 | ||
|
982dcb28f0 | ||
|
fc686d8b2e | ||
|
69ef0f334a | ||
|
446746f3bd | ||
|
24ab8398bb | ||
|
d2ceeff4cf | ||
|
cf64214b1c | ||
|
e50d09cc01 | ||
|
bce3892ce0 | ||
|
36581b25c2 | ||
|
52ff7fb4dd | ||
|
a4e49e658a | ||
|
e2c56dc3ca | ||
|
470b866008 | ||
|
4895a2ac7a | ||
|
fd32ae9fa7 | ||
|
470651ea4c | ||
|
8d4829e783 | ||
|
1290bc15dc | ||
|
e7fa558954 | ||
|
720685bf3f | ||
|
cbec63c7da | ||
|
b03ca75785 | ||
|
184aedc071 | ||
|
0275bad281 | ||
|
fd83a9d0b8 | ||
|
d556f8ae3a | ||
|
e37559837b | ||
|
3564c4aaee | ||
|
92c54563ab | ||
|
d7a5d90b07 | ||
|
0a0e88fd6e | ||
|
b4fc0c4368 | ||
|
87ee8765b8 | ||
|
1adf4835fa | ||
|
b7b5d0bf46 | ||
|
416059adde | ||
|
db7930016a | ||
|
82456ad673 | ||
|
0882a6d9cd | ||
|
5020029c2d | ||
|
ac44d0b093 | ||
|
4b32b9b10e | ||
|
9f041d6631 | ||
|
13fb1efce4 | ||
|
c1225165b7 | ||
|
67ad7a3bbc | ||
|
ed62ec8a35 | ||
|
42b24cfa34 | ||
|
1ffaab2da6 | ||
|
5f93c7f767 | ||
|
4001c68c82 | ||
|
6b811489c5 | ||
|
e9d317c65d | ||
|
16b05a4737 | ||
|
021cd73cbb | ||
|
4253bd53b5 | ||
|
14c87461a5 |
7
.gitignore
vendored
7
.gitignore
vendored
@@ -8,3 +8,10 @@ lombok.config
|
||||
Dockerfile
|
||||
run
|
||||
jte-classes
|
||||
.classpath
|
||||
.project
|
||||
.settings
|
||||
.factorypath
|
||||
bin/
|
||||
*.log
|
||||
*.hprof
|
||||
|
@@ -48,10 +48,6 @@ filter for any API consumer.
|
||||
|
||||
I've talked to the Stract dev, and he thinks that rather than mimicking their optics language, which is quite ad-hoc, it would be better to work together to find some new common description language for this.
|
||||
|
||||
## Show favicons next to search results
|
||||
|
||||
This is expected from search engines. Basic proof of concept sketch of fetching this data has been done, but the feature is some way from being reality.
|
||||
|
||||
## Specialized crawler for github
|
||||
|
||||
One of the search engine's biggest limitations right now is that it does not index github at all. A specialized crawler that fetches at least the readme.md would go a long way toward providing search capabilities in this domain.
|
||||
@@ -66,6 +62,10 @@ The documents database probably should have some sort of flag indicating it's a
|
||||
PDF parsing is known to be a bit of a security liability so some thought needs to be put in
|
||||
that direction as well.
|
||||
|
||||
## Show favicons next to search results (COMPLETED 2025-03)
|
||||
|
||||
This is expected from search engines. A basic proof-of-concept sketch of fetching this data was done first, and the feature has since been implemented.
|
||||
|
||||
## Web Design Overhaul (COMPLETED 2025-01)
|
||||
|
||||
The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
|
||||
|
@@ -6,6 +6,7 @@ plugins {
|
||||
// This is a workaround for a bug in the Jib plugin that causes it to stall randomly
|
||||
// https://github.com/GoogleContainerTools/jib/issues/3347
|
||||
id 'com.google.cloud.tools.jib' version '3.4.5' apply(false)
|
||||
id 'com.adarshr.test-logger' version '4.0.0'
|
||||
}
|
||||
|
||||
group 'marginalia'
|
||||
@@ -31,7 +32,10 @@ subprojects.forEach {it ->
|
||||
jvmArgs += ['--enable-preview']
|
||||
}
|
||||
it.tasks.withType(Test).configureEach {
|
||||
jvmArgs += ['--enable-preview']
|
||||
jvmArgs += ['--enable-preview',
|
||||
'--enable-native-access=ALL-UNNAMED',
|
||||
'--sun-misc-unsafe-memory-access=allow',
|
||||
'-Dsystem.uringQueueCount=1']
|
||||
}
|
||||
|
||||
// Enable reproducible builds for the entire project
|
||||
|
@@ -114,4 +114,7 @@ public class WmsaHome {
|
||||
}
|
||||
|
||||
|
||||
public static Path getLangugeConfig() {
|
||||
return getHomePath().resolve("conf/languages.xml");
|
||||
}
|
||||
}
|
||||
|
@@ -45,7 +45,7 @@ public class NodeConfigurationService {
|
||||
public List<NodeConfiguration> getAll() {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var qs = conn.prepareStatement("""
|
||||
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, NODE_PROFILE, DISABLED
|
||||
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, AUTO_ASSIGN_DOMAINS, KEEP_WARCS, NODE_PROFILE, DISABLED
|
||||
FROM NODE_CONFIGURATION
|
||||
""")) {
|
||||
var rs = qs.executeQuery();
|
||||
@@ -59,6 +59,7 @@ public class NodeConfigurationService {
|
||||
rs.getBoolean("ACCEPT_QUERIES"),
|
||||
rs.getBoolean("AUTO_CLEAN"),
|
||||
rs.getBoolean("PRECESSION"),
|
||||
rs.getBoolean("AUTO_ASSIGN_DOMAINS"),
|
||||
rs.getBoolean("KEEP_WARCS"),
|
||||
NodeProfile.valueOf(rs.getString("NODE_PROFILE")),
|
||||
rs.getBoolean("DISABLED")
|
||||
@@ -75,7 +76,7 @@ public class NodeConfigurationService {
|
||||
public NodeConfiguration get(int nodeId) throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var qs = conn.prepareStatement("""
|
||||
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, NODE_PROFILE, DISABLED
|
||||
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, AUTO_ASSIGN_DOMAINS, KEEP_WARCS, NODE_PROFILE, DISABLED
|
||||
FROM NODE_CONFIGURATION
|
||||
WHERE ID=?
|
||||
""")) {
|
||||
@@ -88,6 +89,7 @@ public class NodeConfigurationService {
|
||||
rs.getBoolean("ACCEPT_QUERIES"),
|
||||
rs.getBoolean("AUTO_CLEAN"),
|
||||
rs.getBoolean("PRECESSION"),
|
||||
rs.getBoolean("AUTO_ASSIGN_DOMAINS"),
|
||||
rs.getBoolean("KEEP_WARCS"),
|
||||
NodeProfile.valueOf(rs.getString("NODE_PROFILE")),
|
||||
rs.getBoolean("DISABLED")
|
||||
@@ -102,7 +104,7 @@ public class NodeConfigurationService {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var us = conn.prepareStatement("""
|
||||
UPDATE NODE_CONFIGURATION
|
||||
SET DESCRIPTION=?, ACCEPT_QUERIES=?, AUTO_CLEAN=?, PRECESSION=?, KEEP_WARCS=?, DISABLED=?, NODE_PROFILE=?
|
||||
SET DESCRIPTION=?, ACCEPT_QUERIES=?, AUTO_CLEAN=?, PRECESSION=?, AUTO_ASSIGN_DOMAINS=?, KEEP_WARCS=?, DISABLED=?, NODE_PROFILE=?
|
||||
WHERE ID=?
|
||||
"""))
|
||||
{
|
||||
@@ -110,10 +112,11 @@ public class NodeConfigurationService {
|
||||
us.setBoolean(2, config.acceptQueries());
|
||||
us.setBoolean(3, config.autoClean());
|
||||
us.setBoolean(4, config.includeInPrecession());
|
||||
us.setBoolean(5, config.keepWarcs());
|
||||
us.setBoolean(6, config.disabled());
|
||||
us.setString(7, config.profile().name());
|
||||
us.setInt(8, config.node());
|
||||
us.setBoolean(5, config.autoAssignDomains());
|
||||
us.setBoolean(6, config.keepWarcs());
|
||||
us.setBoolean(7, config.disabled());
|
||||
us.setString(8, config.profile().name());
|
||||
us.setInt(9, config.node());
|
||||
|
||||
if (us.executeUpdate() <= 0)
|
||||
throw new IllegalStateException("Failed to update configuration");
|
||||
|
@@ -5,6 +5,7 @@ public record NodeConfiguration(int node,
|
||||
boolean acceptQueries,
|
||||
boolean autoClean,
|
||||
boolean includeInPrecession,
|
||||
boolean autoAssignDomains,
|
||||
boolean keepWarcs,
|
||||
NodeProfile profile,
|
||||
boolean disabled
|
||||
|
@@ -20,9 +20,7 @@ public enum NodeProfile {
|
||||
}
|
||||
|
||||
public boolean permitBatchCrawl() {
|
||||
return isBatchCrawl() ||isMixed();
|
||||
}
|
||||
public boolean permitSideload() {
|
||||
return isMixed() || isSideload();
|
||||
return isBatchCrawl() || isMixed();
|
||||
}
|
||||
public boolean permitSideload() { return isSideload() || isMixed(); }
|
||||
}
|
||||
|
@@ -7,6 +7,7 @@
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ProcessConsole" target="SYSTEM_OUT">
|
||||
@@ -15,7 +16,7 @@
|
||||
<MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
<RollingFile name="LogToFileService" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<JSONLayout compact="true" eventEol="true" properties="true" stacktraceAsString="true" includeTimeMillis="true"/>
|
||||
<Filters>
|
||||
@@ -23,10 +24,11 @@
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
<SizeBasedTriggeringPolicy size="10MB" />
|
||||
</RollingFile>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
<RollingFile name="LogToFileCrawler" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<PatternLayout>
|
||||
<Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
|
||||
@@ -36,6 +38,16 @@
|
||||
<MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</RollingFile>
|
||||
<RollingFile name="LogToFileConverter" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/converter-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/converter-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<PatternLayout>
|
||||
<Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
|
||||
</PatternLayout>
|
||||
<SizeBasedTriggeringPolicy size="100MB" />
|
||||
<Filters>
|
||||
<MarkerFilter marker="CONVERTER" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</RollingFile>
|
||||
</Appenders>
|
||||
<Loggers>
|
||||
<Logger name="org.apache.zookeeper" level="WARN" />
|
||||
@@ -44,7 +56,9 @@
|
||||
<Root level="info">
|
||||
<AppenderRef ref="Console"/>
|
||||
<AppenderRef ref="ProcessConsole"/>
|
||||
<AppenderRef ref="LogToFile"/>
|
||||
<AppenderRef ref="LogToFileService"/>
|
||||
<AppenderRef ref="LogToFileCrawler"/>
|
||||
<AppenderRef ref="LogToFileConverter"/>
|
||||
</Root>
|
||||
</Loggers>
|
||||
</Configuration>
|
@@ -8,6 +8,7 @@
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ConsoleWarn" target="SYSTEM_OUT">
|
||||
@@ -18,6 +19,7 @@
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ConsoleError" target="SYSTEM_OUT">
|
||||
@@ -28,6 +30,7 @@
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ConsoleFatal" target="SYSTEM_OUT">
|
||||
@@ -38,6 +41,7 @@
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ProcessConsole" target="SYSTEM_OUT">
|
||||
@@ -46,7 +50,7 @@
|
||||
<MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
<RollingFile name="LogToFileService" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<PatternLayout>
|
||||
<Pattern>%-5level %d{yyyy-MM-dd HH:mm:ss,SSS} %-20t %-20c{1}: %msg{nolookups}%n</Pattern>
|
||||
@@ -57,9 +61,10 @@
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</RollingFile>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
<RollingFile name="LogToFileCrawler" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<PatternLayout>
|
||||
<Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
|
||||
@@ -69,6 +74,16 @@
|
||||
<MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</RollingFile>
|
||||
<RollingFile name="LogToFileConverter" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/converter-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/converter-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<PatternLayout>
|
||||
<Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
|
||||
</PatternLayout>
|
||||
<SizeBasedTriggeringPolicy size="100MB" />
|
||||
<Filters>
|
||||
<MarkerFilter marker="CONVERTER" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</RollingFile>
|
||||
</Appenders>
|
||||
<Loggers>
|
||||
<Logger name="org.apache.zookeeper" level="WARN" />
|
||||
@@ -80,7 +95,9 @@
|
||||
<AppenderRef ref="ConsoleError"/>
|
||||
<AppenderRef ref="ConsoleFatal"/>
|
||||
<AppenderRef ref="ProcessConsole"/>
|
||||
<AppenderRef ref="LogToFile"/>
|
||||
<AppenderRef ref="LogToFileService"/>
|
||||
<AppenderRef ref="LogToFileConverter"/>
|
||||
<AppenderRef ref="LogToFileCrawler"/>
|
||||
</Root>
|
||||
</Loggers>
|
||||
</Configuration>
|
@@ -2,6 +2,7 @@ package nu.marginalia.nodecfg;
|
||||
|
||||
import com.zaxxer.hikari.HikariConfig;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.nodecfg.model.NodeConfiguration;
|
||||
import nu.marginalia.nodecfg.model.NodeProfile;
|
||||
import nu.marginalia.test.TestMigrationLoader;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
@@ -62,6 +63,63 @@ public class NodeConfigurationServiceTest {
|
||||
assertEquals(2, list.size());
|
||||
assertEquals(a, list.get(0));
|
||||
assertEquals(b, list.get(1));
|
||||
}
|
||||
|
||||
|
||||
// Test all the fields that are only exposed via save()
|
||||
@Test
|
||||
public void testSaveChanges() throws SQLException {
|
||||
var original = nodeConfigurationService.create(1, "Test", false, false, NodeProfile.MIXED);
|
||||
|
||||
assertEquals(1, original.node());
|
||||
assertEquals("Test", original.description());
|
||||
assertFalse(original.acceptQueries());
|
||||
|
||||
var precession = new NodeConfiguration(
|
||||
original.node(),
|
||||
"Foo",
|
||||
true,
|
||||
original.autoClean(),
|
||||
original.includeInPrecession(),
|
||||
!original.autoAssignDomains(),
|
||||
original.keepWarcs(),
|
||||
original.profile(),
|
||||
original.disabled()
|
||||
);
|
||||
|
||||
nodeConfigurationService.save(precession);
|
||||
precession = nodeConfigurationService.get(original.node());
|
||||
assertNotEquals(original.autoAssignDomains(), precession.autoAssignDomains());
|
||||
|
||||
var autoClean = new NodeConfiguration(
|
||||
original.node(),
|
||||
"Foo",
|
||||
true,
|
||||
!original.autoClean(),
|
||||
original.includeInPrecession(),
|
||||
original.autoAssignDomains(),
|
||||
original.keepWarcs(),
|
||||
original.profile(),
|
||||
original.disabled()
|
||||
);
|
||||
|
||||
nodeConfigurationService.save(autoClean);
|
||||
autoClean = nodeConfigurationService.get(original.node());
|
||||
assertNotEquals(original.autoClean(), autoClean.autoClean());
|
||||
|
||||
var disabled = new NodeConfiguration(
|
||||
original.node(),
|
||||
"Foo",
|
||||
true,
|
||||
autoClean.autoClean(),
|
||||
autoClean.includeInPrecession(),
|
||||
autoClean.autoAssignDomains(),
|
||||
autoClean.keepWarcs(),
|
||||
autoClean.profile(),
|
||||
!autoClean.disabled()
|
||||
);
|
||||
nodeConfigurationService.save(disabled);
|
||||
disabled = nodeConfigurationService.get(original.node());
|
||||
assertNotEquals(autoClean.disabled(), disabled.disabled());
|
||||
}
|
||||
}
|
@@ -0,0 +1,6 @@
|
||||
-- Add additional summary columns to DOMAIN_SECURITY_EVENTS table
|
||||
-- to make it easier to make sense of certificate changes
|
||||
|
||||
ALTER TABLE DOMAIN_SECURITY_EVENTS ADD COLUMN CHANGE_CERTIFICATE_SERIAL_NUMBER BOOLEAN NOT NULL DEFAULT FALSE;
|
||||
ALTER TABLE DOMAIN_SECURITY_EVENTS ADD COLUMN CHANGE_CERTIFICATE_ISSUER BOOLEAN NOT NULL DEFAULT FALSE;
|
||||
OPTIMIZE TABLE DOMAIN_SECURITY_EVENTS;
|
@@ -0,0 +1,7 @@
|
||||
-- Add additional summary columns to DOMAIN_SECURITY_INFORMATION table
|
||||
-- to make it easier to get more information about the SSL certificate's validity
|
||||
|
||||
ALTER TABLE DOMAIN_SECURITY_INFORMATION ADD COLUMN SSL_CHAIN_VALID BOOLEAN DEFAULT NULL;
|
||||
ALTER TABLE DOMAIN_SECURITY_INFORMATION ADD COLUMN SSL_HOST_VALID BOOLEAN DEFAULT NULL;
|
||||
ALTER TABLE DOMAIN_SECURITY_INFORMATION ADD COLUMN SSL_DATE_VALID BOOLEAN DEFAULT NULL;
|
||||
OPTIMIZE TABLE DOMAIN_SECURITY_INFORMATION;
|
@@ -0,0 +1,5 @@
|
||||
-- Add a CHANGE_SCHEMA summary column to the DOMAIN_SECURITY_EVENTS table
|
||||
-- to make it easier to make sense of protocol (HTTP/HTTPS) changes
|
||||
|
||||
ALTER TABLE DOMAIN_SECURITY_EVENTS ADD COLUMN CHANGE_SCHEMA ENUM('NONE', 'HTTP_TO_HTTPS', 'HTTPS_TO_HTTP', 'UNKNOWN') NOT NULL DEFAULT 'UNKNOWN';
|
||||
OPTIMIZE TABLE DOMAIN_SECURITY_EVENTS;
|
@@ -0,0 +1,12 @@
|
||||
-- Table holding domains to be processed by the NDP in order to figure out whether to add them to
|
||||
-- be crawled.
|
||||
|
||||
CREATE TABLE IF NOT EXISTS NDP_NEW_DOMAINS(
|
||||
DOMAIN_ID INT NOT NULL PRIMARY KEY,
|
||||
STATE ENUM ('NEW', 'ACCEPTED', 'REJECTED') NOT NULL DEFAULT 'NEW',
|
||||
PRIORITY INT NOT NULL DEFAULT 0,
|
||||
TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
|
||||
CHECK_COUNT INT NOT NULL DEFAULT 0
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS NDP_NEW_DOMAINS__STATE_PRIORITY ON NDP_NEW_DOMAINS (STATE, PRIORITY DESC);
|
@@ -0,0 +1,3 @@
|
||||
-- Migration script to add AUTO_ASSIGN_DOMAINS column to NODE_CONFIGURATION table
|
||||
|
||||
ALTER TABLE NODE_CONFIGURATION ADD COLUMN AUTO_ASSIGN_DOMAINS BOOLEAN NOT NULL DEFAULT TRUE;
|
@@ -6,7 +6,6 @@ import com.google.inject.name.Named;
|
||||
import gnu.trove.list.TLongList;
|
||||
import nu.marginalia.linkdb.model.DocdbUrlDetail;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@@ -14,7 +13,6 @@ import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.sql.Connection;
|
||||
import java.sql.DriverManager;
|
||||
@@ -104,7 +102,7 @@ public class DocumentDbReader {
|
||||
}
|
||||
|
||||
try (var stmt = connection.prepareStatement("""
|
||||
SELECT ID, URL, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR
|
||||
SELECT ID, URL, TITLE, DESCRIPTION, LANGUAGE, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR
|
||||
FROM DOCUMENT WHERE ID = ?
|
||||
""")) {
|
||||
for (int i = 0; i < ids.size(); i++) {
|
||||
@@ -118,6 +116,7 @@ public class DocumentDbReader {
|
||||
url,
|
||||
rs.getString("TITLE"),
|
||||
rs.getString("DESCRIPTION"),
|
||||
rs.getString("LANGUAGE"),
|
||||
rs.getDouble("QUALITY"),
|
||||
rs.getString("FORMAT"),
|
||||
rs.getInt("FEATURES"),
|
||||
|
@@ -41,8 +41,8 @@ public class DocumentDbWriter {
|
||||
public void add(List<DocdbUrlDetail> docdbUrlDetail) throws SQLException {
|
||||
|
||||
try (var stmt = connection.prepareStatement("""
|
||||
INSERT OR IGNORE INTO DOCUMENT(ID, URL, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
INSERT OR IGNORE INTO DOCUMENT(ID, URL, TITLE, DESCRIPTION, LANGUAGE, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
""")) {
|
||||
|
||||
int i = 0;
|
||||
@@ -54,15 +54,16 @@ public class DocumentDbWriter {
|
||||
|
||||
stmt.setString(3, document.title());
|
||||
stmt.setString(4, document.description());
|
||||
stmt.setInt(5, document.wordsTotal());
|
||||
stmt.setString(6, document.format());
|
||||
stmt.setInt(7, document.features());
|
||||
stmt.setLong(8, document.dataHash());
|
||||
stmt.setDouble(9, document.urlQuality());
|
||||
stmt.setString(5, document.language());
|
||||
stmt.setInt(6, document.wordsTotal());
|
||||
stmt.setString(7, document.format());
|
||||
stmt.setInt(8, document.features());
|
||||
stmt.setLong(9, document.dataHash());
|
||||
stmt.setDouble(10, document.urlQuality());
|
||||
if (document.pubYear() == null) {
|
||||
stmt.setInt(10, 0);
|
||||
stmt.setInt(11, 0);
|
||||
} else {
|
||||
stmt.setInt(10, document.pubYear());
|
||||
stmt.setInt(11, document.pubYear());
|
||||
}
|
||||
|
||||
stmt.addBatch();
|
||||
|
@@ -6,6 +6,7 @@ public record DocdbUrlDetail(long urlId,
|
||||
EdgeUrl url,
|
||||
String title,
|
||||
String description,
|
||||
String language,
|
||||
double urlQuality,
|
||||
String format,
|
||||
int features,
|
||||
|
@@ -6,6 +6,7 @@ CREATE TABLE DOCUMENT (
|
||||
STATE INT,
|
||||
TITLE TEXT NOT NULL,
|
||||
DESCRIPTION TEXT NOT NULL,
|
||||
LANGUAGE TEXT NOT NULL,
|
||||
|
||||
WORDS_TOTAL INTEGER NOT NULL,
|
||||
FORMAT TEXT NOT NULL,
|
||||
|
@@ -23,6 +23,7 @@ public class DocumentDbWriterTest {
|
||||
new nu.marginalia.model.EdgeUrl("http", new EdgeDomain("example.com"), null, "/", null),
|
||||
"Test",
|
||||
"This is a test",
|
||||
"en",
|
||||
-4.,
|
||||
"XHTML",
|
||||
5,
|
||||
|
@@ -1,13 +1,12 @@
|
||||
package nu.marginalia.model;
|
||||
|
||||
import javax.annotation.Nonnull;
|
||||
import java.io.Serializable;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class EdgeDomain implements Serializable {
|
||||
public class EdgeDomain {
|
||||
|
||||
@Nonnull
|
||||
public final String subDomain;
|
||||
|
@@ -4,13 +4,12 @@ import nu.marginalia.util.QueryParams;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.io.Serializable;
|
||||
import java.net.*;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
public class EdgeUrl implements Serializable {
|
||||
public class EdgeUrl {
|
||||
public final String proto;
|
||||
public final EdgeDomain domain;
|
||||
public final Integer port;
|
||||
|
@@ -5,13 +5,15 @@ import java.util.Collection;
|
||||
public enum HtmlFeature {
|
||||
// Note, the first 32 of these features are bit encoded in the database
|
||||
// so be sure to keep anything that's potentially important toward the top
|
||||
// of the list
|
||||
// of the list; but adding new values will shift the encoded values and break
|
||||
// binary compatibility! Scroll down for a marker where you should add new values
|
||||
// if they need to be accessible from IndexResultScoreCalculator!
|
||||
|
||||
MEDIA( "special:media"),
|
||||
JS("special:scripts"),
|
||||
AFFILIATE_LINK( "special:affiliate"),
|
||||
TRACKING("special:tracking"),
|
||||
TRACKING_ADTECH("special:ads"), // We'll call this ads for now
|
||||
TRACKING_ADTECH("special:adtech"),
|
||||
|
||||
KEBAB_CASE_URL("special:kcurl"), // https://www.example.com/urls-that-look-like-this/
|
||||
LONG_URL("special:longurl"),
|
||||
@@ -30,6 +32,15 @@ public enum HtmlFeature {
|
||||
|
||||
PDF("format:pdf"),
|
||||
|
||||
POPOVER("special:popover"),
|
||||
CONSENT("special:consent"),
|
||||
SHORT_DOCUMENT("special:shorty"),
|
||||
THIRD_PARTY_REQUESTS("special:3pr"),
|
||||
|
||||
// Here! It is generally safe to add additional values here without
|
||||
// disrupting the encoded values used by the DocumentValuator
|
||||
// class in the index!
|
||||
|
||||
/** For fingerprinting and ranking */
|
||||
OPENGRAPH("special:opengraph"),
|
||||
OPENGRAPH_IMAGE("special:opengraph:image"),
|
||||
@@ -67,6 +78,7 @@ public enum HtmlFeature {
|
||||
|
||||
S3_FEATURE("special:s3"),
|
||||
|
||||
MISSING_DOM_SAMPLE("special:nosample"),
|
||||
UNKNOWN("special:uncategorized");
|
||||
|
||||
|
||||
@@ -83,16 +95,24 @@ public enum HtmlFeature {
|
||||
/**
 * Packs a collection of features into a single int bitmask.
 * Each feature sets the bit whose position equals its enum ordinal.
 * Features with an ordinal of 32 or more have no bit in an int and are skipped.
 *
 * @param featuresAll the features to encode
 * @return the bitwise OR of the bits of all features with ordinal below 32
 */
public static int encode(Collection<HtmlFeature> featuresAll) {
|
||||
int ret = 0;
|
||||
for (var feature : featuresAll) {
|
||||
// Only ordinals 0..31 map to a bit position in the int mask
if (feature.ordinal() >= 32) continue;
|
||||
|
||||
ret |= (1 << (feature.ordinal()));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
public static boolean hasFeature(int value, HtmlFeature feature) {
|
||||
return (value & (1<< feature.ordinal())) != 0;
|
||||
int ord = feature.ordinal();
|
||||
if (ord >= 32) return false;
|
||||
|
||||
return (value & (1<<ord)) != 0;
|
||||
}
|
||||
|
||||
public int getFeatureBit() {
|
||||
return (1<< ordinal());
|
||||
int ord = ordinal();
|
||||
if (ord >= 32) return 0;
|
||||
|
||||
return (1<<ord);
|
||||
}
|
||||
}
|
||||
|
@@ -2,7 +2,6 @@ package nu.marginalia.model.idx;
|
||||
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.EnumSet;
|
||||
import java.util.Set;
|
||||
|
||||
@@ -28,7 +27,6 @@ public record DocumentMetadata(int avgSentLength,
|
||||
int sets,
|
||||
int quality,
|
||||
byte flags)
|
||||
implements Serializable
|
||||
{
|
||||
|
||||
public String toString() {
|
||||
|
@@ -7,7 +7,6 @@ public enum ServiceId {
|
||||
Search("search-service"),
|
||||
Index("index-service"),
|
||||
Query("query-service"),
|
||||
Executor("executor-service"),
|
||||
|
||||
Control("control-service"),
|
||||
|
||||
|
@@ -13,6 +13,7 @@ import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import nu.marginalia.util.NamedExecutorFactory;
|
||||
|
||||
import java.util.concurrent.Executor;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.function.Function;
|
||||
|
||||
@Singleton
|
||||
@@ -20,10 +21,15 @@ public class GrpcChannelPoolFactory {
|
||||
|
||||
private final NodeConfigurationWatcher nodeConfigurationWatcher;
|
||||
private final ServiceRegistryIf serviceRegistryIf;
|
||||
private static final Executor executor = NamedExecutorFactory.createFixed("gRPC-Channel-Pool",
|
||||
Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
|
||||
private static final Executor offloadExecutor = NamedExecutorFactory.createFixed("gRPC-Offload-Pool",
|
||||
Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
|
||||
|
||||
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
|
||||
|
||||
private static final Executor executor = useLoom
|
||||
? Executors.newVirtualThreadPerTaskExecutor()
|
||||
: NamedExecutorFactory.createFixed("gRPC-Channel-Pool", Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
|
||||
private static final Executor offloadExecutor = useLoom
|
||||
? Executors.newVirtualThreadPerTaskExecutor()
|
||||
: NamedExecutorFactory.createFixed("gRPC-Offload-Pool", Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
|
||||
|
||||
@Inject
|
||||
public GrpcChannelPoolFactory(NodeConfigurationWatcher nodeConfigurationWatcher,
|
||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.service.client;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import io.grpc.ManagedChannel;
|
||||
import io.grpc.StatusRuntimeException;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.discovery.monitor.ServiceChangeMonitor;
|
||||
import nu.marginalia.service.discovery.property.PartitionTraits;
|
||||
@@ -206,6 +207,11 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
|
||||
}
|
||||
|
||||
for (var e : exceptions) {
|
||||
if (e instanceof StatusRuntimeException se) {
|
||||
throw se; // Re-throw SRE as-is
|
||||
}
|
||||
|
||||
// If there are other exceptions, log them
|
||||
logger.error(grpcMarker, "Failed to call service {}", serviceKey, e);
|
||||
}
|
||||
|
||||
|
@@ -5,12 +5,10 @@ import nu.marginalia.service.discovery.monitor.ServiceChangeMonitor;
|
||||
import nu.marginalia.service.discovery.monitor.ServiceMonitorIf;
|
||||
import nu.marginalia.service.discovery.property.ServiceEndpoint;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import org.apache.curator.framework.recipes.locks.InterProcessSemaphoreV2;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
import static nu.marginalia.service.discovery.property.ServiceEndpoint.InstanceAddress;
|
||||
|
||||
@@ -66,6 +64,6 @@ public interface ServiceRegistryIf {
|
||||
|
||||
void registerProcess(String processName, int nodeId);
|
||||
void deregisterProcess(String processName, int nodeId);
|
||||
void watchProcess(String processName, int nodeId, Consumer<Boolean> callback) throws Exception;
|
||||
void watchProcessAnyNode(String processName, Collection<Integer> nodes, BiConsumer<Boolean, Integer> callback) throws Exception;
|
||||
|
||||
InterProcessSemaphoreV2 getSemaphore(String name, int permits) throws Exception;
|
||||
}
|
||||
|
@@ -6,6 +6,7 @@ import nu.marginalia.service.discovery.monitor.ServiceMonitorIf;
|
||||
import nu.marginalia.service.discovery.property.ServiceEndpoint;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import org.apache.curator.framework.CuratorFramework;
|
||||
import org.apache.curator.framework.recipes.locks.InterProcessSemaphoreV2;
|
||||
import org.apache.curator.utils.ZKPaths;
|
||||
import org.apache.zookeeper.CreateMode;
|
||||
import org.apache.zookeeper.Watcher;
|
||||
@@ -13,10 +14,11 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
import static nu.marginalia.service.discovery.property.ServiceEndpoint.InstanceAddress;
|
||||
|
||||
@@ -283,60 +285,12 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void watchProcess(String processName, int nodeId, Consumer<Boolean> callback) throws Exception {
|
||||
String path = "/process-locks/" + processName + "/" + nodeId;
|
||||
public InterProcessSemaphoreV2 getSemaphore(String name, int permits) {
|
||||
if (stopped)
|
||||
throw new IllegalStateException("Service registry is stopped, cannot get semaphore " + name);
|
||||
|
||||
// first check if the path exists and call the callback accordingly
|
||||
|
||||
if (curatorFramework.checkExists().forPath(path) != null) {
|
||||
callback.accept(true);
|
||||
}
|
||||
else {
|
||||
callback.accept(false);
|
||||
}
|
||||
|
||||
curatorFramework.watchers().add()
|
||||
.usingWatcher((Watcher) change -> {
|
||||
Watcher.Event.EventType type = change.getType();
|
||||
|
||||
if (type == Watcher.Event.EventType.NodeCreated) {
|
||||
callback.accept(true);
|
||||
}
|
||||
if (type == Watcher.Event.EventType.NodeDeleted) {
|
||||
callback.accept(false);
|
||||
}
|
||||
})
|
||||
.forPath(path);
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void watchProcessAnyNode(String processName, Collection<Integer> nodes, BiConsumer<Boolean, Integer> callback) throws Exception {
|
||||
|
||||
for (int node : nodes) {
|
||||
String path = "/process-locks/" + processName + "/" + node;
|
||||
|
||||
// first check if the path exists and call the callback accordingly
|
||||
if (curatorFramework.checkExists().forPath(path) != null) {
|
||||
callback.accept(true, node);
|
||||
}
|
||||
else {
|
||||
callback.accept(false, node);
|
||||
}
|
||||
|
||||
curatorFramework.watchers().add()
|
||||
.usingWatcher((Watcher) change -> {
|
||||
Watcher.Event.EventType type = change.getType();
|
||||
|
||||
if (type == Watcher.Event.EventType.NodeCreated) {
|
||||
callback.accept(true, node);
|
||||
}
|
||||
if (type == Watcher.Event.EventType.NodeDeleted) {
|
||||
callback.accept(false, node);
|
||||
}
|
||||
})
|
||||
.forPath(path);
|
||||
}
|
||||
String path = "/semaphores/" + name;
|
||||
return new InterProcessSemaphoreV2(curatorFramework, path, permits);
|
||||
}
|
||||
|
||||
/* Exposed for tests */
|
||||
|
@@ -1,9 +1,9 @@
|
||||
package nu.marginalia.service.server;
|
||||
|
||||
import io.grpc.Server;
|
||||
import io.grpc.netty.shaded.io.grpc.netty.NettyServerBuilder;
|
||||
import io.grpc.netty.shaded.io.netty.channel.nio.NioEventLoopGroup;
|
||||
import io.grpc.netty.shaded.io.netty.channel.socket.nio.NioServerSocketChannel;
|
||||
import io.grpc.netty.NettyServerBuilder;
|
||||
import io.netty.channel.nio.NioEventLoopGroup;
|
||||
import io.netty.channel.socket.nio.NioServerSocketChannel;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
@@ -13,9 +13,14 @@ import nu.marginalia.util.NamedExecutorFactory;
|
||||
import java.io.IOException;
|
||||
import java.net.InetSocketAddress;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
|
||||
public class GrpcServer {
|
||||
private final Server server;
|
||||
|
||||
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
|
||||
|
||||
public GrpcServer(ServiceConfiguration config,
|
||||
ServiceRegistryIf serviceRegistry,
|
||||
ServicePartition partition,
|
||||
@@ -26,13 +31,19 @@ public class GrpcServer {
|
||||
int nThreads = Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 16);
|
||||
|
||||
// Start the gRPC server
|
||||
|
||||
ExecutorService workExecutor = useLoom ?
|
||||
Executors.newVirtualThreadPerTaskExecutor() :
|
||||
NamedExecutorFactory.createFixed("nettyExecutor", nThreads);
|
||||
|
||||
var grpcServerBuilder = NettyServerBuilder.forAddress(new InetSocketAddress(config.bindAddress(), port))
|
||||
.executor(NamedExecutorFactory.createFixed("nettyExecutor", nThreads))
|
||||
.executor(workExecutor)
|
||||
.workerEventLoopGroup(new NioEventLoopGroup(nThreads, NamedExecutorFactory.createFixed("Worker-ELG", nThreads)))
|
||||
.bossEventLoopGroup(new NioEventLoopGroup(nThreads, NamedExecutorFactory.createFixed("Boss-ELG", nThreads)))
|
||||
.channelType(NioServerSocketChannel.class);
|
||||
|
||||
for (var grpcService : grpcServices) {
|
||||
|
||||
if (!grpcService.shouldRegisterService()) {
|
||||
continue;
|
||||
}
|
||||
|
@@ -125,8 +125,7 @@ public class JoobyService {
|
||||
// Set a cap on the number of worker threads, as Jooby's default value does not seem to consider
|
||||
// multi-tenant servers with high thread counts, and spins up an exorbitant number of threads in that
|
||||
// scenario
|
||||
options.setWorkerThreads(Math.min(128, options.getWorkerThreads()));
|
||||
|
||||
options.setWorkerThreads(Math.min(16, options.getWorkerThreads()));
|
||||
|
||||
jooby.setServerOptions(options);
|
||||
|
||||
|
@@ -66,7 +66,7 @@ public class NodeStatusWatcher {
|
||||
fileStorageService.createStorageBase("Crawl Data", Path.of("/storage"), nodeId, FileStorageBaseType.STORAGE);
|
||||
fileStorageService.createStorageBase("Work Area", Path.of("/work"), nodeId, FileStorageBaseType.WORK);
|
||||
|
||||
persistence.sendNewMessage("executor-service:"+nodeId,
|
||||
persistence.sendNewMessage("index-service:"+nodeId,
|
||||
null,
|
||||
null,
|
||||
"FIRST-BOOT",
|
||||
|
@@ -9,6 +9,7 @@ import nu.marginalia.executor.storage.FileStorageFile;
|
||||
import nu.marginalia.executor.upload.UploadDirContents;
|
||||
import nu.marginalia.executor.upload.UploadDirItem;
|
||||
import nu.marginalia.functions.execution.api.*;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.service.ServiceId;
|
||||
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
|
||||
@@ -25,27 +26,37 @@ import java.net.URISyntaxException;
|
||||
import java.net.URL;
|
||||
import java.net.URLEncoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.time.Duration;
|
||||
import java.util.List;
|
||||
|
||||
import static nu.marginalia.functions.execution.api.ExecutorApiGrpc.ExecutorApiBlockingStub;
|
||||
|
||||
@Singleton
|
||||
public class ExecutorClient {
|
||||
private final MqPersistence persistence;
|
||||
private final GrpcMultiNodeChannelPool<ExecutorApiBlockingStub> channelPool;
|
||||
private static final Logger logger = LoggerFactory.getLogger(ExecutorClient.class);
|
||||
private final ServiceRegistryIf registry;
|
||||
|
||||
@Inject
|
||||
public ExecutorClient(ServiceRegistryIf registry,
|
||||
MqPersistence persistence,
|
||||
GrpcChannelPoolFactory grpcChannelPoolFactory)
|
||||
{
|
||||
this.registry = registry;
|
||||
this.persistence = persistence;
|
||||
this.channelPool = grpcChannelPoolFactory
|
||||
.createMulti(
|
||||
ServiceKey.forGrpcApi(ExecutorApiGrpc.class, ServicePartition.multi()),
|
||||
ExecutorApiGrpc::newBlockingStub);
|
||||
}
|
||||
|
||||
/**
 * Sends a new message through {@code persistence} addressed to
 * {@code "task-tracking[<node>]"} and returns the value that
 * {@code sendNewMessage} yields (callers in this class use it as a message id).
 *
 * @param task passed through to {@code sendNewMessage} as its fourth argument
 * @param node node number embedded in the recipient name
 * @param ttl  passed through to {@code sendNewMessage} as its last argument
 * @throws Exception propagated from the call into {@code persistence}
 */
// NOTE(review): the second argument is the literal "export-client" although this
// class is ExecutorClient — confirm this is the intended value.
private long createTrackingTokenMsg(String task, int node, Duration ttl) throws Exception {
|
||||
return persistence.sendNewMessage("task-tracking[" + node + "]", "export-client", null, task, "", ttl);
|
||||
}
|
||||
|
||||
|
||||
|
||||
public void startFsm(int node, String actorName) {
|
||||
channelPool.call(ExecutorApiBlockingStub::startFsm)
|
||||
.forNode(node)
|
||||
@@ -96,6 +107,16 @@ public class ExecutorClient {
|
||||
.build());
|
||||
}
|
||||
|
||||
/**
 * Requests an NSFW filter update from the executor API on node 1.
 * A tracking message for the task "nsfw-filters" is created first (with a
 * six hour duration argument), and its id is sent along in the request.
 *
 * @return the id of the tracking message that was created
 * @throws Exception if creating the tracking message fails
 */
public long updateNsfwFilters() throws Exception {
|
||||
long msgId = createTrackingTokenMsg("nsfw-filters", 1, Duration.ofHours(6));
|
||||
|
||||
// The node is hard-coded to 1 both for the tracking message and for the call
channelPool.call(ExecutorApiBlockingStub::updateNsfwFilters)
|
||||
.forNode(1)
|
||||
.run(RpcUpdateNsfwFilters.newBuilder().setMsgId(msgId).build());
|
||||
|
||||
return msgId;
|
||||
}
|
||||
|
||||
public ActorRunStates getActorStates(int node) {
|
||||
try {
|
||||
var rs = channelPool.call(ExecutorApiBlockingStub::getActorStates)
|
||||
@@ -168,7 +189,7 @@ public class ExecutorClient {
|
||||
String uriPath = "/transfer/file/" + fileStorage.id();
|
||||
String uriQuery = "path=" + URLEncoder.encode(path, StandardCharsets.UTF_8);
|
||||
|
||||
var endpoints = registry.getEndpoints(ServiceKey.forRest(ServiceId.Executor, fileStorage.node()));
|
||||
var endpoints = registry.getEndpoints(ServiceKey.forRest(ServiceId.Index, fileStorage.node()));
|
||||
if (endpoints.isEmpty()) {
|
||||
throw new RuntimeException("No endpoints for node " + fileStorage.node());
|
||||
}
|
||||
|
@@ -59,6 +59,15 @@ public class ExecutorExportClient {
|
||||
.build());
|
||||
}
|
||||
|
||||
/**
 * Requests a DOM sample data export from the executor export API on node 1.
 * A tracking message for the task "dom-sample" is created first (with a
 * six hour duration argument), and its id is sent along in the request.
 * The tracking message id is not returned to the caller.
 *
 * @throws Exception if creating the tracking message fails
 */
public void exportDomSampleData() throws Exception {
|
||||
long msgId = createTrackingTokenMsg("dom-sample", 1, Duration.ofHours(6));
|
||||
// The node is hard-coded to 1 both for the tracking message and for the call
channelPool.call(ExecutorExportApiBlockingStub::exportDomSampleData)
|
||||
.forNode(1)
|
||||
.run(
|
||||
RpcExportDomSampleData.newBuilder().setMsgId(msgId).build()
|
||||
);
|
||||
}
|
||||
|
||||
public long exportRssFeeds(int node, FileStorageId fid) throws Exception {
|
||||
long msgId = createTrackingTokenMsg("rss", node, Duration.ofHours(6));
|
||||
channelPool.call(ExecutorExportApiBlockingStub::exportRssFeeds)
|
||||
|
@@ -18,6 +18,8 @@ service ExecutorApi {
|
||||
rpc calculateAdjacencies(Empty) returns (Empty) {}
|
||||
rpc restoreBackup(RpcFileStorageId) returns (Empty) {}
|
||||
|
||||
rpc updateNsfwFilters(RpcUpdateNsfwFilters) returns (Empty) {}
|
||||
|
||||
rpc restartExecutorService(Empty) returns (Empty) {}
|
||||
}
|
||||
|
||||
@@ -42,6 +44,7 @@ service ExecutorExportApi {
|
||||
rpc exportAtags(RpcExportRequest) returns (Empty) {}
|
||||
rpc exportSegmentationModel(RpcExportSegmentationModel) returns (Empty) {}
|
||||
rpc exportSampleData(RpcExportSampleData) returns (Empty) {}
|
||||
rpc exportDomSampleData(RpcExportDomSampleData) returns (Empty) {}
|
||||
rpc exportRssFeeds(RpcExportRequest) returns (Empty) {}
|
||||
rpc exportTermFrequencies(RpcExportRequest) returns (Empty) {}
|
||||
rpc exportData(Empty) returns (Empty) {}
|
||||
@@ -66,6 +69,9 @@ message RpcExportRequest {
|
||||
int64 fileStorageId = 1;
|
||||
int64 msgId = 2;
|
||||
}
|
||||
// Request message of the ExecutorApi.updateNsfwFilters rpc.
// msgId carries a message id supplied by the caller.
message RpcUpdateNsfwFilters {
|
||||
int64 msgId = 1;
|
||||
}
|
||||
message RpcFileStorageIdWithDomainName {
|
||||
int64 fileStorageId = 1;
|
||||
string targetDomainName = 2;
|
||||
@@ -102,6 +108,11 @@ message RpcExportSampleData {
|
||||
string name = 3;
|
||||
string ctFilter = 4;
|
||||
}
|
||||
|
||||
// Request message of the ExecutorExportApi.exportDomSampleData rpc.
// msgId carries a message id supplied by the caller.
message RpcExportDomSampleData {
|
||||
int64 msgId = 1;
|
||||
}
|
||||
|
||||
message RpcDownloadSampleData {
|
||||
string sampleSet = 1;
|
||||
}
|
||||
|
@@ -20,8 +20,8 @@ dependencies {
|
||||
implementation project(':code:processes:live-crawling-process')
|
||||
implementation project(':code:processes:loading-process')
|
||||
implementation project(':code:processes:ping-process')
|
||||
implementation project(':code:processes:new-domain-process')
|
||||
implementation project(':code:processes:converting-process')
|
||||
implementation project(':code:processes:index-constructor-process')
|
||||
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:model')
|
||||
@@ -33,7 +33,7 @@ dependencies {
|
||||
implementation project(':third-party:commons-codec')
|
||||
|
||||
implementation project(':code:libraries:message-queue')
|
||||
implementation project(':code:libraries:term-frequency-dict')
|
||||
implementation project(':code:functions:language-processing')
|
||||
|
||||
implementation project(':code:functions:link-graph:api')
|
||||
implementation project(':code:functions:live-capture:api')
|
||||
@@ -41,7 +41,6 @@ dependencies {
|
||||
implementation project(':code:functions:nsfw-domain-filter')
|
||||
implementation project(':code:execution:api')
|
||||
|
||||
implementation project(':code:processes:crawling-process:model')
|
||||
implementation project(':code:processes:crawling-process:model')
|
||||
implementation project(':code:processes:crawling-process:ft-link-parser')
|
||||
implementation project(':code:index:index-journal')
|
||||
|
@@ -2,10 +2,11 @@ package nu.marginalia.actor;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.functions.execution.api.*;
|
||||
import nu.marginalia.functions.execution.api.RpcFsmName;
|
||||
import nu.marginalia.functions.execution.api.RpcProcessId;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@@ -14,18 +15,18 @@ import spark.Spark;
|
||||
@Singleton
|
||||
public class ActorApi {
|
||||
private final ExecutorActorControlService actors;
|
||||
private final ProcessService processService;
|
||||
private final ProcessSpawnerService processSpawnerService;
|
||||
private final MqPersistence mqPersistence;
|
||||
private final ServiceConfiguration serviceConfiguration;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
@Inject
|
||||
public ActorApi(ExecutorActorControlService actors,
|
||||
ProcessService processService,
|
||||
ProcessSpawnerService processSpawnerService,
|
||||
MqPersistence mqPersistence,
|
||||
ServiceConfiguration serviceConfiguration)
|
||||
{
|
||||
this.actors = actors;
|
||||
this.processService = processService;
|
||||
this.processSpawnerService = processSpawnerService;
|
||||
this.mqPersistence = mqPersistence;
|
||||
this.serviceConfiguration = serviceConfiguration;
|
||||
}
|
||||
@@ -43,7 +44,7 @@ public class ActorApi {
|
||||
}
|
||||
|
||||
public Object stopProcess(RpcProcessId processId) {
|
||||
ProcessService.ProcessId id = ProcessService.translateExternalIdBase(processId.getProcessId());
|
||||
ProcessSpawnerService.ProcessId id = ProcessSpawnerService.translateExternalIdBase(processId.getProcessId());
|
||||
|
||||
try {
|
||||
String inbox = id.name().toLowerCase() + ":" + serviceConfiguration.node();
|
||||
@@ -60,7 +61,7 @@ public class ActorApi {
|
||||
}
|
||||
|
||||
}
|
||||
processService.kill(id);
|
||||
processSpawnerService.kill(id);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Failed to stop process {}", id, ex);
|
||||
|
@@ -6,14 +6,15 @@ import java.util.Set;
|
||||
|
||||
public enum ExecutorActor {
|
||||
PREC_EXPORT_ALL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
SYNC_NSFW_LISTS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
UPDATE_NSFW_LISTS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD, NodeProfile.REALTIME),
|
||||
|
||||
CRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
RECRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
RECRAWL_SINGLE_DOMAIN(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
PROC_CRAWLER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
PROC_PING_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD),
|
||||
PROC_PING_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.REALTIME),
|
||||
PROC_EXPORT_TASKS_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
PROC_NDP_SPAWNER(NodeProfile.MIXED, NodeProfile.REALTIME),
|
||||
ADJACENCY_CALCULATION(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
EXPORT_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
EXPORT_SEGMENTATION_MODEL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
@@ -21,6 +22,7 @@ public enum ExecutorActor {
|
||||
EXPORT_TERM_FREQUENCIES(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
EXPORT_FEEDS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
EXPORT_SAMPLE_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
EXPORT_DOM_SAMPLE_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
DOWNLOAD_SAMPLE(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
MIGRATE_CRAWL_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
|
||||
|
@@ -49,6 +49,7 @@ public class ExecutorActorControlService {
|
||||
RecrawlSingleDomainActor recrawlSingleDomainActor,
|
||||
RestoreBackupActor restoreBackupActor,
|
||||
ConverterMonitorActor converterMonitorFSM,
|
||||
NdpMonitorActor ndpMonitorActor,
|
||||
PingMonitorActor pingMonitorActor,
|
||||
CrawlerMonitorActor crawlerMonitorActor,
|
||||
LiveCrawlerMonitorActor liveCrawlerMonitorActor,
|
||||
@@ -58,6 +59,7 @@ public class ExecutorActorControlService {
|
||||
IndexConstructorMonitorActor indexConstructorMonitorActor,
|
||||
TriggerAdjacencyCalculationActor triggerAdjacencyCalculationActor,
|
||||
ExportDataActor exportDataActor,
|
||||
ExportDomSampleDataActor exportDomSampleDataActor,
|
||||
ExportAtagsActor exportAtagsActor,
|
||||
ExportFeedsActor exportFeedsActor,
|
||||
ExportSampleDataActor exportSampleDataActor,
|
||||
@@ -93,7 +95,7 @@ public class ExecutorActorControlService {
|
||||
register(ExecutorActor.PROC_PING_SPAWNER, pingMonitorActor);
|
||||
register(ExecutorActor.PROC_LIVE_CRAWL_SPAWNER, liveCrawlerMonitorActor);
|
||||
register(ExecutorActor.PROC_EXPORT_TASKS_SPAWNER, exportTasksMonitorActor);
|
||||
|
||||
register(ExecutorActor.PROC_NDP_SPAWNER, ndpMonitorActor);
|
||||
register(ExecutorActor.MONITOR_PROCESS_LIVENESS, processMonitorFSM);
|
||||
register(ExecutorActor.MONITOR_FILE_STORAGE, fileStorageMonitorActor);
|
||||
|
||||
@@ -102,6 +104,7 @@ public class ExecutorActorControlService {
|
||||
register(ExecutorActor.EXPORT_DATA, exportDataActor);
|
||||
register(ExecutorActor.EXPORT_ATAGS, exportAtagsActor);
|
||||
register(ExecutorActor.EXPORT_FEEDS, exportFeedsActor);
|
||||
register(ExecutorActor.EXPORT_DOM_SAMPLE_DATA, exportDomSampleDataActor);
|
||||
register(ExecutorActor.EXPORT_SAMPLE_DATA, exportSampleDataActor);
|
||||
register(ExecutorActor.EXPORT_TERM_FREQUENCIES, exportTermFrequenciesActor);
|
||||
register(ExecutorActor.EXPORT_SEGMENTATION_MODEL, exportSegmentationModelActor);
|
||||
@@ -112,7 +115,7 @@ public class ExecutorActorControlService {
|
||||
register(ExecutorActor.UPDATE_RSS, updateRssActor);
|
||||
|
||||
register(ExecutorActor.MIGRATE_CRAWL_DATA, migrateCrawlDataActor);
|
||||
register(ExecutorActor.SYNC_NSFW_LISTS, updateNsfwFiltersActor);
|
||||
register(ExecutorActor.UPDATE_NSFW_LISTS, updateNsfwFiltersActor);
|
||||
|
||||
if (serviceConfiguration.node() == 1) {
|
||||
register(ExecutorActor.PREC_EXPORT_ALL, exportAllPrecessionActor);
|
||||
|
@@ -4,11 +4,14 @@ import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.*;
|
||||
import nu.marginalia.mq.persistence.MqMessageHandlerRegistry;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.actor.state.Resume;
|
||||
import nu.marginalia.actor.state.Terminal;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.persistence.MqMessageHandlerRegistry;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@@ -24,13 +27,13 @@ import java.util.concurrent.atomic.AtomicBoolean;
|
||||
public class AbstractProcessSpawnerActor extends RecordActorPrototype {
|
||||
|
||||
private final MqPersistence persistence;
|
||||
private final ProcessService processService;
|
||||
private final ProcessSpawnerService processSpawnerService;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
public static final int MAX_ATTEMPTS = 3;
|
||||
private final String inboxName;
|
||||
private final ProcessService.ProcessId processId;
|
||||
private final ProcessSpawnerService.ProcessId processId;
|
||||
private final ExecutorService executorService = Executors.newSingleThreadExecutor();
|
||||
private final int node;
|
||||
|
||||
@@ -50,7 +53,7 @@ public class AbstractProcessSpawnerActor extends RecordActorPrototype {
|
||||
for (;;) {
|
||||
var messages = persistence.eavesdrop(inboxName, 1);
|
||||
|
||||
if (messages.isEmpty() && !processService.isRunning(processId)) {
|
||||
if (messages.isEmpty() && !processSpawnerService.isRunning(processId)) {
|
||||
synchronized (processId) {
|
||||
processId.wait(5000);
|
||||
}
|
||||
@@ -92,7 +95,7 @@ public class AbstractProcessSpawnerActor extends RecordActorPrototype {
|
||||
catch (InterruptedException ex) {
|
||||
// We get this exception when the process is cancelled by the user
|
||||
|
||||
processService.kill(processId);
|
||||
processSpawnerService.kill(processId);
|
||||
setCurrentMessageToDead();
|
||||
|
||||
yield new Aborted();
|
||||
@@ -112,13 +115,13 @@ public class AbstractProcessSpawnerActor extends RecordActorPrototype {
|
||||
public AbstractProcessSpawnerActor(Gson gson,
|
||||
ServiceConfiguration configuration,
|
||||
MqPersistence persistence,
|
||||
ProcessService processService,
|
||||
ProcessSpawnerService processSpawnerService,
|
||||
String inboxName,
|
||||
ProcessService.ProcessId processId) {
|
||||
ProcessSpawnerService.ProcessId processId) {
|
||||
super(gson);
|
||||
this.node = configuration.node();
|
||||
this.persistence = persistence;
|
||||
this.processService = processService;
|
||||
this.processSpawnerService = processSpawnerService;
|
||||
this.inboxName = inboxName + ":" + node;
|
||||
this.processId = processId;
|
||||
}
|
||||
@@ -149,7 +152,7 @@ public class AbstractProcessSpawnerActor extends RecordActorPrototype {
|
||||
// Run this call in a separate thread so that this thread can be interrupted waiting for it
|
||||
executorService.submit(() -> {
|
||||
try {
|
||||
processService.trigger(processId);
|
||||
processSpawnerService.trigger(processId);
|
||||
} catch (Exception e) {
|
||||
logger.warn("Error in triggering process", e);
|
||||
error.set(true);
|
||||
|
@@ -4,9 +4,9 @@ import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
@Singleton
|
||||
@@ -17,13 +17,13 @@ public class ConverterMonitorActor extends AbstractProcessSpawnerActor {
|
||||
public ConverterMonitorActor(Gson gson,
|
||||
ServiceConfiguration configuration,
|
||||
MqPersistence persistence,
|
||||
ProcessService processService) {
|
||||
ProcessSpawnerService processSpawnerService) {
|
||||
super(gson,
|
||||
configuration,
|
||||
persistence,
|
||||
processService,
|
||||
processSpawnerService,
|
||||
ProcessInboxNames.CONVERTER_INBOX,
|
||||
ProcessService.ProcessId.CONVERTER);
|
||||
ProcessSpawnerService.ProcessId.CONVERTER);
|
||||
}
|
||||
|
||||
|
||||
|
@@ -4,9 +4,9 @@ import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
@Singleton
|
||||
@@ -16,13 +16,13 @@ public class CrawlerMonitorActor extends AbstractProcessSpawnerActor {
|
||||
public CrawlerMonitorActor(Gson gson,
|
||||
ServiceConfiguration configuration,
|
||||
MqPersistence persistence,
|
||||
ProcessService processService) {
|
||||
ProcessSpawnerService processSpawnerService) {
|
||||
super(gson,
|
||||
configuration,
|
||||
persistence,
|
||||
processService,
|
||||
processSpawnerService,
|
||||
ProcessInboxNames.CRAWLER_INBOX,
|
||||
ProcessService.ProcessId.CRAWLER);
|
||||
ProcessSpawnerService.ProcessId.CRAWLER);
|
||||
}
|
||||
|
||||
|
||||
|
@@ -6,7 +6,7 @@ import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
@Singleton
|
||||
@@ -16,13 +16,13 @@ public class ExportTaskMonitorActor extends AbstractProcessSpawnerActor {
|
||||
public ExportTaskMonitorActor(Gson gson,
|
||||
ServiceConfiguration configuration,
|
||||
MqPersistence persistence,
|
||||
ProcessService processService) {
|
||||
ProcessSpawnerService processSpawnerService) {
|
||||
super(gson,
|
||||
configuration,
|
||||
persistence,
|
||||
processService,
|
||||
processSpawnerService,
|
||||
ProcessInboxNames.EXPORT_TASK_INBOX,
|
||||
ProcessService.ProcessId.EXPORT_TASKS);
|
||||
ProcessSpawnerService.ProcessId.EXPORT_TASKS);
|
||||
}
|
||||
|
||||
|
||||
|
@@ -4,9 +4,9 @@ import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
@Singleton
|
||||
@@ -17,13 +17,13 @@ public class IndexConstructorMonitorActor extends AbstractProcessSpawnerActor {
|
||||
public IndexConstructorMonitorActor(Gson gson,
|
||||
ServiceConfiguration configuration,
|
||||
MqPersistence persistence,
|
||||
ProcessService processService) {
|
||||
ProcessSpawnerService processSpawnerService) {
|
||||
super(gson,
|
||||
configuration,
|
||||
persistence,
|
||||
processService,
|
||||
processSpawnerService,
|
||||
ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX,
|
||||
ProcessService.ProcessId.INDEX_CONSTRUCTOR);
|
||||
ProcessSpawnerService.ProcessId.INDEX_CONSTRUCTOR);
|
||||
}
|
||||
|
||||
|
||||
|
@@ -6,7 +6,7 @@ import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
@Singleton
|
||||
@@ -16,13 +16,13 @@ public class LiveCrawlerMonitorActor extends AbstractProcessSpawnerActor {
|
||||
public LiveCrawlerMonitorActor(Gson gson,
|
||||
ServiceConfiguration configuration,
|
||||
MqPersistence persistence,
|
||||
ProcessService processService) {
|
||||
ProcessSpawnerService processSpawnerService) {
|
||||
super(gson,
|
||||
configuration,
|
||||
persistence,
|
||||
processService,
|
||||
processSpawnerService,
|
||||
ProcessInboxNames.LIVE_CRAWLER_INBOX,
|
||||
ProcessService.ProcessId.LIVE_CRAWLER);
|
||||
ProcessSpawnerService.ProcessId.LIVE_CRAWLER);
|
||||
}
|
||||
|
||||
|
||||
|
@@ -4,9 +4,9 @@ import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
@Singleton
|
||||
@@ -17,13 +17,13 @@ public class LoaderMonitorActor extends AbstractProcessSpawnerActor {
|
||||
public LoaderMonitorActor(Gson gson,
|
||||
ServiceConfiguration configuration,
|
||||
MqPersistence persistence,
|
||||
ProcessService processService) {
|
||||
ProcessSpawnerService processSpawnerService) {
|
||||
|
||||
super(gson,
|
||||
configuration,
|
||||
persistence, processService,
|
||||
persistence, processSpawnerService,
|
||||
ProcessInboxNames.LOADER_INBOX,
|
||||
ProcessService.ProcessId.LOADER);
|
||||
ProcessSpawnerService.ProcessId.LOADER);
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -0,0 +1,29 @@
|
||||
package nu.marginalia.actor.proc;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
@Singleton
|
||||
public class NdpMonitorActor extends AbstractProcessSpawnerActor {
|
||||
|
||||
@Inject
|
||||
public NdpMonitorActor(Gson gson,
|
||||
ServiceConfiguration configuration,
|
||||
MqPersistence persistence,
|
||||
ProcessSpawnerService processSpawnerService) {
|
||||
super(gson,
|
||||
configuration,
|
||||
persistence,
|
||||
processSpawnerService,
|
||||
ProcessInboxNames.NDP_INBOX,
|
||||
ProcessSpawnerService.ProcessId.NDP);
|
||||
}
|
||||
|
||||
|
||||
}
|
@@ -3,24 +3,179 @@ package nu.marginalia.actor.proc;
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.actor.state.Resume;
|
||||
import nu.marginalia.actor.state.Terminal;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.persistence.MqMessageHandlerRegistry;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.mqapi.ping.PingRequest;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
|
||||
// Unlike other monitor actors, the ping monitor will not merely wait for a request
|
||||
// to be sent, but send one itself, hence we can't extend AbstractProcessSpawnerActor
|
||||
// but have to reimplement a lot of the same logic ourselves.
|
||||
@Singleton
|
||||
public class PingMonitorActor extends AbstractProcessSpawnerActor {
|
||||
public class PingMonitorActor extends RecordActorPrototype {
|
||||
|
||||
@Inject
|
||||
public PingMonitorActor(Gson gson, ServiceConfiguration configuration, MqPersistence persistence, ProcessService processService) {
|
||||
super(gson,
|
||||
configuration,
|
||||
persistence,
|
||||
processService,
|
||||
ProcessInboxNames.PING_INBOX,
|
||||
ProcessService.ProcessId.PING);
|
||||
private final MqPersistence persistence;
|
||||
private final ProcessSpawnerService processSpawnerService;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
public static final int MAX_ATTEMPTS = 3;
|
||||
private final String inboxName;
|
||||
private final ProcessSpawnerService.ProcessId processId;
|
||||
private final ExecutorService executorService = Executors.newSingleThreadExecutor();
|
||||
private final int node;
|
||||
private final Gson gson;
|
||||
|
||||
public record Initial() implements ActorStep {}
|
||||
@Resume(behavior = ActorResumeBehavior.RETRY)
|
||||
public record Monitor(int errorAttempts) implements ActorStep {}
|
||||
@Resume(behavior = ActorResumeBehavior.RESTART)
|
||||
public record Run(int attempts) implements ActorStep {}
|
||||
@Terminal
|
||||
public record Aborted() implements ActorStep {}
|
||||
|
||||
@Override
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch (self) {
|
||||
case Initial i -> {
|
||||
PingRequest request = new PingRequest();
|
||||
persistence.sendNewMessage(inboxName, null, null,
|
||||
"PingRequest",
|
||||
gson.toJson(request),
|
||||
null);
|
||||
|
||||
yield new Monitor(0);
|
||||
}
|
||||
case Monitor(int errorAttempts) -> {
|
||||
for (;;) {
|
||||
var messages = persistence.eavesdrop(inboxName, 1);
|
||||
|
||||
if (messages.isEmpty() && !processSpawnerService.isRunning(processId)) {
|
||||
synchronized (processId) {
|
||||
processId.wait(5000);
|
||||
}
|
||||
|
||||
if (errorAttempts > 0) { // Reset the error counter if there is silence in the inbox
|
||||
yield new Monitor(0);
|
||||
}
|
||||
// else continue
|
||||
} else {
|
||||
// Special: Associate this thread with the message so that we can get tracking
|
||||
MqMessageHandlerRegistry.register(messages.getFirst().msgId());
|
||||
|
||||
yield new Run(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
case Run(int attempts) -> {
|
||||
try {
|
||||
long startTime = System.currentTimeMillis();
|
||||
var exec = new TaskExecution();
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
if (exec.isError()) {
|
||||
if (attempts < MAX_ATTEMPTS)
|
||||
yield new Run(attempts + 1);
|
||||
else
|
||||
yield new Error();
|
||||
}
|
||||
else if (endTime - startTime < TimeUnit.SECONDS.toMillis(1)) {
|
||||
// To avoid boot loops, we transition to error if the process
|
||||
// didn't run for longer than 1 seconds. This might happen if
|
||||
// the process crashes before it can reach the heartbeat and inbox
|
||||
// stages of execution. In this case it would not report having acted
|
||||
// on its message, and the process would be restarted forever without
|
||||
// the attempts counter incrementing.
|
||||
yield new Error("Process terminated within 1 seconds of starting");
|
||||
}
|
||||
}
|
||||
catch (InterruptedException ex) {
|
||||
// We get this exception when the process is cancelled by the user
|
||||
|
||||
processSpawnerService.kill(processId);
|
||||
setCurrentMessageToDead();
|
||||
|
||||
yield new Aborted();
|
||||
}
|
||||
|
||||
yield new Monitor(attempts);
|
||||
}
|
||||
default -> new Error();
|
||||
};
|
||||
}
|
||||
|
||||
public String describe() {
|
||||
return "Spawns a(n) " + processId + " process and monitors its inbox for messages";
|
||||
}
|
||||
|
||||
/**
 * Injection constructor.
 *
 * @param gson                  serializer, also passed to the superclass
 * @param configuration         source of the node id used to build the inbox name
 * @param persistence           message queue persistence used to read and update the inbox
 * @param processSpawnerService service used to trigger, query and kill the ping process
 */
@Inject
public PingMonitorActor(Gson gson,
                        ServiceConfiguration configuration,
                        MqPersistence persistence,
                        ProcessSpawnerService processSpawnerService) throws SQLException
{
    super(gson);

    this.gson = gson;
    this.persistence = persistence;
    this.processSpawnerService = processSpawnerService;
    this.processId = ProcessSpawnerService.ProcessId.PING;

    // The inbox is per node: the node id is appended to the shared ping inbox name
    this.node = configuration.node();
    this.inboxName = ProcessInboxNames.PING_INBOX + ":" + this.node;
}
|
||||
|
||||
/** Sets the message to dead in the database to avoid
|
||||
* the service respawning on the same task when we
|
||||
* re-enable this actor */
|
||||
private void setCurrentMessageToDead() {
|
||||
try {
|
||||
var messages = persistence.eavesdrop(inboxName, 1);
|
||||
|
||||
if (messages.isEmpty()) // Possibly a race condition where the task is already finished
|
||||
return;
|
||||
|
||||
var theMessage = messages.iterator().next();
|
||||
persistence.updateMessageState(theMessage.msgId(), MqMessageState.DEAD);
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Tried but failed to set the message for " + processId + " to dead", ex);
|
||||
}
|
||||
}
|
||||
|
||||
/** Encapsulates the execution of the process in a separate thread so that
|
||||
* we can interrupt the thread if the process is cancelled */
|
||||
private class TaskExecution {
|
||||
private final AtomicBoolean error = new AtomicBoolean(false);
|
||||
public TaskExecution() throws ExecutionException, InterruptedException {
|
||||
// Run this call in a separate thread so that this thread can be interrupted waiting for it
|
||||
executorService.submit(() -> {
|
||||
try {
|
||||
processSpawnerService.trigger(processId);
|
||||
} catch (Exception e) {
|
||||
logger.warn("Error in triggering process", e);
|
||||
error.set(true);
|
||||
}
|
||||
}).get(); // Wait for the process to start
|
||||
}
|
||||
|
||||
public boolean isError() {
|
||||
return error.get();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -8,7 +8,7 @@ import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.actor.state.Resume;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.service.control.ServiceEventLog;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
@@ -21,7 +21,7 @@ import java.util.concurrent.TimeUnit;
|
||||
public class ProcessLivenessMonitorActor extends RecordActorPrototype {
|
||||
|
||||
private final ServiceEventLog eventLogService;
|
||||
private final ProcessService processService;
|
||||
private final ProcessSpawnerService processSpawnerService;
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
private final int node;
|
||||
@@ -49,7 +49,7 @@ public class ProcessLivenessMonitorActor extends RecordActorPrototype {
|
||||
var processId = heartbeat.getProcessId();
|
||||
if (null == processId) continue;
|
||||
|
||||
if (processService.isRunning(processId) && heartbeat.lastSeenMillis() < 10_000)
|
||||
if (processSpawnerService.isRunning(processId) && heartbeat.lastSeenMillis() < 10_000)
|
||||
continue;
|
||||
|
||||
flagProcessAsStopped(heartbeat);
|
||||
@@ -72,12 +72,12 @@ public class ProcessLivenessMonitorActor extends RecordActorPrototype {
|
||||
public ProcessLivenessMonitorActor(Gson gson,
|
||||
ServiceEventLog eventLogService,
|
||||
ServiceConfiguration configuration,
|
||||
ProcessService processService,
|
||||
ProcessSpawnerService processSpawnerService,
|
||||
HikariDataSource dataSource) {
|
||||
super(gson);
|
||||
this.node = configuration.node();
|
||||
this.eventLogService = eventLogService;
|
||||
this.processService = processService;
|
||||
this.processSpawnerService = processSpawnerService;
|
||||
this.dataSource = dataSource;
|
||||
}
|
||||
|
||||
@@ -208,8 +208,8 @@ public class ProcessLivenessMonitorActor extends RecordActorPrototype {
|
||||
public boolean isRunning() {
|
||||
return "RUNNING".equals(status);
|
||||
}
|
||||
public ProcessService.ProcessId getProcessId() {
|
||||
return ProcessService.translateExternalIdBase(processBase);
|
||||
public ProcessSpawnerService.ProcessId getProcessId() {
|
||||
return ProcessSpawnerService.translateExternalIdBase(processBase);
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -47,6 +47,8 @@ public class ScrapeFeedsActor extends RecordActorPrototype {
|
||||
|
||||
private final Path feedPath = WmsaHome.getHomePath().resolve("data/scrape-urls.txt");
|
||||
|
||||
private static boolean insertFoundDomains = Boolean.getBoolean("loader.insertFoundDomains");
|
||||
|
||||
public record Initial() implements ActorStep {}
|
||||
@Resume(behavior = ActorResumeBehavior.RETRY)
|
||||
public record Wait(String ts) implements ActorStep {}
|
||||
@@ -57,6 +59,8 @@ public class ScrapeFeedsActor extends RecordActorPrototype {
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch(self) {
|
||||
case Initial() -> {
|
||||
if (!insertFoundDomains) yield new Error("Domain insertion prohibited, aborting");
|
||||
|
||||
if (nodeConfigurationService.get(nodeId).profile() != NodeProfile.REALTIME) {
|
||||
yield new Error("Invalid node profile for RSS update");
|
||||
}
|
||||
|
@@ -3,11 +3,11 @@ package nu.marginalia.actor.task;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.state.ActorControlFlowException;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.mq.MqMessage;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@@ -20,13 +20,13 @@ public class ActorProcessWatcher {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(ActorProcessWatcher.class);
|
||||
private final MqPersistence persistence;
|
||||
private final ProcessService processService;
|
||||
private final ProcessSpawnerService processSpawnerService;
|
||||
|
||||
@Inject
|
||||
public ActorProcessWatcher(MqPersistence persistence,
|
||||
ProcessService processService) {
|
||||
ProcessSpawnerService processSpawnerService) {
|
||||
this.persistence = persistence;
|
||||
this.processService = processService;
|
||||
this.processSpawnerService = processSpawnerService;
|
||||
}
|
||||
|
||||
/** Wait for a process to start, and then wait for a response from the process,
|
||||
@@ -36,7 +36,7 @@ public class ActorProcessWatcher {
|
||||
* <p>
|
||||
* When interrupted, the process is killed and the message is marked as dead.
|
||||
*/
|
||||
public MqMessage waitResponse(MqOutbox outbox, ProcessService.ProcessId processId, long msgId)
|
||||
public MqMessage waitResponse(MqOutbox outbox, ProcessSpawnerService.ProcessId processId, long msgId)
|
||||
throws ActorControlFlowException, InterruptedException, SQLException
|
||||
{
|
||||
// enums values only have a single instance,
|
||||
@@ -65,7 +65,7 @@ public class ActorProcessWatcher {
|
||||
// This will prevent the monitor process from attempting to respawn the process as we kill it
|
||||
|
||||
outbox.flagAsDead(msgId);
|
||||
processService.kill(processId);
|
||||
processSpawnerService.kill(processId);
|
||||
|
||||
logger.info("Process {} killed due to interrupt", processId);
|
||||
}
|
||||
@@ -94,12 +94,12 @@ public class ActorProcessWatcher {
|
||||
}
|
||||
|
||||
/** Wait the specified time for the specified process to start running (does not start the process) */
|
||||
private boolean waitForProcess(ProcessService.ProcessId processId, TimeUnit unit, int duration) throws InterruptedException {
|
||||
private boolean waitForProcess(ProcessSpawnerService.ProcessId processId, TimeUnit unit, int duration) throws InterruptedException {
|
||||
|
||||
// Wait for process to start
|
||||
long deadline = System.currentTimeMillis() + unit.toMillis(duration);
|
||||
while (System.currentTimeMillis() < deadline) {
|
||||
if (processService.isRunning(processId))
|
||||
if (processSpawnerService.isRunning(processId))
|
||||
return true;
|
||||
|
||||
TimeUnit.MILLISECONDS.sleep(100);
|
||||
|
@@ -12,7 +12,7 @@ import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mqapi.converting.ConvertRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.sideload.RedditSideloadHelper;
|
||||
import nu.marginalia.sideload.SideloadHelper;
|
||||
import nu.marginalia.sideload.StackExchangeSideloadHelper;
|
||||
@@ -218,7 +218,7 @@ public class ConvertActor extends RecordActorPrototype {
|
||||
);
|
||||
}
|
||||
case ConvertWait(FileStorageId destFid, long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(mqConverterOutbox, ProcessService.ProcessId.CONVERTER, msgId);
|
||||
var rsp = processWatcher.waitResponse(mqConverterOutbox, ProcessSpawnerService.ProcessId.CONVERTER, msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
yield new Error("Converter failed");
|
||||
|
@@ -18,7 +18,7 @@ import nu.marginalia.mqapi.index.IndexName;
|
||||
import nu.marginalia.mqapi.loading.LoadRequest;
|
||||
import nu.marginalia.nodecfg.NodeConfigurationService;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
@@ -95,7 +95,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
|
||||
case Convert(FileStorageId crawlId, FileStorageId processedId, long msgId) when msgId < 0 ->
|
||||
new Convert(crawlId, processedId, mqConverterOutbox.sendAsync(ConvertRequest.forCrawlData(crawlId, processedId)));
|
||||
case Convert(FileStorageId crawlId, FileStorageId processedId, long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(mqConverterOutbox, ProcessService.ProcessId.CONVERTER, msgId);
|
||||
var rsp = processWatcher.waitResponse(mqConverterOutbox, ProcessSpawnerService.ProcessId.CONVERTER, msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK)
|
||||
yield new Error("Converter failed");
|
||||
@@ -129,7 +129,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
|
||||
yield new Load(processedIds, id);
|
||||
}
|
||||
case Load(List<FileStorageId> processedIds, long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(mqLoaderOutbox, ProcessService.ProcessId.LOADER, msgId);
|
||||
var rsp = processWatcher.waitResponse(mqLoaderOutbox, ProcessSpawnerService.ProcessId.LOADER, msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
yield new Error("Loader failed");
|
||||
@@ -165,7 +165,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
|
||||
}
|
||||
case ReindexFwd(long id) when id < 0 -> new ReindexFwd(createIndex(IndexName.FORWARD));
|
||||
case ReindexFwd(long id) -> {
|
||||
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessService.ProcessId.INDEX_CONSTRUCTOR, id);
|
||||
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessSpawnerService.ProcessId.INDEX_CONSTRUCTOR, id);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK)
|
||||
yield new Error("Forward index construction failed");
|
||||
@@ -174,7 +174,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
|
||||
}
|
||||
case ReindexFull(long id) when id < 0 -> new ReindexFull(createIndex(IndexName.REVERSE_FULL));
|
||||
case ReindexFull(long id) -> {
|
||||
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessService.ProcessId.INDEX_CONSTRUCTOR, id);
|
||||
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessSpawnerService.ProcessId.INDEX_CONSTRUCTOR, id);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK)
|
||||
yield new Error("Full index construction failed");
|
||||
@@ -183,7 +183,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
|
||||
}
|
||||
case ReindexPrio(long id) when id < 0 -> new ReindexPrio(createIndex(IndexName.REVERSE_PRIO));
|
||||
case ReindexPrio(long id) -> {
|
||||
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessService.ProcessId.INDEX_CONSTRUCTOR, id);
|
||||
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessSpawnerService.ProcessId.INDEX_CONSTRUCTOR, id);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK)
|
||||
yield new Error("Prio index construction failed");
|
||||
|
@@ -13,7 +13,7 @@ import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mqapi.crawling.CrawlRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
@@ -76,7 +76,7 @@ public class CrawlActor extends RecordActorPrototype {
|
||||
case Crawl (long msgId, FileStorageId fid, boolean cascadeLoad) -> {
|
||||
var rsp = processWatcher.waitResponse(
|
||||
mqCrawlerOutbox,
|
||||
ProcessService.ProcessId.CRAWLER,
|
||||
ProcessSpawnerService.ProcessId.CRAWLER,
|
||||
msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
|
@@ -10,7 +10,7 @@ import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageState;
|
||||
@@ -55,7 +55,7 @@ public class ExportAtagsActor extends RecordActorPrototype {
|
||||
yield new Run(responseMsgId, crawlId, destId, newMsgId);
|
||||
}
|
||||
case Run(long responseMsgId, FileStorageId crawlId, FileStorageId destId, long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessSpawnerService.ProcessId.EXPORT_TASKS, msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
storageService.flagFileForDeletion(destId);
|
||||
|
@@ -0,0 +1,97 @@
|
||||
package nu.marginalia.actor.task;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageState;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.time.LocalDateTime;
|
||||
|
||||
@Singleton
|
||||
public class ExportDomSampleDataActor extends RecordActorPrototype {
|
||||
private final FileStorageService storageService;
|
||||
private final ActorProcessWatcher processWatcher;
|
||||
private final MqOutbox exportTasksOutbox;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final MqPersistence persistence;
|
||||
|
||||
public record Export(long responseMsgId) implements ActorStep {}
|
||||
public record Run(long responseMsgId, FileStorageId destId, long msgId) implements ActorStep {
|
||||
public Run(long responseMsgId, FileStorageId destId) {
|
||||
this(responseMsgId, destId, -1);
|
||||
}
|
||||
}
|
||||
public record Fail(long responseMsgId, String message) implements ActorStep {}
|
||||
|
||||
@Override
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch(self) {
|
||||
case Export(long responseMsgId) -> {
|
||||
persistence.updateMessageState(responseMsgId, MqMessageState.ACK);
|
||||
|
||||
var storage = storageService.allocateStorage(FileStorageType.EXPORT, "domain-sample-data-export", "Domain Sample Data " + LocalDateTime.now());
|
||||
|
||||
if (storage == null) yield new Fail(responseMsgId, "Bad storage id");
|
||||
|
||||
yield new Run(responseMsgId, storage.id());
|
||||
}
|
||||
case Run(long responseMsgId, FileStorageId destId, long msgId) when msgId < 0 -> {
|
||||
storageService.setFileStorageState(destId, FileStorageState.NEW);
|
||||
|
||||
long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.domSampleData(destId));
|
||||
yield new Run(responseMsgId, destId, newMsgId);
|
||||
}
|
||||
case Run(long responseMsgId, FileStorageId destId, long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessSpawnerService.ProcessId.EXPORT_TASKS, msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
storageService.flagFileForDeletion(destId);
|
||||
yield new Fail(responseMsgId, "Exporter failed");
|
||||
}
|
||||
else {
|
||||
storageService.setFileStorageState(destId, FileStorageState.UNSET);
|
||||
persistence.updateMessageState(responseMsgId, MqMessageState.OK);
|
||||
yield new End();
|
||||
}
|
||||
}
|
||||
case Fail(long responseMsgId, String message) -> {
|
||||
persistence.updateMessageState(responseMsgId, MqMessageState.ERR);
|
||||
yield new Error(message);
|
||||
}
|
||||
default -> new Error();
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public String describe() {
|
||||
return "Export domain sample data";
|
||||
}
|
||||
|
||||
@Inject
|
||||
public ExportDomSampleDataActor(Gson gson,
|
||||
FileStorageService storageService,
|
||||
ProcessOutboxes processOutboxes,
|
||||
MqPersistence persistence,
|
||||
ActorProcessWatcher processWatcher)
|
||||
{
|
||||
super(gson);
|
||||
this.exportTasksOutbox = processOutboxes.getExportTasksOutbox();
|
||||
this.storageService = storageService;
|
||||
this.persistence = persistence;
|
||||
this.processWatcher = processWatcher;
|
||||
}
|
||||
|
||||
}
|
@@ -10,7 +10,7 @@ import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageState;
|
||||
@@ -54,7 +54,7 @@ public class ExportFeedsActor extends RecordActorPrototype {
|
||||
yield new Run(responseMsgId, crawlId, destId, newMsgId);
|
||||
}
|
||||
case Run(long responseMsgId, _, FileStorageId destId, long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessSpawnerService.ProcessId.EXPORT_TASKS, msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
storageService.flagFileForDeletion(destId);
|
||||
|
@@ -9,7 +9,7 @@ import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageState;
|
||||
@@ -52,7 +52,7 @@ public class ExportSampleDataActor extends RecordActorPrototype {
|
||||
yield new Run(crawlId, destId, size, ctFilter, name, newMsgId);
|
||||
}
|
||||
case Run(_, FileStorageId destId, _, _, _, long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessSpawnerService.ProcessId.EXPORT_TASKS, msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
storageService.flagFileForDeletion(destId);
|
||||
|
@@ -10,7 +10,7 @@ import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageState;
|
||||
@@ -52,7 +52,7 @@ public class ExportTermFreqActor extends RecordActorPrototype {
|
||||
yield new Run(responseMsgId, crawlId, destId, newMsgId);
|
||||
}
|
||||
case Run(long responseMsgId, _, FileStorageId destId, long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessSpawnerService.ProcessId.EXPORT_TASKS, msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
storageService.flagFileForDeletion(destId);
|
||||
|
@@ -13,7 +13,7 @@ import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mqapi.crawling.LiveCrawlRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.slf4j.Logger;
|
||||
@@ -44,7 +44,6 @@ public class LiveCrawlActor extends RecordActorPrototype {
|
||||
|
||||
@Override
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
logger.info("{}", self);
|
||||
return switch (self) {
|
||||
case Initial() -> {
|
||||
yield new Monitor("-");
|
||||
@@ -75,7 +74,7 @@ public class LiveCrawlActor extends RecordActorPrototype {
|
||||
yield new LiveCrawl(feedsHash, id);
|
||||
}
|
||||
case LiveCrawl(String feedsHash, long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(mqLiveCrawlerOutbox, ProcessService.ProcessId.LIVE_CRAWLER, msgId);
|
||||
var rsp = processWatcher.waitResponse(mqLiveCrawlerOutbox, ProcessSpawnerService.ProcessId.LIVE_CRAWLER, msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
yield new Error("Crawler failed");
|
||||
|
@@ -11,7 +11,7 @@ import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mqapi.crawling.CrawlRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
@@ -51,7 +51,7 @@ public class RecrawlSingleDomainActor extends RecordActorPrototype {
|
||||
case Crawl (long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(
|
||||
mqCrawlerOutbox,
|
||||
ProcessService.ProcessId.CRAWLER,
|
||||
ProcessSpawnerService.ProcessId.CRAWLER,
|
||||
msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
|
@@ -9,7 +9,7 @@ import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@@ -34,7 +34,7 @@ public class TriggerAdjacencyCalculationActor extends RecordActorPrototype {
|
||||
yield new Run(newMsgId);
|
||||
}
|
||||
case Run(long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessSpawnerService.ProcessId.EXPORT_TASKS, msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
yield new Error("Exporter failed");
|
||||
|
@@ -5,6 +5,8 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.nsfw.NsfwDomainFilter;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
@@ -12,23 +14,26 @@ import nu.marginalia.service.module.ServiceConfiguration;
|
||||
public class UpdateNsfwFiltersActor extends RecordActorPrototype {
|
||||
private final ServiceConfiguration serviceConfiguration;
|
||||
private final NsfwDomainFilter nsfwDomainFilter;
|
||||
private final MqPersistence persistence;
|
||||
|
||||
public record Initial() implements ActorStep {}
|
||||
public record Run() implements ActorStep {}
|
||||
public record Initial(long respondMsgId) implements ActorStep {}
|
||||
public record Run(long respondMsgId) implements ActorStep {}
|
||||
|
||||
@Override
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch(self) {
|
||||
case Initial() -> {
|
||||
case Initial(long respondMsgId) -> {
|
||||
if (serviceConfiguration.node() != 1) {
|
||||
persistence.updateMessageState(respondMsgId, MqMessageState.ERR);
|
||||
yield new Error("This actor can only run on node 1");
|
||||
}
|
||||
else {
|
||||
yield new Run();
|
||||
yield new Run(respondMsgId);
|
||||
}
|
||||
}
|
||||
case Run() -> {
|
||||
case Run(long respondMsgId) -> {
|
||||
nsfwDomainFilter.fetchLists();
|
||||
persistence.updateMessageState(respondMsgId, MqMessageState.OK);
|
||||
yield new End();
|
||||
}
|
||||
default -> new Error();
|
||||
@@ -43,11 +48,13 @@ public class UpdateNsfwFiltersActor extends RecordActorPrototype {
|
||||
@Inject
|
||||
public UpdateNsfwFiltersActor(Gson gson,
|
||||
ServiceConfiguration serviceConfiguration,
|
||||
NsfwDomainFilter nsfwDomainFilter)
|
||||
NsfwDomainFilter nsfwDomainFilter,
|
||||
MqPersistence persistence)
|
||||
{
|
||||
super(gson);
|
||||
this.serviceConfiguration = serviceConfiguration;
|
||||
this.nsfwDomainFilter = nsfwDomainFilter;
|
||||
this.persistence = persistence;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -1,6 +1,7 @@
|
||||
package nu.marginalia.execution;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import io.grpc.Status;
|
||||
import io.grpc.stub.StreamObserver;
|
||||
import nu.marginalia.actor.ExecutorActor;
|
||||
import nu.marginalia.actor.ExecutorActorControlService;
|
||||
@@ -36,7 +37,7 @@ public class ExecutorCrawlGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -52,7 +53,7 @@ public class ExecutorCrawlGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -66,7 +67,7 @@ public class ExecutorCrawlGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -80,7 +81,7 @@ public class ExecutorCrawlGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -98,7 +99,7 @@ public class ExecutorCrawlGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.execution;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import io.grpc.Status;
|
||||
import io.grpc.stub.StreamObserver;
|
||||
import nu.marginalia.actor.ExecutorActor;
|
||||
import nu.marginalia.actor.ExecutorActorControlService;
|
||||
@@ -38,7 +39,7 @@ public class ExecutorExportGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -57,7 +58,21 @@ public class ExecutorExportGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void exportDomSampleData(RpcExportDomSampleData request, StreamObserver<Empty> responseObserver) {
|
||||
try {
|
||||
actorControlService.startFrom(ExecutorActor.EXPORT_DOM_SAMPLE_DATA,
|
||||
new ExportDomSampleDataActor.Export(request.getMsgId())
|
||||
);
|
||||
responseObserver.onNext(Empty.getDefaultInstance());
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -73,7 +88,7 @@ public class ExecutorExportGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -87,7 +102,7 @@ public class ExecutorExportGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -99,7 +114,7 @@ public class ExecutorExportGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -114,14 +129,14 @@ public class ExecutorExportGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void exportAllAtags(Empty request, StreamObserver<Empty> responseObserver) {
|
||||
if (serviceConfiguration.node() != 1) {
|
||||
responseObserver.onError(new IllegalArgumentException("Export all atags is only available on node 1"));
|
||||
responseObserver.onError(Status.UNAVAILABLE.withDescription("Export all atags is only available on node 1").asRuntimeException());
|
||||
}
|
||||
try {
|
||||
actorControlService.startFrom(ExecutorActor.PREC_EXPORT_ALL,
|
||||
@@ -131,7 +146,7 @@ public class ExecutorExportGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -145,7 +160,7 @@ public class ExecutorExportGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -159,7 +174,7 @@ public class ExecutorExportGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -1,6 +1,7 @@
|
||||
package nu.marginalia.execution;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import io.grpc.Status;
|
||||
import io.grpc.stub.StreamObserver;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.actor.ActorApi;
|
||||
@@ -10,6 +11,7 @@ import nu.marginalia.actor.state.ActorStateInstance;
|
||||
import nu.marginalia.actor.task.DownloadSampleActor;
|
||||
import nu.marginalia.actor.task.RestoreBackupActor;
|
||||
import nu.marginalia.actor.task.TriggerAdjacencyCalculationActor;
|
||||
import nu.marginalia.actor.task.UpdateNsfwFiltersActor;
|
||||
import nu.marginalia.functions.execution.api.*;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import nu.marginalia.service.server.DiscoverableService;
|
||||
@@ -57,7 +59,7 @@ public class ExecutorGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -69,7 +71,7 @@ public class ExecutorGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -81,7 +83,7 @@ public class ExecutorGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -95,7 +97,7 @@ public class ExecutorGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -111,7 +113,7 @@ public class ExecutorGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -127,7 +129,7 @@ public class ExecutorGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -202,7 +204,7 @@ public class ExecutorGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -228,7 +230,7 @@ public class ExecutorGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -263,4 +265,19 @@ public class ExecutorGrpcService
|
||||
System.exit(0);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updateNsfwFilters(RpcUpdateNsfwFilters request, StreamObserver<Empty> responseObserver) {
|
||||
logger.info("Got request {}", request);
|
||||
try {
|
||||
actorControlService.startFrom(ExecutorActor.UPDATE_NSFW_LISTS,
|
||||
new UpdateNsfwFiltersActor.Initial(request.getMsgId()));
|
||||
|
||||
responseObserver.onNext(Empty.getDefaultInstance());
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Failed to update nsfw filters", e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -1,6 +1,7 @@
|
||||
package nu.marginalia.execution;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import io.grpc.Status;
|
||||
import io.grpc.stub.StreamObserver;
|
||||
import nu.marginalia.actor.ExecutorActor;
|
||||
import nu.marginalia.actor.ExecutorActorControlService;
|
||||
@@ -33,7 +34,7 @@ public class ExecutorSideloadGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -48,7 +49,7 @@ public class ExecutorSideloadGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -63,7 +64,7 @@ public class ExecutorSideloadGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -78,7 +79,7 @@ public class ExecutorSideloadGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -93,7 +94,7 @@ public class ExecutorSideloadGrpcService
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -5,9 +5,9 @@ import com.google.inject.Singleton;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.converting.ConverterMain;
|
||||
import nu.marginalia.crawl.CrawlerMain;
|
||||
import nu.marginalia.index.IndexConstructorMain;
|
||||
import nu.marginalia.livecrawler.LiveCrawlerMain;
|
||||
import nu.marginalia.loading.LoaderMain;
|
||||
import nu.marginalia.ndp.NdpMain;
|
||||
import nu.marginalia.ping.PingMain;
|
||||
import nu.marginalia.service.control.ServiceEventLog;
|
||||
import nu.marginalia.service.server.BaseServiceParams;
|
||||
@@ -28,7 +28,7 @@ import java.util.List;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
@Singleton
|
||||
public class ProcessService {
|
||||
public class ProcessSpawnerService {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final Marker processMarker = MarkerFactory.getMarker("PROCESS");
|
||||
|
||||
@@ -56,7 +56,8 @@ public class ProcessService {
|
||||
LIVE_CRAWLER(LiveCrawlerMain.class),
|
||||
CONVERTER(ConverterMain.class),
|
||||
LOADER(LoaderMain.class),
|
||||
INDEX_CONSTRUCTOR(IndexConstructorMain.class),
|
||||
INDEX_CONSTRUCTOR("nu.marginalia.index.IndexConstructorMain"),
|
||||
NDP(NdpMain.class),
|
||||
EXPORT_TASKS(ExportTasksMain.class),
|
||||
;
|
||||
|
||||
@@ -64,6 +65,9 @@ public class ProcessService {
|
||||
ProcessId(Class<? extends ProcessMainClass> mainClass) {
|
||||
this.mainClass = mainClass.getName();
|
||||
}
|
||||
ProcessId(String mainClassFullName) {
|
||||
this.mainClass = mainClassFullName;
|
||||
}
|
||||
|
||||
List<String> envOpts() {
|
||||
String variable = switch (this) {
|
||||
@@ -72,6 +76,7 @@ public class ProcessService {
|
||||
case CONVERTER -> "CONVERTER_PROCESS_OPTS";
|
||||
case LOADER -> "LOADER_PROCESS_OPTS";
|
||||
case PING -> "PING_PROCESS_OPTS";
|
||||
case NDP -> "NDP_PROCESS_OPTS";
|
||||
case INDEX_CONSTRUCTOR -> "INDEX_CONSTRUCTION_PROCESS_OPTS";
|
||||
case EXPORT_TASKS -> "EXPORT_TASKS_PROCESS_OPTS";
|
||||
};
|
||||
@@ -85,7 +90,7 @@ public class ProcessService {
|
||||
}
|
||||
|
||||
@Inject
|
||||
public ProcessService(BaseServiceParams params) {
|
||||
public ProcessSpawnerService(BaseServiceParams params) {
|
||||
this.eventLog = params.eventLog;
|
||||
this.node = params.configuration.node();
|
||||
}
|
@@ -5,6 +5,7 @@ import com.github.luben.zstd.ZstdOutputStream;
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.IndexLocations;
|
||||
import nu.marginalia.index.journal.IndexJournal;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.linkdb.LinkdbFileNames;
|
||||
import nu.marginalia.service.control.ServiceHeartbeat;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
@@ -13,18 +14,18 @@ import nu.marginalia.storage.model.FileStorageType;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.SQLException;
|
||||
import java.time.LocalDateTime;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.Map;
|
||||
|
||||
public class BackupService {
|
||||
|
||||
private final FileStorageService storageService;
|
||||
private final LanguageConfiguration languageConfiguration;
|
||||
private final ServiceHeartbeat serviceHeartbeat;
|
||||
|
||||
public enum BackupHeartbeatSteps {
|
||||
@@ -36,8 +37,10 @@ public class BackupService {
|
||||
|
||||
@Inject
|
||||
public BackupService(FileStorageService storageService,
|
||||
LanguageConfiguration languageConfiguration,
|
||||
ServiceHeartbeat serviceHeartbeat) {
|
||||
this.storageService = storageService;
|
||||
this.languageConfiguration = languageConfiguration;
|
||||
this.serviceHeartbeat = serviceHeartbeat;
|
||||
}
|
||||
|
||||
@@ -98,22 +101,25 @@ public class BackupService {
|
||||
}
|
||||
|
||||
|
||||
private void backupJournal(Path inputStorage, Path backupStorage) throws IOException
|
||||
{
|
||||
Optional<IndexJournal> journal = IndexJournal.findJournal(inputStorage);
|
||||
if (journal.isEmpty()) {
|
||||
throw new FileNotFoundException("No journal found in input storage");
|
||||
private void backupJournal(Path inputStorage, Path backupStorage) throws IOException {
|
||||
Map<String, IndexJournal> journals = IndexJournal.findJournals(inputStorage, languageConfiguration.languages());
|
||||
for (IndexJournal journal : journals.values()) {
|
||||
FileUtils.copyDirectory(journal.journalDir().toFile(), backupStorage.resolve(journal.journalDir().getFileName()).toFile());
|
||||
}
|
||||
|
||||
FileUtils.copyDirectory(journal.get().journalDir().toFile(), backupStorage.resolve(journal.get().journalDir().getFileName()).toFile());
|
||||
}
|
||||
|
||||
private void restoreJournal(Path destStorage, Path backupStorage) throws IOException {
|
||||
Optional<IndexJournal> journal = IndexJournal.findJournal(backupStorage);
|
||||
if (journal.isEmpty()) {
|
||||
throw new FileNotFoundException("No journal found in backup");
|
||||
Map<String, IndexJournal> journals = IndexJournal.findJournals(backupStorage, languageConfiguration.languages());
|
||||
for (IndexJournal journal : journals.values()) {
|
||||
var journalFileName = journal.journalDir().getFileName();
|
||||
|
||||
// Ensure we delete any previous journal junk
|
||||
if (Files.exists(destStorage.resolve(journalFileName))) {
|
||||
FileUtils.deleteDirectory(destStorage.resolve(journalFileName).toFile());
|
||||
}
|
||||
|
||||
FileUtils.copyDirectory(backupStorage.resolve(journalFileName).toFile(), destStorage.toFile());
|
||||
}
|
||||
FileUtils.copyDirectory(backupStorage.resolve(journal.get().journalDir().getFileName()).toFile(), destStorage.toFile());
|
||||
}
|
||||
|
||||
private void backupFileCompressed(String fileName, Path inputStorage, Path backupStorage) throws IOException
|
||||
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.executor;
|
||||
package nu.marginalia.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.storage.FileStorageService;
|
@@ -1,5 +1,5 @@
|
||||
The execution subsystem is responsible for the execution of long running tasks on each
|
||||
index node. It lives in the [executor-service](../services-core/executor-service) module.
|
||||
index node. It lives in the [index-service](../services-core/index-service) module.
|
||||
|
||||
It accomplishes this using the [message queue and actor library](../libraries/message-queue/),
|
||||
which permits program state to survive crashes and reboots.
|
||||
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.executor;
|
||||
package nu.marginalia.svc;
|
||||
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorage;
|
@@ -27,10 +27,12 @@ public class DbBrowseDomainsRandom {
|
||||
public List<BrowseResult> getRandomDomains(int count, DomainBlacklist blacklist, int set) {
|
||||
|
||||
final String q = """
|
||||
SELECT DOMAIN_ID, DOMAIN_NAME, INDEXED
|
||||
SELECT EC_RANDOM_DOMAINS.DOMAIN_ID, DOMAIN_NAME, INDEXED
|
||||
FROM EC_RANDOM_DOMAINS
|
||||
INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID
|
||||
LEFT JOIN DOMAIN_AVAILABILITY_INFORMATION DAI ON DAI.DOMAIN_ID=EC_RANDOM_DOMAINS.DOMAIN_ID
|
||||
WHERE STATE<2
|
||||
AND SERVER_AVAILABLE
|
||||
AND DOMAIN_SET=?
|
||||
AND DOMAIN_ALIAS IS NULL
|
||||
ORDER BY RAND()
|
||||
|
@@ -2,6 +2,8 @@ package nu.marginalia.api.domains;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.api.domains.model.DomainInformation;
|
||||
import nu.marginalia.api.domains.model.SimilarDomain;
|
||||
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||
import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
@@ -10,16 +12,19 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.concurrent.*;
|
||||
|
||||
import nu.marginalia.api.domains.model.*;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.Future;
|
||||
|
||||
@Singleton
|
||||
public class DomainInfoClient {
|
||||
private static final Logger logger = LoggerFactory.getLogger(DomainInfoClient.class);
|
||||
|
||||
private final GrpcSingleNodeChannelPool<DomainInfoAPIGrpc.DomainInfoAPIBlockingStub> channelPool;
|
||||
private final ExecutorService executor = Executors.newWorkStealingPool(8);
|
||||
|
||||
|
||||
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
|
||||
private static final ExecutorService executor = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newWorkStealingPool(8);
|
||||
|
||||
@Inject
|
||||
public DomainInfoClient(GrpcChannelPoolFactory factory) {
|
||||
|
@@ -1,8 +1,7 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
|
||||
|
||||
id 'jvm-test-suite'
|
||||
id 'gg.jte.gradle' version '3.1.15'
|
||||
}
|
||||
|
||||
java {
|
||||
@@ -14,18 +13,18 @@ java {
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation project(':code:common:config')
|
||||
implementation libs.bundles.slf4j
|
||||
implementation project(':third-party:rdrpostagger')
|
||||
implementation project(':third-party:porterstemmer')
|
||||
implementation project(':third-party:commons-codec')
|
||||
implementation project(':third-party:openzim')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:libraries:easy-lsh')
|
||||
implementation project(':code:libraries:array')
|
||||
implementation project(':code:libraries:blocking-thread-pool')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
implementation project(':code:libraries:coded-sequence')
|
||||
implementation libs.notnull
|
||||
implementation libs.bundles.jooby
|
||||
|
||||
implementation libs.guava
|
||||
implementation dependencies.create(libs.guice.get()) {
|
||||
@@ -42,3 +41,9 @@ dependencies {
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
}
|
||||
|
||||
jte {
|
||||
sourceDirectory = file('resources/ltt/jte').toPath()
|
||||
targetDirectory = file('build/classes/jte-precompiled').toPath()
|
||||
generate()
|
||||
}
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.converting.processor.logic.dom;
|
||||
package nu.marginalia.dom;
|
||||
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.nodes.Node;
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.converting.processor.logic.dom;
|
||||
package nu.marginalia.dom;
|
||||
|
||||
import org.jsoup.nodes.Node;
|
||||
import org.jsoup.nodes.TextNode;
|
@@ -16,8 +16,6 @@ public class DocumentKeywordExtractor {
|
||||
|
||||
private final TermFrequencyDict dict;
|
||||
|
||||
private final KeywordExtractor keywordExtractor = new KeywordExtractor();
|
||||
private final DocumentPositionMapper positionMapper = new DocumentPositionMapper();
|
||||
|
||||
@Inject
|
||||
public DocumentKeywordExtractor(TermFrequencyDict dict) {
|
||||
@@ -37,14 +35,18 @@ public class DocumentKeywordExtractor {
|
||||
|
||||
public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData dld, LinkTexts linkTexts, EdgeUrl url) {
|
||||
|
||||
var tfIdfCounts = new WordsTfIdfCounts(dict, keywordExtractor, dld);
|
||||
if (dld.language().hasPosParsing()) {
|
||||
DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
|
||||
|
||||
var titleKeywords = new TitleKeywords(keywordExtractor, dld);
|
||||
var nameLikeKeywords = new NameLikeKeywords(keywordExtractor, dld, 2);
|
||||
var subjectLikeKeywords = new SubjectLikeKeywords(keywordExtractor, tfIdfCounts, dld);
|
||||
var artifactKeywords = new ArtifactKeywords(dld);
|
||||
var urlKeywords = new UrlKeywords(url);
|
||||
var positionMapper = new DocumentPositionMapper();
|
||||
|
||||
var tfIdfCounts = new WordsTfIdfCounts(dict, dld);
|
||||
|
||||
var titleKeywords = new TitleKeywords(dld);
|
||||
var nameLikeKeywords = new NameLikeKeywords(dld, 2);
|
||||
var subjectLikeKeywords = new SubjectLikeKeywords(tfIdfCounts, dld);
|
||||
var keywordMetadata = KeywordMetadata.builder()
|
||||
.titleKeywords(titleKeywords)
|
||||
.nameLikeKeywords(nameLikeKeywords)
|
||||
@@ -52,7 +54,6 @@ public class DocumentKeywordExtractor {
|
||||
.urlKeywords(urlKeywords)
|
||||
.build();
|
||||
|
||||
DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
|
||||
|
||||
positionMapper.mapPositionsAndExtractSimpleKeywords(wordsBuilder, keywordMetadata, dld, linkTexts);
|
||||
|
||||
@@ -67,6 +68,22 @@ public class DocumentKeywordExtractor {
|
||||
|
||||
return wordsBuilder;
|
||||
}
|
||||
else {
|
||||
DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
|
||||
|
||||
var artifactKeywords = new ArtifactKeywords(dld);
|
||||
var urlKeywords = new UrlKeywords(url);
|
||||
var positionMapper = new DocumentPositionMapper();
|
||||
|
||||
var keywordMetadata = KeywordMetadata.builder()
|
||||
.urlKeywords(urlKeywords)
|
||||
.build();
|
||||
|
||||
positionMapper.mapPositionsAndExtractSimpleKeywords(wordsBuilder, keywordMetadata, dld, linkTexts);
|
||||
wordsBuilder.addAllSyntheticTerms(artifactKeywords.getWords());
|
||||
return wordsBuilder;
|
||||
}
|
||||
}
|
||||
|
||||
private static Collection<String> getImportantWords(WordsTfIdfCounts tfIdfCounts, NameLikeKeywords nameLikeKeywords, SubjectLikeKeywords subjectLikeKeywords, DocumentKeywordsBuilder wordsBuilder) {
|
||||
return Stream.of(nameLikeKeywords, subjectLikeKeywords)
|
@@ -3,7 +3,9 @@ package nu.marginalia.keyword;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.model.DocumentSentence;
|
||||
import nu.marginalia.language.model.LanguageDefinition;
|
||||
import nu.marginalia.language.model.WordRep;
|
||||
import nu.marginalia.language.pos.PosPatternCategory;
|
||||
import nu.marginalia.language.sentence.tag.HtmlTag;
|
||||
|
||||
import java.util.ArrayList;
|
||||
@@ -17,8 +19,6 @@ import static java.lang.Math.sqrt;
|
||||
*/
|
||||
public class DocumentPositionMapper {
|
||||
|
||||
private final KeywordExtractor keywordExtractor = new KeywordExtractor();
|
||||
|
||||
public void mapPositionsAndExtractSimpleKeywords(DocumentKeywordsBuilder wordsBuilder,
|
||||
KeywordMetadata metadata,
|
||||
DocumentLanguageData dld,
|
||||
@@ -38,12 +38,14 @@ public class DocumentPositionMapper {
|
||||
}
|
||||
|
||||
|
||||
int mapDocumentPositions(DocumentKeywordsBuilder wordsBuilder,
|
||||
public int mapDocumentPositions(DocumentKeywordsBuilder wordsBuilder,
|
||||
KeywordMetadata metadata,
|
||||
DocumentLanguageData dld)
|
||||
|
||||
{
|
||||
|
||||
LanguageDefinition languageDefinition = dld.language();
|
||||
|
||||
List<SpanRecorder> spanRecorders = new ArrayList<>();
|
||||
for (var htmlTag : HtmlTag.includedTags) {
|
||||
if (!htmlTag.exclude) {
|
||||
@@ -80,7 +82,7 @@ public class DocumentPositionMapper {
|
||||
}
|
||||
}
|
||||
|
||||
for (var names : keywordExtractor.getProperNames(sent)) {
|
||||
for (var names : languageDefinition.matchGrammarPattern(sent, PosPatternCategory.NAME)) {
|
||||
WordRep rep = new WordRep(sent, names);
|
||||
byte meta = metadata.getMetadataForWord(rep.stemmed);
|
||||
|
||||
@@ -161,11 +163,15 @@ public class DocumentPositionMapper {
|
||||
|
||||
int i = 0;
|
||||
|
||||
for (int run = 0; run < 15 && i < s.length(); run++, i++) {
|
||||
char c = s.charAt(i);
|
||||
if (c >= 'a' && c <= 'z') continue;
|
||||
if (c >= 'A' && c <= 'Z') continue;
|
||||
if (c >= '0' && c <= '9') continue;
|
||||
for (int run = 0; run < 15 && i < s.length(); run++) {
|
||||
int cp = s.codePointAt(i);
|
||||
|
||||
|
||||
if (Character.isAlphabetic(cp) || Character.isDigit(cp)) {
|
||||
i += Character.charCount(cp);
|
||||
continue;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -175,17 +181,20 @@ public class DocumentPositionMapper {
|
||||
for (int j = 0; j < 8; j++) {
|
||||
if (i == s.length()) return true;
|
||||
|
||||
if (wordPartSeparator.indexOf(s.charAt(i)) < 0) {
|
||||
if (wordPartSeparator.indexOf(s.codePointAt(i)) < 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
i++;
|
||||
|
||||
for (int run = 0; run < 10 && i < s.length(); run++, i++) {
|
||||
char c = s.charAt(i);
|
||||
if (c >= 'a' && c <= 'z') continue;
|
||||
if (c >= 'A' && c <= 'Z') continue;
|
||||
if (c >= '0' && c <= '9') continue;
|
||||
for (int run = 0; run < 10 && i < s.length(); run++) {
|
||||
int cp = s.codePointAt(i);
|
||||
|
||||
if (Character.isAlphabetic(cp) || Character.isDigit(cp)) {
|
||||
i += Character.charCount(cp);
|
||||
continue;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -193,48 +202,4 @@ public class DocumentPositionMapper {
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Helper class to record spans of words */
|
||||
private static class SpanRecorder {
|
||||
private final List<DocumentKeywordsBuilder.DocumentWordSpan> spans = new ArrayList<>();
|
||||
private final HtmlTag htmlTag;
|
||||
private int start = 0;
|
||||
|
||||
public SpanRecorder(HtmlTag htmlTag) {
|
||||
this.htmlTag = htmlTag;
|
||||
}
|
||||
|
||||
public void update(DocumentSentence sentence, int pos) {
|
||||
assert pos > 0;
|
||||
|
||||
if (sentence.htmlTags.contains(htmlTag)) {
|
||||
if (start <= 0) start = pos;
|
||||
}
|
||||
else if (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY)
|
||||
{
|
||||
// special case for body tag, we match against no tag on the sentence
|
||||
if (start <= 0) start = pos;
|
||||
}
|
||||
else {
|
||||
if (start > 0) {
|
||||
spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
|
||||
start = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void endCurrentSpan(int pos) {
|
||||
if (start > 0) {
|
||||
spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
|
||||
start = 0;
|
||||
}
|
||||
}
|
||||
|
||||
public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) {
|
||||
if (start > 0) {
|
||||
spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length));
|
||||
start = 0;
|
||||
}
|
||||
return spans;
|
||||
}
|
||||
}
|
||||
}
|
@@ -6,18 +6,24 @@ import nu.marginalia.keyword.extractors.TitleKeywords;
|
||||
import nu.marginalia.keyword.extractors.UrlKeywords;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
|
||||
public class KeywordMetadata {
|
||||
|
||||
@Nullable
|
||||
private final TitleKeywords titleKeywords;
|
||||
@Nullable
|
||||
private final NameLikeKeywords nameLikeKeywords;
|
||||
@Nullable
|
||||
private final SubjectLikeKeywords subjectLikeKeywords;
|
||||
@Nullable
|
||||
private final UrlKeywords urlKeywords;
|
||||
|
||||
public KeywordMetadata(
|
||||
TitleKeywords titleKeywords,
|
||||
NameLikeKeywords nameLikeKeywords,
|
||||
SubjectLikeKeywords subjectLikeKeywords,
|
||||
UrlKeywords urlKeywords) {
|
||||
@Nullable TitleKeywords titleKeywords,
|
||||
@Nullable NameLikeKeywords nameLikeKeywords,
|
||||
@Nullable SubjectLikeKeywords subjectLikeKeywords,
|
||||
@Nullable UrlKeywords urlKeywords) {
|
||||
this.titleKeywords = titleKeywords;
|
||||
this.nameLikeKeywords = nameLikeKeywords;
|
||||
this.subjectLikeKeywords = subjectLikeKeywords;
|
||||
@@ -32,23 +38,23 @@ public class KeywordMetadata {
|
||||
|
||||
byte flags = 0;
|
||||
|
||||
if (subjectLikeKeywords.contains(stemmed)) {
|
||||
if (subjectLikeKeywords != null && subjectLikeKeywords.contains(stemmed)) {
|
||||
flags |= WordFlags.Subjects.asBit();
|
||||
}
|
||||
|
||||
if (nameLikeKeywords.contains(stemmed)) {
|
||||
if (nameLikeKeywords != null && nameLikeKeywords.contains(stemmed)) {
|
||||
flags |= WordFlags.NamesWords.asBit();
|
||||
}
|
||||
|
||||
if (titleKeywords.contains(stemmed)) {
|
||||
if (titleKeywords != null && titleKeywords.contains(stemmed)) {
|
||||
flags |= WordFlags.Title.asBit();
|
||||
}
|
||||
|
||||
if (urlKeywords.containsUrl(stemmed)) {
|
||||
if (urlKeywords != null && urlKeywords.containsUrl(stemmed)) {
|
||||
flags |= WordFlags.UrlPath.asBit();
|
||||
}
|
||||
|
||||
if (urlKeywords.containsDomain(stemmed)) {
|
||||
if (urlKeywords != null && urlKeywords.containsDomain(stemmed)) {
|
||||
flags |= WordFlags.UrlDomain.asBit();
|
||||
}
|
||||
|
@@ -0,0 +1,52 @@
|
||||
package nu.marginalia.keyword;
|
||||
|
||||
import nu.marginalia.keyword.model.DocumentWordSpan;
|
||||
import nu.marginalia.language.model.DocumentSentence;
|
||||
import nu.marginalia.language.sentence.tag.HtmlTag;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Helper class to record spans of words
|
||||
*/
|
||||
class SpanRecorder {
|
||||
private final List<DocumentWordSpan> spans = new ArrayList<>();
|
||||
private final HtmlTag htmlTag;
|
||||
private int start = 0;
|
||||
|
||||
public SpanRecorder(HtmlTag htmlTag) {
|
||||
this.htmlTag = htmlTag;
|
||||
}
|
||||
|
||||
public void update(DocumentSentence sentence, int pos) {
|
||||
assert pos > 0;
|
||||
|
||||
if (sentence.htmlTags.contains(htmlTag)) {
|
||||
if (start <= 0) start = pos;
|
||||
} else if (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY) {
|
||||
// special case for body tag, we match against no tag on the sentence
|
||||
if (start <= 0) start = pos;
|
||||
} else {
|
||||
if (start > 0) {
|
||||
spans.add(new DocumentWordSpan(htmlTag, start, pos));
|
||||
start = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void endCurrentSpan(int pos) {
|
||||
if (start > 0) {
|
||||
spans.add(new DocumentWordSpan(htmlTag, start, pos));
|
||||
start = 0;
|
||||
}
|
||||
}
|
||||
|
||||
public List<DocumentWordSpan> finish(int length) {
|
||||
if (start > 0) {
|
||||
spans.add(new DocumentWordSpan(htmlTag, start, length));
|
||||
start = 0;
|
||||
}
|
||||
return spans;
|
||||
}
|
||||
}
|
@@ -2,11 +2,11 @@ package nu.marginalia.keyword.extractors;
|
||||
|
||||
import it.unimi.dsi.fastutil.objects.Object2IntMap;
|
||||
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
|
||||
import nu.marginalia.keyword.KeywordExtractor;
|
||||
import nu.marginalia.keyword.WordReps;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.model.DocumentSentence;
|
||||
import nu.marginalia.language.model.LanguageDefinition;
|
||||
import nu.marginalia.language.model.WordRep;
|
||||
import nu.marginalia.language.pos.PosPatternCategory;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
@@ -16,12 +16,14 @@ public class NameLikeKeywords implements WordReps {
|
||||
private final List<WordRep> nameWords;
|
||||
private final Set<String> stemmed;
|
||||
|
||||
public NameLikeKeywords(KeywordExtractor keywordExtractor, DocumentLanguageData dld, int minCount) {
|
||||
var counts = new Object2IntOpenHashMap<String>(100);
|
||||
var instances = new HashMap<String, HashSet<WordRep>>(100);
|
||||
public NameLikeKeywords(DocumentLanguageData dld, int minCount) {
|
||||
LanguageDefinition languageDefinition = dld.language();
|
||||
|
||||
Object2IntOpenHashMap<String> counts = new Object2IntOpenHashMap<String>(100);
|
||||
HashMap<String, HashSet<WordRep>> instances = new HashMap<String, HashSet<WordRep>>(100);
|
||||
|
||||
for (DocumentSentence sent : dld) {
|
||||
var keywords = keywordExtractor.getProperNames(sent);
|
||||
var keywords = languageDefinition.matchGrammarPattern(sent, PosPatternCategory.NAME);
|
||||
for (var span : keywords) {
|
||||
if (span.size() <= 1 && sent.isAllCaps(span.start))
|
||||
continue;
|
@@ -1,11 +1,11 @@
|
||||
package nu.marginalia.keyword.extractors;
|
||||
|
||||
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
|
||||
import nu.marginalia.keyword.KeywordExtractor;
|
||||
import nu.marginalia.keyword.WordReps;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.model.LanguageDefinition;
|
||||
import nu.marginalia.language.model.WordRep;
|
||||
import nu.marginalia.language.model.WordSpan;
|
||||
import nu.marginalia.language.pos.PosPatternCategory;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.*;
|
||||
@@ -23,25 +23,18 @@ public class SubjectLikeKeywords implements WordReps {
|
||||
// Greeks bearing gifts -> Greeks
|
||||
// Steve McQueen drove fast | cars -> Steve McQueen
|
||||
|
||||
public SubjectLikeKeywords(KeywordExtractor keywordExtractor,
|
||||
WordsTfIdfCounts tfIdfCounts,
|
||||
public SubjectLikeKeywords(WordsTfIdfCounts tfIdfCounts,
|
||||
DocumentLanguageData dld) {
|
||||
LanguageDefinition languageDefinition = dld.language();
|
||||
|
||||
Map<String, Set<WordRep>> instances = new HashMap<>();
|
||||
|
||||
for (var sentence : dld) {
|
||||
for (WordSpan kw : keywordExtractor.getNouns(sentence)) {
|
||||
|
||||
if (kw.end + 2 >= sentence.length()) {
|
||||
continue;
|
||||
}
|
||||
if (sentence.isSeparatorComma(kw.end) || sentence.isSeparatorComma(kw.end + 1))
|
||||
for (WordSpan kw : languageDefinition.matchGrammarPattern(sentence, PosPatternCategory.NOUN)) {
|
||||
if (sentence.nextCommaPos(kw.end - 1) <= kw.end)
|
||||
continue;
|
||||
|
||||
String nextTag = sentence.posTags[kw.end];
|
||||
String nextNextTag = sentence.posTags[kw.end+1];
|
||||
|
||||
if (isVerb(nextTag) && isDetOrAdverbOrVerbOrNoun(nextNextTag)) {
|
||||
if (languageDefinition.matchGrammarPattern(sentence, PosPatternCategory.SUBJECT_SUFFIX, kw.end)) {
|
||||
var span = new WordSpan(kw.start, kw.end);
|
||||
var rep = new WordRep(sentence, span);
|
||||
|
||||
@@ -94,17 +87,4 @@ public class SubjectLikeKeywords implements WordReps {
|
||||
return tfIdfCounts.getTfIdf(stemmed);
|
||||
}
|
||||
|
||||
private boolean isDetOrAdverbOrVerbOrNoun(String posTag) {
|
||||
return "DT".equals(posTag) // determinant
|
||||
|| posTag.startsWith("RB") // adverb
|
||||
|| posTag.startsWith("VB") // verb
|
||||
|| posTag.startsWith("JJ") // adjective
|
||||
|| posTag.startsWith("P")
|
||||
|| posTag.startsWith("NN");
|
||||
}
|
||||
|
||||
boolean isVerb(String posTag) {
|
||||
return posTag.startsWith("VB")
|
||||
&& !posTag.equals("VB"); // not interested in the infinitive
|
||||
}
|
||||
}
|
@@ -1,8 +1,7 @@
|
||||
package nu.marginalia.keyword.extractors;
|
||||
|
||||
import nu.marginalia.keyword.KeywordExtractor;
|
||||
import nu.marginalia.keyword.WordReps;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.model.LanguageDefinition;
|
||||
import nu.marginalia.language.model.WordRep;
|
||||
import nu.marginalia.language.sentence.tag.HtmlTag;
|
||||
|
||||
@@ -15,10 +14,12 @@ public class TitleKeywords implements WordReps {
|
||||
private final Set<WordRep> titleKeywords;
|
||||
private final Set<String> stemmed;
|
||||
|
||||
public TitleKeywords(KeywordExtractor keywordExtractor, DocumentLanguageData documentLanguageData) {
|
||||
titleKeywords = documentLanguageData.findSentencesForTag(HtmlTag.TITLE).stream()
|
||||
public TitleKeywords(DocumentLanguageData dld) {
|
||||
LanguageDefinition languageDefinition = dld.language();
|
||||
|
||||
titleKeywords = dld.findSentencesForTag(HtmlTag.TITLE).stream()
|
||||
.flatMap(sent ->
|
||||
keywordExtractor.getWordsFromSentence(sent).stream().sorted().distinct().map(w -> new WordRep(sent, w)))
|
||||
languageDefinition.getWordsFromSentence(sent).stream().sorted().distinct().map(w -> new WordRep(sent, w)))
|
||||
.limit(100)
|
||||
.collect(Collectors.toSet());
|
||||
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.keyword;
|
||||
package nu.marginalia.keyword.extractors;
|
||||
|
||||
import nu.marginalia.language.model.WordRep;
|
||||
|
@@ -1,10 +1,10 @@
|
||||
package nu.marginalia.keyword.extractors;
|
||||
|
||||
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
|
||||
import nu.marginalia.keyword.KeywordExtractor;
|
||||
import nu.marginalia.keyword.WordReps;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.model.LanguageDefinition;
|
||||
import nu.marginalia.language.model.WordRep;
|
||||
import nu.marginalia.language.pos.PosPatternCategory;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
@@ -26,14 +26,13 @@ public class WordsTfIdfCounts implements WordReps, Comparator<WordRep> {
|
||||
private final Set<WordRep> tfIdfHigh;
|
||||
|
||||
public WordsTfIdfCounts(TermFrequencyDict dict,
|
||||
KeywordExtractor keywordExtractor,
|
||||
DocumentLanguageData dld) {
|
||||
this.dict = dict;
|
||||
this.docCount = dict.docCount();
|
||||
|
||||
this.tfIdf = new Object2IntOpenHashMap<>(10_000);
|
||||
this.tfIdfHigh = new HashSet<>(100);
|
||||
|
||||
var counts = getCounts(keywordExtractor, dld);
|
||||
var counts = getCounts(dld);
|
||||
int maxVal = maxValue(counts);
|
||||
Set<String> highTfIdfInstances = new HashSet<>();
|
||||
|
||||
@@ -48,9 +47,10 @@ public class WordsTfIdfCounts implements WordReps, Comparator<WordRep> {
|
||||
|
||||
// Collect words with a high TF-IDF so that they can be marked with a bit flag
|
||||
|
||||
tfIdfHigh = new HashSet<>(100);
|
||||
LanguageDefinition languageDefinition = dld.language();
|
||||
|
||||
for (var sent : dld) {
|
||||
var keywords = keywordExtractor.getKeywordsFromSentence(sent);
|
||||
var keywords = languageDefinition.matchGrammarPattern(sent, PosPatternCategory.KEYWORD);
|
||||
for (var span : keywords) {
|
||||
if (highTfIdfInstances.contains(sent.constructStemmedWordFromSpan(span))) {
|
||||
tfIdfHigh.add(new WordRep(sent, span));
|
||||
@@ -60,12 +60,14 @@ public class WordsTfIdfCounts implements WordReps, Comparator<WordRep> {
|
||||
|
||||
}
|
||||
|
||||
private Object2IntOpenHashMap<String> getCounts(KeywordExtractor keywordExtractor, DocumentLanguageData dld) {
|
||||
private Object2IntOpenHashMap<String> getCounts(DocumentLanguageData dld) {
|
||||
LanguageDefinition languageDefinition = dld.language();
|
||||
|
||||
Object2IntOpenHashMap<String> counts = new Object2IntOpenHashMap<>(10_000, 0.7f);
|
||||
counts.defaultReturnValue(0);
|
||||
|
||||
for (var sent : dld) {
|
||||
var keywords = keywordExtractor.getKeywordsFromSentence(sent);
|
||||
var keywords = languageDefinition.matchGrammarPattern(sent, PosPatternCategory.KEYWORD);
|
||||
for (var span : keywords) {
|
||||
counts.addTo(sent.constructStemmedWordFromSpan(span), 1);
|
||||
}
|
@@ -0,0 +1,23 @@
|
||||
package nu.marginalia.keyword.model;
|
||||
|
||||
import nu.marginalia.sequence.VarintCodedSequence;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public record DocumentKeywords(List<String> keywords,
|
||||
byte[] metadata,
|
||||
List<VarintCodedSequence> positions,
|
||||
byte[] spanCodes,
|
||||
List<VarintCodedSequence> spanSequences) {
|
||||
|
||||
public boolean isEmpty() {
|
||||
return keywords.isEmpty();
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return keywords.size();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@@ -5,13 +5,11 @@ import it.unimi.dsi.fastutil.ints.IntArrayList;
|
||||
import it.unimi.dsi.fastutil.ints.IntList;
|
||||
import it.unimi.dsi.fastutil.objects.Object2ByteOpenHashMap;
|
||||
import nu.marginalia.language.sentence.tag.HtmlTag;
|
||||
import nu.marginalia.model.idx.CodedWordSpan;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
import nu.marginalia.sequence.VarintCodedSequence;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.*;
|
||||
|
||||
public class DocumentKeywordsBuilder {
|
||||
@@ -29,6 +27,7 @@ public class DocumentKeywordsBuilder {
|
||||
// be plenty. The lexicon writer has another limit that's higher.
|
||||
private final int MAX_WORD_LENGTH = 64;
|
||||
private final int MAX_POSITIONS_PER_WORD = 512;
|
||||
private final int MAX_SPANS_PER_TYPE = 8192;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(DocumentKeywordsBuilder.class);
|
||||
|
||||
@@ -36,13 +35,22 @@ public class DocumentKeywordsBuilder {
|
||||
this(1600);
|
||||
}
|
||||
|
||||
public DocumentKeywords build(ByteBuffer workArea) {
|
||||
public DocumentKeywordsBuilder(int capacity) {
|
||||
wordToMeta = new Object2ByteOpenHashMap<>(capacity);
|
||||
wordToPos = new HashMap<>(capacity);
|
||||
}
|
||||
|
||||
|
||||
public DocumentKeywords build() {
|
||||
final List<String> wordArray = new ArrayList<>(wordToMeta.size());
|
||||
final TByteArrayList meta = new TByteArrayList(wordToMeta.size());
|
||||
final List<VarintCodedSequence> positions = new ArrayList<>(wordToMeta.size());
|
||||
final List<VarintCodedSequence> spanSequences = new ArrayList<>(wordSpans.size());
|
||||
final byte[] spanCodes = new byte[wordSpans.size()];
|
||||
|
||||
var iter = wordToMeta.object2ByteEntrySet().fastIterator();
|
||||
|
||||
// Encode positions
|
||||
while (iter.hasNext()) {
|
||||
var entry = iter.next();
|
||||
|
||||
@@ -59,27 +67,26 @@ public class DocumentKeywordsBuilder {
|
||||
}
|
||||
|
||||
// Encode spans
|
||||
List<CodedWordSpan> spans = new ArrayList<>(wordSpans.size());
|
||||
|
||||
wordSpans.forEach((tag, spansForTag) -> {
|
||||
spansForTag.sort(Comparator.comparingInt(DocumentWordSpan::start));
|
||||
|
||||
var positionsForTag = new IntArrayList(spansForTag.size() * 2);
|
||||
|
||||
for (var span : spansForTag) {
|
||||
positionsForTag.add(span.start());
|
||||
positionsForTag.add(span.end());
|
||||
|
||||
if (positionsForTag.size() >= MAX_SPANS_PER_TYPE)
|
||||
break;
|
||||
}
|
||||
|
||||
spans.add(new CodedWordSpan(tag.code, VarintCodedSequence.generate(positionsForTag)));
|
||||
spanCodes[spanSequences.size()] = tag.code;
|
||||
spanSequences.add(VarintCodedSequence.generate(positionsForTag));
|
||||
});
|
||||
|
||||
return new DocumentKeywords(wordArray, meta.toArray(), positions, spans);
|
||||
return new DocumentKeywords(wordArray, meta.toArray(), positions, spanCodes, spanSequences);
|
||||
}
|
||||
|
||||
public DocumentKeywordsBuilder(int capacity) {
|
||||
wordToMeta = new Object2ByteOpenHashMap<>(capacity);
|
||||
wordToPos = new HashMap<>(capacity);
|
||||
}
|
||||
|
||||
public void addMeta(String word, byte meta) {
|
||||
if (word.length() > MAX_WORD_LENGTH)
|
||||
@@ -113,6 +120,13 @@ public class DocumentKeywordsBuilder {
|
||||
newWords.forEach(word -> wordToMeta.putIfAbsent(word, meta));
|
||||
}
|
||||
|
||||
public void addSyntheticTerm(String newWord) {
|
||||
byte meta = WordFlags.Synthetic.asBit();
|
||||
|
||||
wordToMeta.putIfAbsent(newWord, meta);
|
||||
}
|
||||
|
||||
|
||||
public List<String> getWordsWithAnyFlag(long flags) {
|
||||
List<String> ret = new ArrayList<>();
|
||||
|
||||
@@ -167,6 +181,4 @@ public class DocumentKeywordsBuilder {
|
||||
return this.importantWords;
|
||||
}
|
||||
|
||||
public record DocumentWordSpan(HtmlTag tag, int start, int end) {
|
||||
}
|
||||
}
|
@@ -0,0 +1,6 @@
|
||||
package nu.marginalia.keyword.model;
|
||||
|
||||
import nu.marginalia.language.sentence.tag.HtmlTag;
|
||||
|
||||
public record DocumentWordSpan(HtmlTag tag, int start, int end) {
|
||||
}
|
@@ -0,0 +1,172 @@
|
||||
package nu.marginalia.language;
|
||||
|
||||
import io.jooby.Context;
|
||||
import io.jooby.Jooby;
|
||||
import io.jooby.MapModelAndView;
|
||||
import io.jooby.ModelAndView;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
||||
import nu.marginalia.keyword.LinkTexts;
|
||||
import nu.marginalia.keyword.extractors.*;
|
||||
import nu.marginalia.language.config.LanguageConfigLocation;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.*;
|
||||
|
||||
public class LanguageProcessingTool extends Jooby {
|
||||
private static final Logger logger = LoggerFactory.getLogger(LanguageProcessingTool.class);
|
||||
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
|
||||
private final TermFrequencyDict termFrequencyDict;
|
||||
|
||||
static void main(String[] args) {
|
||||
Jooby.runApp(args, LanguageProcessingTool::new);
|
||||
}
|
||||
|
||||
public LanguageProcessingTool() {
|
||||
try {
|
||||
LanguageModels languageModels = getLanguageModels();
|
||||
termFrequencyDict = new TermFrequencyDict(languageModels);
|
||||
|
||||
sentenceExtractorProvider = new ThreadLocalSentenceExtractorProvider(
|
||||
new LanguageConfiguration(languageModels, new LanguageConfigLocation.Experimental()),
|
||||
languageModels
|
||||
);
|
||||
Path basePath = Path.of("code/functions/language-processing/").toAbsolutePath();
|
||||
System.out.println("Base path: " + basePath);
|
||||
|
||||
if (Files.exists(basePath.resolve("resources/ltt/jte")))
|
||||
install(new nu.marginalia.service.server.jte.JteModule(basePath.resolve("resources/ltt/jte")));
|
||||
if (Files.exists(basePath.resolve("resources/ltt/static")))
|
||||
assets("/*", basePath.resolve("resources/ltt/static"));
|
||||
|
||||
get("/", this::handleKeywords);
|
||||
post("/", this::handleKeywords);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Failed to initialize LanguageProcessingTool", ex);
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
// Assign colors to the POS tags
|
||||
|
||||
@NotNull
|
||||
private ModelAndView<?> handleKeywords(Context context) throws URISyntaxException {
|
||||
if ("GET".equals(context.getMethod())) {
|
||||
return new MapModelAndView("keywords.jte")
|
||||
.put("textSample", "");
|
||||
}
|
||||
else if (!"POST".equals(context.getMethod())) {
|
||||
throw new IllegalArgumentException("Invalid method");
|
||||
}
|
||||
|
||||
String textSample = context.form("textSample").value();
|
||||
|
||||
// Run sentende extration on the text as-is
|
||||
DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(textSample);
|
||||
|
||||
// Run individual extraction logic
|
||||
var tfIdfCounts = new WordsTfIdfCounts(termFrequencyDict, dld);
|
||||
var titleKeywords = new TitleKeywords(dld);
|
||||
var nameLikeKeywords = new NameLikeKeywords(dld, 2);
|
||||
var subjectLikeKeywords = new SubjectLikeKeywords(tfIdfCounts, dld);
|
||||
var artifactKeywords = new ArtifactKeywords(dld);
|
||||
|
||||
// Run full extraction logic to capture positioning etc
|
||||
var extractedKeywords = new DocumentKeywordExtractor(termFrequencyDict)
|
||||
.extractKeywords(dld, new LinkTexts(), new EdgeUrl("https://www.example.com/"));
|
||||
|
||||
return new MapModelAndView("keywords.jte")
|
||||
.put("textSample", textSample)
|
||||
.put("language", dld.language())
|
||||
.put("tagColors", posTagStyles(dld))
|
||||
.put("sentences", dld.sentences())
|
||||
.put("tfIdfReps", tfIdfCounts.getReps())
|
||||
.put("titleReps", titleKeywords.getReps())
|
||||
.put("nameLikeReps", nameLikeKeywords.getReps())
|
||||
.put("subjectLikeReps", subjectLikeKeywords.getReps())
|
||||
.put("artifacts", artifactKeywords.getWords())
|
||||
.put("importantWords", extractedKeywords.importantWords)
|
||||
.put("positionedWords", extractedKeywords.wordToPos);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate unique colors for each POS tag, to help the UI rendering
|
||||
*/
|
||||
public static Map<Long, String> posTagStyles(DocumentLanguageData dld) {
|
||||
Map<Long, String> styles = new HashMap<>();
|
||||
|
||||
// we sort them first to ensure the most common tags are guaranteed to have
|
||||
// the largest difference between colors
|
||||
|
||||
Map<Long, Integer> counts = new HashMap<>();
|
||||
for (var sentence : dld.sentences()) {
|
||||
for (var tag : sentence.posTags) {
|
||||
counts.merge(tag, 1, Integer::sum);
|
||||
}
|
||||
}
|
||||
|
||||
List<Long> posTagsByCount = counts
|
||||
.entrySet().stream()
|
||||
.sorted(Map.Entry.comparingByValue(Comparator.reverseOrder()))
|
||||
.map(Map.Entry::getKey)
|
||||
.toList();
|
||||
|
||||
|
||||
for (int i = 0; i < posTagsByCount.size(); i++) {
|
||||
String style = "text-" + switch (i&0x7) {
|
||||
case 0 -> "red";
|
||||
case 1 -> "green";
|
||||
case 2 -> "blue";
|
||||
case 3 -> "yellow";
|
||||
case 4 -> "purple";
|
||||
case 5 -> "cyan";
|
||||
case 6 -> "pink";
|
||||
default -> "gray";
|
||||
}+"-"+switch((i/8) & 3) {
|
||||
case 0 -> "900";
|
||||
case 3 -> "500";
|
||||
case 1 -> "750";
|
||||
case 2 -> "400";
|
||||
default -> "300";
|
||||
};
|
||||
styles.put(posTagsByCount.get(i), style);
|
||||
}
|
||||
return styles;
|
||||
}
|
||||
|
||||
private static final Path LANGUAGE_MODELS_DEFAULT = WmsaHome.getHomePath().resolve("model");
|
||||
private static Path getLanguageModelsPath() {
|
||||
final Path languageModelsHome = Optional.ofNullable(System.getenv("LANGUAGE_MODELS_HOME"))
|
||||
.map(Path::of)
|
||||
.orElse(LANGUAGE_MODELS_DEFAULT);
|
||||
|
||||
if (!Files.isDirectory(languageModelsHome)) {
|
||||
throw new IllegalStateException("Could not find $LANGUAGE_MODELS_HOME, see doc/language-models.md");
|
||||
}
|
||||
return languageModelsHome;
|
||||
}
|
||||
private static LanguageModels getLanguageModels() {
|
||||
|
||||
var languageModelsHome = getLanguageModelsPath();
|
||||
|
||||
return new LanguageModels(
|
||||
languageModelsHome.resolve("tfreq-new-algo3.bin"),
|
||||
languageModelsHome.resolve("opennlp-sentence.bin"),
|
||||
languageModelsHome.resolve("English.RDR"),
|
||||
languageModelsHome.resolve("English.DICT"),
|
||||
languageModelsHome.resolve("lid.176.ftz"),
|
||||
languageModelsHome.resolve("segments.bin")
|
||||
);
|
||||
}
|
||||
}
|
@@ -0,0 +1,43 @@
|
||||
package nu.marginalia.language.config;
|
||||
|
||||
import nu.marginalia.WmsaHome;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
|
||||
sealed public interface LanguageConfigLocation {
|
||||
InputStream findLanguageConfiguration() throws IOException;
|
||||
|
||||
final class Auto implements LanguageConfigLocation {
|
||||
@Override
|
||||
public InputStream findLanguageConfiguration() throws IOException {
|
||||
Path filesystemPath = WmsaHome.getLangugeConfig();
|
||||
if (Files.exists(filesystemPath)) {
|
||||
return Files.newInputStream(filesystemPath, StandardOpenOption.READ);
|
||||
}
|
||||
if (Boolean.getBoolean("language.experimental")) {
|
||||
return ClassLoader.getSystemResourceAsStream("languages-experimental.xml");
|
||||
} else {
|
||||
return ClassLoader.getSystemResourceAsStream("languages-default.xml");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final class Experimental implements LanguageConfigLocation {
|
||||
@Override
|
||||
public InputStream findLanguageConfiguration() throws IOException {
|
||||
return ClassLoader.getSystemResourceAsStream("languages-experimental.xml");
|
||||
}
|
||||
}
|
||||
|
||||
final class Default implements LanguageConfigLocation {
|
||||
|
||||
@Override
|
||||
public InputStream findLanguageConfiguration() throws IOException {
|
||||
return ClassLoader.getSystemResourceAsStream("languages-default.xml");
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,405 @@
|
||||
package nu.marginalia.language.config;
|
||||
|
||||
import com.github.jfasttext.JFastText;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.language.encoding.UnicodeNormalization;
|
||||
import nu.marginalia.language.keywords.KeywordHasher;
|
||||
import nu.marginalia.language.model.LanguageDefinition;
|
||||
import nu.marginalia.language.pos.PosPattern;
|
||||
import nu.marginalia.language.pos.PosPatternCategory;
|
||||
import nu.marginalia.language.pos.PosTagger;
|
||||
import nu.marginalia.language.stemming.Stemmer;
|
||||
import org.jsoup.nodes.TextNode;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.w3c.dom.Document;
|
||||
import org.w3c.dom.Element;
|
||||
import org.w3c.dom.NodeList;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import javax.xml.parsers.DocumentBuilder;
|
||||
import javax.xml.parsers.DocumentBuilderFactory;
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.net.HttpURLConnection;
|
||||
import java.net.URI;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.URL;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.security.DigestInputStream;
|
||||
import java.security.MessageDigest;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.util.*;
|
||||
|
||||
@Singleton
|
||||
public class LanguageConfiguration {
|
||||
private static final Logger logger = LoggerFactory.getLogger(LanguageConfiguration.class);
|
||||
|
||||
private final Map<String, Path> resources = new HashMap<>();
|
||||
private final Map<String, LanguageDefinition> languages = new LinkedHashMap<>();
|
||||
private final JFastText fastTextLanguageModel = new JFastText();
|
||||
|
||||
/**
 * Identifies the language of a parsed HTML document from the text in its body.
 * <p>
 * Text nodes are concatenated (space separated) into a sample; once the sample
 * has grown past 4096 characters no further text is appended.
 *
 * @param jsoupDoc the parsed document to sample
 * @return the matching configured language, or empty if none was identified
 */
public Optional<LanguageDefinition> identifyLanguage(org.jsoup.nodes.Document jsoupDoc) {
    final StringBuilder text = new StringBuilder();

    jsoupDoc.body().traverse((node, _) -> {
        if (text.length() <= 4096 && node instanceof TextNode textNode) {
            text.append(' ').append(textNode.text());
        }
    });

    return identifyLanguage(text.toString());
}
|
||||
|
||||
public Optional<LanguageDefinition> identifyLanguage(String sample) {
|
||||
String prediction = fastTextLanguageModel.predict(sample);
|
||||
if (null == prediction)
|
||||
return Optional.empty();
|
||||
|
||||
if (prediction.length() == "__label__??".length()) {
|
||||
String isoCode = prediction.substring("__label__".length());
|
||||
return Optional.ofNullable(getLanguage(isoCode));
|
||||
}
|
||||
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
public Optional<LanguageDefinition> identifyLanguage(String sample, String fallbackIsoCode) {
|
||||
return identifyLanguage(sample).or(() -> Optional.ofNullable(getLanguage(fallbackIsoCode)));
|
||||
}
|
||||
|
||||
public List<LanguageDefinition> languages() {
|
||||
return new ArrayList<>(this.languages.values());
|
||||
}
|
||||
public Map<String, LanguageDefinition> languagesMap() {
|
||||
return Collections.unmodifiableMap(languages);
|
||||
}
|
||||
@Nullable
|
||||
public LanguageDefinition getLanguage(String language) {
|
||||
return languages.get(language);
|
||||
}
|
||||
|
||||
/**
 * Injection entry point: loads the configuration using the language models from
 * {@link WmsaHome} and the {@code Auto} configuration location.
 */
@Inject
public LanguageConfiguration() throws IOException, ParserConfigurationException, SAXException {
    this(
            WmsaHome.getLanguageModels(),
            new LanguageConfigLocation.Auto()
    );
}
|
||||
|
||||
/**
 * Loads the configuration from the given location, using the language models
 * from {@link WmsaHome}.
 *
 * @param languageFile where to find the language definition XML
 */
public LanguageConfiguration(LanguageConfigLocation languageFile)
        throws IOException, ParserConfigurationException, SAXException {
    this(
            WmsaHome.getLanguageModels(),
            languageFile
    );
}
|
||||
|
||||
public LanguageConfiguration(LanguageModels lm, LanguageConfigLocation languageFile)
|
||||
throws IOException, ParserConfigurationException, SAXException {
|
||||
fastTextLanguageModel.loadModel(lm.fasttextLanguageModel.toString());
|
||||
|
||||
try (var languagesXmlStream = languageFile.findLanguageConfiguration()) {
|
||||
if (languagesXmlStream == null)
|
||||
throw new IllegalStateException("languages-default.xml resource not found in classpath");
|
||||
|
||||
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
|
||||
DocumentBuilder builder = factory.newDocumentBuilder();
|
||||
Document doc = builder.parse(languagesXmlStream);
|
||||
|
||||
parseResources(doc);
|
||||
parseLanguages(doc);
|
||||
}
|
||||
|
||||
logger.info("Loaded language configuration: {}", languages);
|
||||
}
|
||||
|
||||
private void parseLanguages(Document doc) {
|
||||
NodeList languageNodes = doc.getElementsByTagName("language");
|
||||
|
||||
for (int i = 0; i < languageNodes.getLength(); i++) {
|
||||
Element languageTag = (Element) languageNodes.item(i);
|
||||
|
||||
boolean disabled = "TRUE".equalsIgnoreCase(languageTag.getAttribute("disabled"));
|
||||
if (disabled)
|
||||
continue;
|
||||
|
||||
String isoCode = languageTag.getAttribute("isoCode").toLowerCase();
|
||||
String name = languageTag.getAttribute("name");
|
||||
|
||||
try {
|
||||
PosTagger posTagger = parsePosTag(languageTag, isoCode);
|
||||
Stemmer stemmer = parseStemmerTag(languageTag, posTagger, isoCode);
|
||||
KeywordHasher keywordHasher = parseHasherTag(languageTag, isoCode);
|
||||
Map<PosPatternCategory, List<PosPattern>> posPatterns =
|
||||
parsePosPatterns(posTagger, languageTag, isoCode);
|
||||
UnicodeNormalization unicodeNormalization = parseUnicodeNormalization(languageTag, isoCode);
|
||||
|
||||
languages.put(isoCode,
|
||||
new LanguageDefinition(isoCode, name, stemmer, unicodeNormalization, keywordHasher, posTagger, posPatterns));
|
||||
}
|
||||
catch (IOException ex) {
|
||||
logger.error("Failed to set up language " + isoCode, ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private UnicodeNormalization parseUnicodeNormalization(Element languageTag, String isoCode) {
|
||||
NodeList normalizationTags = languageTag.getElementsByTagName("unicodeNormalization");
|
||||
if (normalizationTags.getLength() == 0)
|
||||
return new UnicodeNormalization.JustNormalizeQuotes();
|
||||
Element normalizationTag = (Element) normalizationTags.item(0);
|
||||
String algorithm = normalizationTag.getAttribute("algorithm");
|
||||
|
||||
return switch(algorithm) {
|
||||
case "minimal" -> new UnicodeNormalization.JustNormalizeQuotes();
|
||||
case "e-accents" -> new UnicodeNormalization.FlattenEAccents();
|
||||
case "german" -> new UnicodeNormalization.Flattenß();
|
||||
case "maximal-latin" -> new UnicodeNormalization.FlattenAllLatin();
|
||||
default -> throw new IllegalArgumentException("Invalida algorithm " + algorithm + " on language configuration for " + isoCode);
|
||||
};
|
||||
}
|
||||
|
||||
private Map<PosPatternCategory, List<PosPattern>> parsePosPatterns(@Nullable PosTagger posTagger,
|
||||
Element languageTag, String isoCode) {
|
||||
if (null == posTagger)
|
||||
return Map.of();
|
||||
|
||||
Map<PosPatternCategory, List<PosPattern>> ret = new HashMap<>();
|
||||
NodeList ngramsElements = languageTag.getElementsByTagName("ngrams");
|
||||
|
||||
for (int i = 0; i < ngramsElements.getLength(); i++) {
|
||||
Element ngramsTag = (Element) ngramsElements.item(i);
|
||||
String type = ngramsTag.getAttribute("type");
|
||||
|
||||
PosPatternCategory category = switch(type) {
|
||||
case "name" -> PosPatternCategory.NAME;
|
||||
case "noun" -> PosPatternCategory.NOUN;
|
||||
case "keyword" -> PosPatternCategory.KEYWORD;
|
||||
case "title" -> PosPatternCategory.TITLE;
|
||||
case "subject-suffix" -> PosPatternCategory.SUBJECT_SUFFIX;
|
||||
default -> throw new IllegalArgumentException("Invalid ngrams type in " + isoCode + ", what is '" + type + "'?");
|
||||
};
|
||||
|
||||
NodeList posPatternsList = ngramsTag.getElementsByTagName("pospattern");
|
||||
for (int j = 0; j < posPatternsList.getLength(); j++) {
|
||||
Element posPatternTag = (Element) posPatternsList.item(j);
|
||||
ret.computeIfAbsent(category, (k) -> new ArrayList<>())
|
||||
.add(new PosPattern(posTagger, posPatternTag.getTextContent()));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Nullable
|
||||
private PosTagger parsePosTag(Element languageTag, String isoCode) throws IOException {
|
||||
NodeList rdrElements = languageTag.getElementsByTagName("rdrTagger");
|
||||
if (rdrElements.getLength() < 1) {
|
||||
return null;
|
||||
}
|
||||
else if (rdrElements.getLength() > 1) {
|
||||
throw new IllegalStateException("Multiple rdr taggers defined in " + isoCode);
|
||||
}
|
||||
Element rdrElement = (Element) rdrElements.item(0);
|
||||
|
||||
String dictId = rdrElement.getAttribute("dictId");
|
||||
String rdrId = rdrElement.getAttribute("rdrId");
|
||||
|
||||
Path dictPath = resources.get(dictId);
|
||||
Path rdrPath = resources.get(rdrId);
|
||||
|
||||
if (null == dictPath)
|
||||
throw new IllegalArgumentException("language.xml: dictPath id " + dictId
|
||||
+ " does not map to a resource in " + isoCode);
|
||||
if (null == rdrPath)
|
||||
throw new IllegalArgumentException("language.xml: rdrPath id " + dictId
|
||||
+ " does not map to a resource in " + isoCode);
|
||||
|
||||
return new PosTagger(isoCode, dictPath, rdrPath);
|
||||
}
|
||||
|
||||
|
||||
private KeywordHasher parseHasherTag(Element languageElement, String isoCode) {
|
||||
NodeList keywordHasherElements = languageElement.getElementsByTagName("keywordHash");
|
||||
if (keywordHasherElements.getLength() != 1) {
|
||||
throw new IllegalArgumentException(
|
||||
"language.xml: No keywordHasher block for language element " + isoCode);
|
||||
}
|
||||
Element keywordHasheElement = (Element) keywordHasherElements.item(0);
|
||||
|
||||
String hasherName = keywordHasheElement.getAttribute("algorithm");
|
||||
|
||||
return switch (hasherName) {
|
||||
case "asciish" -> new KeywordHasher.AsciiIsh();
|
||||
case "utf8" -> new KeywordHasher.Utf8();
|
||||
default -> throw new IllegalArgumentException(
|
||||
"language.xml: Unknown keywordHash name " + hasherName + " in " + isoCode);
|
||||
};
|
||||
}
|
||||
|
||||
private Stemmer parseStemmerTag(Element languageElement, PosTagger posTagger, String isoCode) {
|
||||
NodeList stemmerElements = languageElement.getElementsByTagName("stemmer");
|
||||
if (stemmerElements.getLength() != 1) {
|
||||
throw new IllegalArgumentException(
|
||||
"language.xml: No stemmer block for language element " + isoCode);
|
||||
}
|
||||
Element stemmerElement = (Element) stemmerElements.item(0);
|
||||
|
||||
String stemmerName = stemmerElement.getAttribute("algorithm");
|
||||
String stemmerVariant = stemmerElement.getAttribute("variant");
|
||||
|
||||
PosPattern inclusionPattern = null;
|
||||
NodeList posPatternList = stemmerElement.getElementsByTagName("pospattern");
|
||||
if (posPatternList.getLength() >= 1) {
|
||||
Element posElement = (Element) posPatternList.item(0);
|
||||
inclusionPattern = new PosPattern(posTagger, posElement.getTextContent());
|
||||
}
|
||||
|
||||
return switch (stemmerName.toLowerCase()) {
|
||||
case "porter" -> new Stemmer.Porter(inclusionPattern);
|
||||
case "snowball" -> new Stemmer.Snowball(stemmerVariant, inclusionPattern);
|
||||
case "none" -> new Stemmer.NoOpStemmer();
|
||||
default -> throw new IllegalArgumentException(
|
||||
"language.xml: Unknown stemmer name " + stemmerName + " in " + isoCode);
|
||||
};
|
||||
}
|
||||
|
||||
private void parseResources(Document doc) throws IOException {
|
||||
NodeList resourceNodes = doc.getElementsByTagName("resource");
|
||||
for (int i = 0; i < resourceNodes.getLength(); i++) {
|
||||
Element resourceTag = (Element) resourceNodes.item(i);
|
||||
|
||||
String resourceId = resourceTag.getAttribute("id");
|
||||
String resourceMd5 = resourceTag.getAttribute("md5");
|
||||
Path resourcePath = WmsaHome.getDataPath().resolve(resourceTag.getAttribute("path"));
|
||||
String resourceHref = resourceTag.getAttribute("href");
|
||||
|
||||
if (!validateResource(resourcePath, resourceMd5)) {
|
||||
boolean success = false;
|
||||
try {
|
||||
success = fetchResource(resourceHref, resourcePath, resourceMd5);
|
||||
} catch (URISyntaxException | IOException ex) {
|
||||
logger.error(ex.getMessage(), ex);
|
||||
success = false;
|
||||
}
|
||||
|
||||
// It's likely if we were to just explode here, that a docker-compose restart:always
|
||||
// would put us in a
|
||||
// loop that repeatedly fails to download the same file. We'd like to avoid that by
|
||||
// stalling and
|
||||
// awaiting human intervention.
|
||||
|
||||
while (!success) {
|
||||
logger.error("Stopping to prevent restart loop");
|
||||
try {
|
||||
Thread.sleep(1000);
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (resources.put(resourceId, resourcePath) != null)
|
||||
throw new IllegalStateException(
|
||||
"Resource with id " + resourceId + " already exists");
|
||||
}
|
||||
}
|
||||
|
||||
private boolean fetchResource(String resourceUrl, Path resourcePath, String resourceMd5)
|
||||
throws IOException, URISyntaxException {
|
||||
|
||||
Path parentPath = resourcePath.getParent();
|
||||
if (!Files.isDirectory(parentPath)) {
|
||||
logger.info("Setting up directory {}", parentPath);
|
||||
Files.createDirectories(parentPath);
|
||||
}
|
||||
|
||||
logger.info("Fetching {}", resourceUrl);
|
||||
|
||||
URL url = new URI(resourceUrl).toURL();
|
||||
Path tempFile = Files.createTempFile("resource", "dat");
|
||||
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||||
try (InputStream is = conn.getInputStream();
|
||||
OutputStream os = Files.newOutputStream(tempFile, StandardOpenOption.WRITE,
|
||||
StandardOpenOption.TRUNCATE_EXISTING)) {
|
||||
is.transferTo(os);
|
||||
os.flush();
|
||||
|
||||
String actualMd5 = getFileMD5(tempFile);
|
||||
if (!resourceMd5.isBlank() && !Objects.equals(resourceMd5, actualMd5)) {
|
||||
logger.error("Freshly downloaded resource {} does not match md5sum {}", resourceUrl,
|
||||
resourceMd5);
|
||||
return false;
|
||||
} else {
|
||||
logger.info("Downloaded resource {} to {} ** md5sum {}", resourceUrl, resourcePath,
|
||||
actualMd5);
|
||||
Files.move(tempFile, resourcePath, StandardCopyOption.REPLACE_EXISTING);
|
||||
return true;
|
||||
}
|
||||
} catch (IOException ex) {
|
||||
logger.error("IOException", ex);
|
||||
return false;
|
||||
} finally {
|
||||
conn.disconnect();
|
||||
Files.deleteIfExists(tempFile);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean validateResource(Path resourcePath, String providedMd5Sum) throws IOException {
|
||||
resourcePath = resourcePath.normalize();
|
||||
|
||||
if (!resourcePath.normalize().startsWith(WmsaHome.getDataPath()))
|
||||
throw new IllegalArgumentException(
|
||||
"Resource path has escaped $WMSA_HOME/data: " + resourcePath);
|
||||
if (!Files.exists(resourcePath)) {
|
||||
logger.info("Resource path does not exist: " + resourcePath);
|
||||
return false;
|
||||
}
|
||||
|
||||
String actualMd5 = getFileMD5(resourcePath);
|
||||
if (providedMd5Sum.isBlank()) {
|
||||
logger.info("No md5sum provided for resource path: {}, but was calculated to {}",
|
||||
resourcePath, actualMd5);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (Objects.equals(actualMd5, providedMd5Sum)) {
|
||||
return true;
|
||||
} else {
|
||||
logger.error("MD5 checksum mismatch for {} -- {}", resourcePath, providedMd5Sum);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public String getFileMD5(Path filePath) {
|
||||
try (InputStream fis = Files.newInputStream(filePath)) {
|
||||
MessageDigest md = MessageDigest.getInstance("MD5");
|
||||
DigestInputStream dis = new DigestInputStream(fis, md);
|
||||
|
||||
// Read the file
|
||||
byte[] buffer = new byte[8192];
|
||||
while (dis.read(buffer) != -1) {
|
||||
// Reading updates the digest
|
||||
}
|
||||
|
||||
byte[] digest = md.digest();
|
||||
|
||||
// Convert to hex
|
||||
StringBuilder hexString = new StringBuilder();
|
||||
for (byte b : digest) {
|
||||
String hex = Integer.toHexString(0xff & b);
|
||||
if (hex.length() == 1) {
|
||||
hexString.append('0');
|
||||
}
|
||||
hexString.append(hex);
|
||||
}
|
||||
return hexString.toString();
|
||||
} catch (IOException | NoSuchAlgorithmException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,227 @@
|
||||
package nu.marginalia.language.encoding;
|
||||
|
||||
public interface UnicodeNormalization {
|
||||
|
||||
String flattenUnicode(String s);
|
||||
|
||||
static final boolean NO_FLATTEN_UNICODE =
|
||||
Boolean.getBoolean("system.noFlattenUnicode");
|
||||
|
||||
class JustNormalizeQuotes implements UnicodeNormalization {
|
||||
public String flattenUnicode(String s) {
|
||||
if (NO_FLATTEN_UNICODE)
|
||||
return s;
|
||||
|
||||
if (isPlainAscii(s)) {
|
||||
return s;
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder(s.length() + 10);
|
||||
|
||||
for (int i = 0; i < s.length(); ) {
|
||||
int c = s.codePointAt(i);
|
||||
i += Character.charCount(c);
|
||||
|
||||
if ("\u201C\u201D".indexOf(c) >= 0) {
|
||||
sb.append('"');
|
||||
}
|
||||
else {
|
||||
sb.appendCodePoint(c);
|
||||
}
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
class FlattenEAccents implements UnicodeNormalization {
|
||||
public String flattenUnicode(String s) {
|
||||
if (NO_FLATTEN_UNICODE)
|
||||
return s;
|
||||
|
||||
if (isPlainAscii(s)) {
|
||||
return s;
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder(s.length() + 10);
|
||||
|
||||
int numCp = s.codePointCount(0, s.length());
|
||||
|
||||
for (int i = 0; i < numCp;) {
|
||||
int c = s.codePointAt(i);
|
||||
i+=Character.charCount(c);
|
||||
|
||||
if ("\u201C\u201D".indexOf(c) >= 0) {
|
||||
sb.append('"');
|
||||
}
|
||||
else if ("é".indexOf(c) >= 0) {
|
||||
sb.append('e');
|
||||
}
|
||||
else {
|
||||
sb.appendCodePoint(c);
|
||||
}
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
class Flattenß implements UnicodeNormalization {
|
||||
public String flattenUnicode(String s) {
|
||||
if (NO_FLATTEN_UNICODE)
|
||||
return s;
|
||||
|
||||
if (isPlainAscii(s)) {
|
||||
return s;
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder(s.length() + 10);
|
||||
|
||||
for (int i = 0; i < s.length(); ) {
|
||||
int c = s.codePointAt(i);
|
||||
i += Character.charCount(c);
|
||||
|
||||
if ("\u201C\u201D".indexOf(c) >= 0) {
|
||||
sb.append('"');
|
||||
} else if ('ß' == c) {
|
||||
sb.append("ss");
|
||||
}
|
||||
else {
|
||||
sb.appendCodePoint(c);
|
||||
}
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
class FlattenAllLatin implements UnicodeNormalization {
|
||||
|
||||
public String flattenUnicode(String s) {
|
||||
if (NO_FLATTEN_UNICODE)
|
||||
return s;
|
||||
|
||||
if (isPlainAscii(s)) {
|
||||
return s;
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder(s.length() + 10);
|
||||
|
||||
// Falsehoods programmers believe about the latin alphabet ;-)
|
||||
for (int i = 0; i < s.length(); ) {
|
||||
int c = s.codePointAt(i);
|
||||
i += Character.charCount(c);
|
||||
|
||||
if ("\u201C\u201D".indexOf(c) >= 0) {
|
||||
sb.append('"');
|
||||
}
|
||||
else if ("áâàȁăåäāǟãąą̊ḁẚⱥ".indexOf(c) >= 0) {
|
||||
sb.append('a');
|
||||
}
|
||||
else if ("ḃḅḇƀɓ".indexOf(c) >= 0) {
|
||||
sb.append('b');
|
||||
}
|
||||
else if ("ćĉčçḉċƈȼ".indexOf(c) >= 0) {
|
||||
sb.append('c');
|
||||
}
|
||||
else if ("ɗḓďḋḍḏḑđðɖḏ".indexOf(c) >= 0) {
|
||||
sb.append('d');
|
||||
}
|
||||
else if ("éêèȅěëēẽĕęėẹȇḕḗḙḛḝɇ".indexOf(c) >= 0) {
|
||||
sb.append('e');
|
||||
}
|
||||
else if ("ḟƒ".indexOf(c) >= 0) {
|
||||
sb.append('f');
|
||||
}
|
||||
else if ("ǵĝǧğġģɠḡǥ".indexOf(c) >= 0) {
|
||||
sb.append('g');
|
||||
}
|
||||
else if ("ĥȟḧḣḥẖḩḫħⱨ".indexOf(c) >= 0) {
|
||||
sb.append('g');
|
||||
}
|
||||
else if ("iıíîìȉïḯīĩįịḭ".indexOf(c) >= 0) {
|
||||
sb.append('i');
|
||||
}
|
||||
else if ("ĵǰɉ".indexOf(c) >= 0) {
|
||||
sb.append('j');
|
||||
}
|
||||
else if ("ḱǩķḳḵƙⱪ".indexOf(c) >= 0) {
|
||||
sb.append('k');
|
||||
}
|
||||
else if ("ĺłḽľļḷḹḻƚɫⱡ".indexOf(c) >= 0) {
|
||||
sb.append('l');
|
||||
}
|
||||
else if ("ḿṁṃ".indexOf(c) >= 0) {
|
||||
sb.append('m');
|
||||
}
|
||||
else if ("ŋńǹñṋňṅṇṉʼnn̈ņ".indexOf(c) >= 0) {
|
||||
sb.append('n');
|
||||
}
|
||||
else if ("óőôòȍŏȯȱöȫōṓṑõṍṏȭøǿǫǭọȏơ".indexOf(c) >= 0) {
|
||||
sb.append('o');
|
||||
}
|
||||
else if ("ṕṗƥᵽ".indexOf(c) >= 0) {
|
||||
sb.append('p');
|
||||
}
|
||||
else if ("ꝗ".indexOf(c) >= 0) {
|
||||
sb.append('q');
|
||||
}
|
||||
else if ("ŕȑřŗṙṛṝṟɍɽ".indexOf(c) >= 0) {
|
||||
sb.append('r');
|
||||
}
|
||||
else if ("śṥŝšṧşșṡṣṩ".indexOf(c) >= 0) {
|
||||
sb.append('s');
|
||||
}
|
||||
else if ("ťṱẗţțŧṫṭṯⱦ".indexOf(c) >= 0) {
|
||||
sb.append('t');
|
||||
}
|
||||
else if ("úùûŭưűüūṻųůũṹụṳṵṷʉ".indexOf(c) >= 0) {
|
||||
sb.append('u');
|
||||
}
|
||||
else if ("ṽṿʋỽ".indexOf(c) >= 0) {
|
||||
sb.append('v');
|
||||
}
|
||||
else if ("ẃŵẁẅẘẇẉⱳ".indexOf(c) >= 0) {
|
||||
sb.append('w');
|
||||
}
|
||||
else if ("x̂ẍẋ".indexOf(c) >= 0) {
|
||||
sb.append('x');
|
||||
}
|
||||
else if ("ƴýŷỳÿȳỹẙẏy̨ɏỿ".indexOf(c) >= 0) {
|
||||
sb.append('y');
|
||||
}
|
||||
else if ("źẑžżẓẕƶȥ".indexOf(c) >= 0) {
|
||||
sb.append('z');
|
||||
}
|
||||
else if ("Þþ".indexOf(c) >= 0) {
|
||||
sb.append("th");
|
||||
}
|
||||
else if ('ß' == c) {
|
||||
sb.append("ss");
|
||||
}
|
||||
else if (isAscii(c)) {
|
||||
sb.append((char) c);
|
||||
}
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static boolean isPlainAscii(String s) {
|
||||
for (int i = 0; i < s.length(); ) {
|
||||
int c = s.codePointAt(i);
|
||||
if (!isAscii(c))
|
||||
return false;
|
||||
i += Character.charCount(c);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private static boolean isAscii(int c) {
|
||||
return (c & ~0x7f) == 0;
|
||||
}
|
||||
|
||||
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user