mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00

Comparing deploy-027...b7d3b67a1d (405 commits)
.gitignore (vendored) — 9 lines changed

@@ -7,4 +7,11 @@ build/
 lombok.config
 Dockerfile
 run
 jte-classes
+.classpath
+.project
+.settings
+.factorypath
+bin/
+*.log
+*.hprof
(from the project roadmap/TODO document; the file header was not preserved by the mirror)

@@ -48,10 +48,6 @@ filter for any API consumer.

 I've talked to the stract dev and he does not think it's a good idea to mimic their optics language, which is quite ad-hoc, but instead to work together to find some new common description language for this.

-## Show favicons next to search results
-
-This is expected from search engines. Basic proof of concept sketch of fetching this data has been done, but the feature is some way from being reality.
-
 ## Specialized crawler for github

 One of the search engine's biggest limitations right now is that it does not index github at all. A specialized crawler that fetches at least the readme.md would go a long way toward providing search capabilities in this domain.

@@ -66,6 +62,10 @@ The documents database probably should have some sort of flag indicating it's a

 PDF parsing is known to be a bit of a security liability so some thought needs to be put in
 that direction as well.

+## Show favicons next to search results (COMPLETED 2025-03)
+
+This is expected from search engines. Basic proof of concept sketch of fetching this data has been done, but the feature is some way from being reality.
+
 ## Web Design Overhaul (COMPLETED 2025-01)

 The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
build.gradle — 12 lines changed

@@ -1,11 +1,12 @@
 plugins {
     id 'java'
     id("org.jetbrains.gradle.plugin.idea-ext") version "1.0"
-    id "me.champeau.jmh" version "0.6.6"
+    id "me.champeau.jmh" version "0.7.3"

     // This is a workaround for a bug in the Jib plugin that causes it to stall randomly
     // https://github.com/GoogleContainerTools/jib/issues/3347
     id 'com.google.cloud.tools.jib' version '3.4.5' apply(false)
     id 'com.adarshr.test-logger' version '4.0.0'
 }

 group 'marginalia'

@@ -31,7 +32,10 @@ subprojects.forEach {it ->
         jvmArgs += ['--enable-preview']
     }
     it.tasks.withType(Test).configureEach {
-        jvmArgs += ['--enable-preview']
+        jvmArgs += ['--enable-preview',
+                '--enable-native-access=ALL-UNNAMED',
+                '--sun-misc-unsafe-memory-access=allow',
+                '-Dsystem.uringQueueCount=1']
     }

     // Enable reproducible builds for the entire project

@@ -43,8 +47,8 @@ subprojects.forEach {it ->
 }

 ext {
-    jvmVersion = 24
-    dockerImageBase='container-registry.oracle.com/graalvm/jdk:24'
+    jvmVersion = 25
+    dockerImageBase='container-registry.oracle.com/graalvm/jdk:25'
     dockerImageTag='latest'
     dockerImageRegistry='marginalia'
     jibVersion = '3.4.5'
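The two new JVM test flags track behavior changes in recent JDKs: `--enable-native-access=ALL-UNNAMED` grants code on the class path permission to call native code without warnings, and `--sun-misc-unsafe-memory-access=allow` re-enables the deprecated sun.misc.Unsafe memory-access methods that JDK 24+ warns about or denies by default. `-Dsystem.uringQueueCount=1` is an application property, presumably sizing the io_uring queue count for tests.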
(module build.gradle)

@@ -19,6 +19,7 @@ dependencies {

     implementation libs.bundles.slf4j
     implementation libs.bundles.mariadb
+    implementation libs.bundles.httpcomponents
     implementation libs.mockito
     implementation libs.guava
     implementation dependencies.create(libs.guice.get()) {
@@ -114,4 +114,7 @@ public class WmsaHome {
     }

+    public static Path getLangugeConfig() {
+        return getHomePath().resolve("conf/languages.xml");
+    }
 }
New file: nu/marginalia/proxy/SocksProxyConfiguration.java
@@ -0,0 +1,141 @@
package nu.marginalia.proxy;

import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;

/**
 * Configuration for SOCKS proxy settings used by crawlers to distribute IP footprint.
 */
public class SocksProxyConfiguration {

    private final boolean enabled;
    private final List<SocksProxy> proxies;
    private final ProxySelectionStrategy strategy;

    public SocksProxyConfiguration() {
        this.enabled = Boolean.parseBoolean(System.getProperty("crawler.socksProxy.enabled", "false"));
        this.strategy = ProxySelectionStrategy.valueOf(
                System.getProperty("crawler.socksProxy.strategy", "ROUND_ROBIN")
        );
        this.proxies = parseProxies();
    }

    private List<SocksProxy> parseProxies() {
        String proxyList = System.getProperty("crawler.socksProxy.list", "");
        if (proxyList.isEmpty()) {
            return List.of();
        }

        return Arrays.stream(proxyList.split(","))
                .map(String::trim)
                .filter(s -> !s.isEmpty())
                .map(this::parseProxy)
                .filter(Objects::nonNull)
                .collect(Collectors.toList());
    }

    private SocksProxy parseProxy(String proxyString) {
        try {
            // Expected format: "host:port" or "host:port:username:password"
            String[] parts = proxyString.split(":");
            if (parts.length < 2) {
                return null;
            }

            String host = parts[0];
            int port = Integer.parseInt(parts[1]);

            if (parts.length >= 4) {
                String username = parts[2];
                String password = parts[3];
                return new SocksProxy(host, port, username, password);
            } else {
                return new SocksProxy(host, port);
            }
        } catch (Exception e) {
            return null;
        }
    }

    public boolean isEnabled() {
        return enabled && !proxies.isEmpty();
    }

    public List<SocksProxy> getProxies() {
        return proxies;
    }

    public ProxySelectionStrategy getStrategy() {
        return strategy;
    }

    public enum ProxySelectionStrategy {
        ROUND_ROBIN,
        RANDOM
    }

    public static class SocksProxy {
        private final String host;
        private final int port;
        private final String username;
        private final String password;

        public SocksProxy(String host, int port) {
            this(host, port, null, null);
        }

        public SocksProxy(String host, int port, String username, String password) {
            this.host = host;
            this.port = port;
            this.username = username;
            this.password = password;
        }

        public String getHost() {
            return host;
        }

        public int getPort() {
            return port;
        }

        public String getUsername() {
            return username;
        }

        public String getPassword() {
            return password;
        }

        public boolean hasAuthentication() {
            return username != null && password != null;
        }

        @Override
        public String toString() {
            if (hasAuthentication()) {
                return String.format("%s:%d (auth: %s)", host, port, username);
            } else {
                return String.format("%s:%d", host, port);
            }
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if (o == null || getClass() != o.getClass()) return false;
            SocksProxy that = (SocksProxy) o;
            return port == that.port &&
                    Objects.equals(host, that.host) &&
                    Objects.equals(username, that.username) &&
                    Objects.equals(password, that.password);
        }

        @Override
        public int hashCode() {
            return Objects.hash(host, port, username, password);
        }
    }
}
New file: nu/marginalia/proxy/SocksProxyManager.java
@@ -0,0 +1,79 @@
package nu.marginalia.proxy;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nonnull;
import java.util.List;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Manages SOCKS proxy selection and rotation for crawler requests.
 */
public class SocksProxyManager {
    private static final Logger logger = LoggerFactory.getLogger(SocksProxyManager.class);

    private final SocksProxyConfiguration config;
    private final AtomicInteger roundRobinIndex = new AtomicInteger(0);

    public SocksProxyManager(SocksProxyConfiguration config) {
        this.config = config;

        if (config.isEnabled()) {
            logger.info("SOCKS proxy support enabled with {} proxies using {} strategy",
                    config.getProxies().size(), config.getStrategy());
            for (SocksProxyConfiguration.SocksProxy proxy : config.getProxies()) {
                logger.info("  - {}", proxy);
            }
        } else {
            logger.info("SOCKS proxy support disabled");
        }
    }

    /**
     * Selects the next proxy to use based on the configured strategy.
     */
    @Nonnull
    public SocksProxyConfiguration.SocksProxy selectProxy() {
        if (!config.isEnabled()) {
            throw new IllegalStateException("Proxies not configured");
        }

        List<SocksProxyConfiguration.SocksProxy> proxies = config.getProxies();
        if (proxies.isEmpty()) {
            throw new IllegalStateException("Proxies not configured");
        }

        SocksProxyConfiguration.SocksProxy selectedProxy;
        switch (config.getStrategy()) {
            case ROUND_ROBIN:
                int index = roundRobinIndex.getAndIncrement() % proxies.size();
                selectedProxy = proxies.get(index);
                break;
            case RANDOM:
                int randomIndex = ThreadLocalRandom.current().nextInt(proxies.size());
                selectedProxy = proxies.get(randomIndex);
                break;
            default:
                selectedProxy = proxies.get(0);
                break;
        }

        return selectedProxy;
    }

    /**
     * Gets the current proxy configuration.
     */
    public SocksProxyConfiguration getConfiguration() {
        return config;
    }

    /**
     * Checks if proxy support is enabled and proxies are available.
     */
    public boolean isProxyEnabled() {
        return config.isEnabled() && !config.getProxies().isEmpty();
    }
}
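For reference, configuring and selecting a proxy looks roughly like this. The property names and the host:port[:user:pass] list format come straight from the parser above; the addresses themselves are invented:

```java
// Sketch only: exercises the two classes above with made-up proxy addresses.
System.setProperty("crawler.socksProxy.enabled", "true");
System.setProperty("crawler.socksProxy.list", "10.0.0.1:1080,10.0.0.2:1080:user:secret");
System.setProperty("crawler.socksProxy.strategy", "ROUND_ROBIN");

var manager = new SocksProxyManager(new SocksProxyConfiguration());
if (manager.isProxyEnabled()) {
    // ROUND_ROBIN cycles 10.0.0.1:1080, 10.0.0.2:1080, 10.0.0.1:1080, ...
    SocksProxyConfiguration.SocksProxy next = manager.selectProxy();
}
```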
(log4j2 configuration file)

@@ -16,7 +16,7 @@
             <MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
         </Filters>
     </Console>
-    <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
+    <RollingFile name="LogToFileService" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
                  ignoreExceptions="false">
         <JSONLayout compact="true" eventEol="true" properties="true" stacktraceAsString="true" includeTimeMillis="true"/>
         <Filters>

@@ -28,7 +28,7 @@
         </Filters>
         <SizeBasedTriggeringPolicy size="10MB" />
     </RollingFile>
-    <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
+    <RollingFile name="LogToFileCrawler" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
                  ignoreExceptions="false">
         <PatternLayout>
             <Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>

@@ -38,7 +38,7 @@
             <MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
         </Filters>
     </RollingFile>
-    <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/converter-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/converter-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
+    <RollingFile name="LogToFileConverter" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/converter-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/converter-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
                  ignoreExceptions="false">
         <PatternLayout>
             <Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>

@@ -56,7 +56,9 @@
         <Root level="info">
             <AppenderRef ref="Console"/>
             <AppenderRef ref="ProcessConsole"/>
-            <AppenderRef ref="LogToFile"/>
+            <AppenderRef ref="LogToFileService"/>
+            <AppenderRef ref="LogToFileCrawler"/>
+            <AppenderRef ref="LogToFileConverter"/>
         </Root>
     </Loggers>
 </Configuration>
(second log4j2 configuration file)

@@ -50,7 +50,7 @@
             <MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
         </Filters>
     </Console>
-    <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
+    <RollingFile name="LogToFileService" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
                  ignoreExceptions="false">
         <PatternLayout>
             <Pattern>%-5level %d{yyyy-MM-dd HH:mm:ss,SSS} %-20t %-20c{1}: %msg{nolookups}%n</Pattern>

@@ -64,7 +64,7 @@
             <MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
         </Filters>
     </RollingFile>
-    <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
+    <RollingFile name="LogToFileCrawler" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
                  ignoreExceptions="false">
         <PatternLayout>
             <Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>

@@ -74,7 +74,7 @@
             <MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
         </Filters>
     </RollingFile>
-    <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/converter-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/converter-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
+    <RollingFile name="LogToFileConverter" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/converter-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/converter-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
                  ignoreExceptions="false">
         <PatternLayout>
             <Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>

@@ -95,7 +95,9 @@
             <AppenderRef ref="ConsoleError"/>
             <AppenderRef ref="ConsoleFatal"/>
             <AppenderRef ref="ProcessConsole"/>
-            <AppenderRef ref="LogToFile"/>
+            <AppenderRef ref="LogToFileService"/>
+            <AppenderRef ref="LogToFileConverter"/>
+            <AppenderRef ref="LogToFileCrawler"/>
         </Root>
     </Loggers>
 </Configuration>
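The renames matter here: Log4j2 appender names must be unique within a configuration, and all three RollingFile appenders previously shared the name "LogToFile", so only one of them could actually be resolved from an AppenderRef in the Root logger. Distinct Service/Crawler/Converter names let all three be attached.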
DocumentDbReader.java:

@@ -6,7 +6,6 @@ import com.google.inject.name.Named;
 import gnu.trove.list.TLongList;
 import nu.marginalia.linkdb.model.DocdbUrlDetail;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.id.UrlIdCodec;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -14,7 +13,6 @@ import java.io.IOException;
 import java.net.URISyntaxException;
 import java.nio.file.Files;
 import java.nio.file.Path;
-
 import java.nio.file.StandardCopyOption;
 import java.sql.Connection;
 import java.sql.DriverManager;

@@ -104,7 +102,7 @@ public class DocumentDbReader {
     }

         try (var stmt = connection.prepareStatement("""
-                SELECT ID, URL, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR
+                SELECT ID, URL, TITLE, DESCRIPTION, LANGUAGE, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR
                 FROM DOCUMENT WHERE ID = ?
                 """)) {
             for (int i = 0; i < ids.size(); i++) {

@@ -118,6 +116,7 @@ public class DocumentDbReader {
                         url,
                         rs.getString("TITLE"),
                         rs.getString("DESCRIPTION"),
+                        rs.getString("LANGUAGE"),
                         rs.getDouble("QUALITY"),
                         rs.getString("FORMAT"),
                         rs.getInt("FEATURES"),
DocumentDbWriter.java:

@@ -41,8 +41,8 @@ public class DocumentDbWriter {
     public void add(List<DocdbUrlDetail> docdbUrlDetail) throws SQLException {

         try (var stmt = connection.prepareStatement("""
-                INSERT OR IGNORE INTO DOCUMENT(ID, URL, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR)
-                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                INSERT OR IGNORE INTO DOCUMENT(ID, URL, TITLE, DESCRIPTION, LANGUAGE, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                 """)) {

             int i = 0;

@@ -54,15 +54,16 @@ public class DocumentDbWriter {

             stmt.setString(3, document.title());
             stmt.setString(4, document.description());
-            stmt.setInt(5, document.wordsTotal());
-            stmt.setString(6, document.format());
-            stmt.setInt(7, document.features());
-            stmt.setLong(8, document.dataHash());
-            stmt.setDouble(9, document.urlQuality());
+            stmt.setString(5, document.language());
+            stmt.setInt(6, document.wordsTotal());
+            stmt.setString(7, document.format());
+            stmt.setInt(8, document.features());
+            stmt.setLong(9, document.dataHash());
+            stmt.setDouble(10, document.urlQuality());
             if (document.pubYear() == null) {
-                stmt.setInt(10, 0);
+                stmt.setInt(11, 0);
             } else {
-                stmt.setInt(10, document.pubYear());
+                stmt.setInt(11, document.pubYear());
             }

             stmt.addBatch();
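Note that inserting LANGUAGE as the fifth column shifts every subsequent parameter index up by one; the reader's SELECT above, this INSERT, the schema, and the DocdbUrlDetail record below all have to stay in lockstep on the new column order.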
DocdbUrlDetail.java:

@@ -6,6 +6,7 @@ public record DocdbUrlDetail(long urlId,
                              EdgeUrl url,
                              String title,
                              String description,
+                             String language,
                              double urlQuality,
                              String format,
                              int features,
(docdb SQL schema)

@@ -6,6 +6,7 @@ CREATE TABLE DOCUMENT (
     STATE INT,
     TITLE TEXT NOT NULL,
     DESCRIPTION TEXT NOT NULL,
+    LANGUAGE TEXT NOT NULL,

     WORDS_TOTAL INTEGER NOT NULL,
     FORMAT TEXT NOT NULL,
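If an existing docdb needed to pick up the new NOT NULL column in place rather than being rebuilt, the equivalent one-off migration would be something along these lines (hypothetical; not part of this changeset):

```sql
-- SQLite requires a default when adding a NOT NULL column to a populated table
ALTER TABLE DOCUMENT ADD COLUMN LANGUAGE TEXT NOT NULL DEFAULT 'en';
```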
DocumentDbWriterTest.java:

@@ -23,6 +23,7 @@ public class DocumentDbWriterTest {
                 new nu.marginalia.model.EdgeUrl("http", new EdgeDomain("example.com"), null, "/", null),
                 "Test",
                 "This is a test",
+                "en",
                 -4.,
                 "XHTML",
                 5,
EdgeDomain.java:

@@ -1,13 +1,12 @@
 package nu.marginalia.model;

 import javax.annotation.Nonnull;
-import java.io.Serializable;
 import java.util.Objects;
 import java.util.Optional;
 import java.util.function.Predicate;
 import java.util.regex.Pattern;

-public class EdgeDomain implements Serializable {
+public class EdgeDomain {

     @Nonnull
     public final String subDomain;
EdgeUrl.java:

@@ -4,13 +4,12 @@ import nu.marginalia.util.QueryParams;
 import org.apache.commons.lang3.StringUtils;

 import javax.annotation.Nullable;
-import java.io.Serializable;
 import java.net.*;
 import java.nio.charset.StandardCharsets;
 import java.util.Objects;
 import java.util.Optional;

-public class EdgeUrl implements Serializable {
+public class EdgeUrl {
     public final String proto;
     public final EdgeDomain domain;
     public final Integer port;
HtmlFeature.java:

@@ -5,13 +5,15 @@ import java.util.Collection;
 public enum HtmlFeature {
     // Note, the first 32 of these features are bit encoded in the database
     // so be sure to keep anything that's potentially important toward the top
-    // of the list
+    // of the list; but adding new values will shift the encoded values and break
+    // binary compatibility! Scroll down for a marker where you should add new values
+    // if they need to be accessible from IndexResultScoreCalculator!

     MEDIA( "special:media"),
     JS("special:scripts"),
     AFFILIATE_LINK( "special:affiliate"),
     TRACKING("special:tracking"),
-    TRACKING_ADTECH("special:ads"), // We'll call this ads for now
+    TRACKING_ADTECH("special:adtech"),

     KEBAB_CASE_URL("special:kcurl"), // https://www.example.com/urls-that-look-like-this/
     LONG_URL("special:longurl"),

@@ -30,6 +32,15 @@ public enum HtmlFeature {

     PDF("format:pdf"),

+    POPOVER("special:popover"),
+    CONSENT("special:consent"),
+    SHORT_DOCUMENT("special:shorty"),
+    THIRD_PARTY_REQUESTS("special:3pr"),
+
+    // Here! It is generally safe to add additional values here without
+    // disrupting the encoded values used by the DocumentValuator
+    // class in the index!
+
     /** For fingerprinting and ranking */
     OPENGRAPH("special:opengraph"),
     OPENGRAPH_IMAGE("special:opengraph:image"),

@@ -67,6 +78,7 @@ public enum HtmlFeature {

     S3_FEATURE("special:s3"),

+    MISSING_DOM_SAMPLE("special:nosample"),
     UNKNOWN("special:uncategorized");

@@ -83,16 +95,24 @@ public enum HtmlFeature {
     public static int encode(Collection<HtmlFeature> featuresAll) {
         int ret = 0;
         for (var feature : featuresAll) {
+            if (feature.ordinal() >= 32) continue;
+
             ret |= (1 << (feature.ordinal()));
         }
         return ret;
     }

     public static boolean hasFeature(int value, HtmlFeature feature) {
-        return (value & (1<< feature.ordinal())) != 0;
+        int ord = feature.ordinal();
+        if (ord >= 32) return false;
+
+        return (value & (1<<ord)) != 0;
     }

     public int getFeatureBit() {
-        return (1<< ordinal());
+        int ord = ordinal();
+        if (ord >= 32) return 0;
+
+        return (1<<ord);
     }
 }
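To make the guard's effect concrete, a small sketch of the bitmask behavior after this change (the feature names are taken from the enum above):

```java
import java.util.EnumSet;

// Only the first 32 ordinals participate in the 32-bit mask stored in the database.
int mask = HtmlFeature.encode(EnumSet.of(HtmlFeature.JS, HtmlFeature.TRACKING));

HtmlFeature.hasFeature(mask, HtmlFeature.JS);       // true: ordinal < 32, bit is set
HtmlFeature.hasFeature(mask, HtmlFeature.TRACKING); // true
// Previously a value past ordinal 31 would alias a low bit, since Java masks
// int shift counts to 5 bits (1 << 33 == 1 << 1). With the guards it now
// encodes to no bit at all: encode() skips it, hasFeature() returns false,
// and getFeatureBit() returns 0.
```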
DocumentMetadata.java:

@@ -2,7 +2,6 @@ package nu.marginalia.model.idx;

 import nu.marginalia.model.crawl.PubDate;

-import java.io.Serializable;
 import java.util.EnumSet;
 import java.util.Set;

@@ -28,7 +27,6 @@ public record DocumentMetadata(int avgSentLength,
                                int sets,
                                int quality,
                                byte flags)
-        implements Serializable
 {

     public String toString() {
ServiceId.java:

@@ -7,7 +7,6 @@ public enum ServiceId {
     Search("search-service"),
     Index("index-service"),
     Query("query-service"),
-    Executor("executor-service"),

     Control("control-service"),
GrpcChannelPoolFactory.java:

@@ -13,6 +13,7 @@ import nu.marginalia.service.discovery.property.ServicePartition;
 import nu.marginalia.util.NamedExecutorFactory;

 import java.util.concurrent.Executor;
+import java.util.concurrent.Executors;
 import java.util.function.Function;

 @Singleton

@@ -20,10 +21,15 @@ public class GrpcChannelPoolFactory {

     private final NodeConfigurationWatcher nodeConfigurationWatcher;
     private final ServiceRegistryIf serviceRegistryIf;
-    private static final Executor executor = NamedExecutorFactory.createFixed("gRPC-Channel-Pool",
-            Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
-    private static final Executor offloadExecutor = NamedExecutorFactory.createFixed("gRPC-Offload-Pool",
-            Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
+
+    private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
+
+    private static final Executor executor = useLoom
+            ? Executors.newVirtualThreadPerTaskExecutor()
+            : NamedExecutorFactory.createFixed("gRPC-Channel-Pool", Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
+    private static final Executor offloadExecutor = useLoom
+            ? Executors.newVirtualThreadPerTaskExecutor()
+            : NamedExecutorFactory.createFixed("gRPC-Offload-Pool", Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));

     @Inject
     public GrpcChannelPoolFactory(NodeConfigurationWatcher nodeConfigurationWatcher,
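`Boolean.getBoolean` reads a JVM system property, so these virtual-thread (Project Loom) executors are opted into with a single `-Dsystem.experimentalUseLoom=true` at launch; the default remains the fixed platform-thread pools. The same flag gates the executors in GrpcServer and DomainInfoClient further down.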
GrpcSingleNodeChannelPool.java:

@@ -2,6 +2,7 @@ package nu.marginalia.service.client;

 import com.google.common.collect.Sets;
 import io.grpc.ManagedChannel;
+import io.grpc.StatusRuntimeException;
 import nu.marginalia.service.discovery.ServiceRegistryIf;
 import nu.marginalia.service.discovery.monitor.ServiceChangeMonitor;
 import nu.marginalia.service.discovery.property.PartitionTraits;

@@ -206,6 +207,11 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
         }

         for (var e : exceptions) {
+            if (e instanceof StatusRuntimeException se) {
+                throw se; // Re-throw SRE as-is
+            }
+
+            // If there are other exceptions, log them
             logger.error(grpcMarker, "Failed to call service {}", serviceKey, e);
         }
GrpcServer.java:

@@ -1,9 +1,9 @@
 package nu.marginalia.service.server;

 import io.grpc.Server;
-import io.grpc.netty.shaded.io.grpc.netty.NettyServerBuilder;
-import io.grpc.netty.shaded.io.netty.channel.nio.NioEventLoopGroup;
-import io.grpc.netty.shaded.io.netty.channel.socket.nio.NioServerSocketChannel;
+import io.grpc.netty.NettyServerBuilder;
+import io.netty.channel.nio.NioEventLoopGroup;
+import io.netty.channel.socket.nio.NioServerSocketChannel;
 import nu.marginalia.service.discovery.ServiceRegistryIf;
 import nu.marginalia.service.discovery.property.ServiceKey;
 import nu.marginalia.service.discovery.property.ServicePartition;

@@ -13,9 +13,14 @@ import nu.marginalia.util.NamedExecutorFactory;

 import java.io.IOException;
 import java.net.InetSocketAddress;
 import java.util.List;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;

 public class GrpcServer {
     private final Server server;

+    private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
+
     public GrpcServer(ServiceConfiguration config,
                       ServiceRegistryIf serviceRegistry,
                       ServicePartition partition,

@@ -26,13 +31,19 @@ public class GrpcServer {
         int nThreads = Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 16);

         // Start the gRPC server
+
+        ExecutorService workExecutor = useLoom ?
+                Executors.newVirtualThreadPerTaskExecutor() :
+                NamedExecutorFactory.createFixed("nettyExecutor", nThreads);
+
         var grpcServerBuilder = NettyServerBuilder.forAddress(new InetSocketAddress(config.bindAddress(), port))
-                .executor(NamedExecutorFactory.createFixed("nettyExecutor", nThreads))
+                .executor(workExecutor)
                 .workerEventLoopGroup(new NioEventLoopGroup(nThreads, NamedExecutorFactory.createFixed("Worker-ELG", nThreads)))
                 .bossEventLoopGroup(new NioEventLoopGroup(nThreads, NamedExecutorFactory.createFixed("Boss-ELG", nThreads)))
                 .channelType(NioServerSocketChannel.class);

         for (var grpcService : grpcServices) {
+
             if (!grpcService.shouldRegisterService()) {
                 continue;
             }
JoobyService.java:

@@ -125,8 +125,7 @@ public class JoobyService {
         // Set a cap on the number of worker threads, as Jooby's default value does not seem to consider
         // multi-tenant servers with high thread counts, and spins up an exorbitant number of threads in that
         // scenario
-        options.setWorkerThreads(Math.min(128, options.getWorkerThreads()));
-
+        options.setWorkerThreads(Math.min(16, options.getWorkerThreads()));

         jooby.setServerOptions(options);
NodeStatusWatcher.java:

@@ -66,7 +66,7 @@ public class NodeStatusWatcher {
         fileStorageService.createStorageBase("Crawl Data", Path.of("/storage"), nodeId, FileStorageBaseType.STORAGE);
         fileStorageService.createStorageBase("Work Area", Path.of("/work"), nodeId, FileStorageBaseType.WORK);

-        persistence.sendNewMessage("executor-service:"+nodeId,
+        persistence.sendNewMessage("index-service:"+nodeId,
                 null,
                 null,
                 "FIRST-BOOT",
ExecutorClient.java:

@@ -189,7 +189,7 @@ public class ExecutorClient {
         String uriPath = "/transfer/file/" + fileStorage.id();
         String uriQuery = "path=" + URLEncoder.encode(path, StandardCharsets.UTF_8);

-        var endpoints = registry.getEndpoints(ServiceKey.forRest(ServiceId.Executor, fileStorage.node()));
+        var endpoints = registry.getEndpoints(ServiceKey.forRest(ServiceId.Index, fileStorage.node()));
         if (endpoints.isEmpty()) {
             throw new RuntimeException("No endpoints for node " + fileStorage.node());
         }
(module build.gradle)

@@ -22,7 +22,6 @@ dependencies {
     implementation project(':code:processes:ping-process')
     implementation project(':code:processes:new-domain-process')
     implementation project(':code:processes:converting-process')
-    implementation project(':code:processes:index-constructor-process')

     implementation project(':code:common:config')
     implementation project(':code:common:model')

@@ -34,7 +33,7 @@ dependencies {
     implementation project(':third-party:commons-codec')

     implementation project(':code:libraries:message-queue')
-    implementation project(':code:libraries:term-frequency-dict')
+    implementation project(':code:functions:language-processing')

     implementation project(':code:functions:link-graph:api')
     implementation project(':code:functions:live-capture:api')
ExecutionInit.java:

@@ -2,9 +2,8 @@ package nu.marginalia.execution;

 import com.google.inject.Inject;
 import com.google.inject.Singleton;
-
-import nu.marginalia.actor.ExecutorActorControlService;
 import nu.marginalia.actor.ExecutorActor;
+import nu.marginalia.actor.ExecutorActorControlService;

 @Singleton
 public class ExecutionInit {

@@ -22,5 +21,8 @@ public class ExecutionInit {
         actorControlService.start(ExecutorActor.PROC_CRAWLER_SPAWNER);
         actorControlService.start(ExecutorActor.PROC_INDEX_CONSTRUCTOR_SPAWNER);
         actorControlService.start(ExecutorActor.PROC_LOADER_SPAWNER);
+        actorControlService.start(ExecutorActor.PROC_EXPORT_TASKS_SPAWNER);
+        actorControlService.stop(ExecutorActor.PROC_NDP_SPAWNER);
+        actorControlService.stop(ExecutorActor.PROC_PING_SPAWNER);
     }
 }
ExecutorCrawlGrpcService.java:

@@ -1,6 +1,7 @@
 package nu.marginalia.execution;

 import com.google.inject.Inject;
+import io.grpc.Status;
 import io.grpc.stub.StreamObserver;
 import nu.marginalia.actor.ExecutorActor;
 import nu.marginalia.actor.ExecutorActorControlService;

@@ -36,7 +37,7 @@ public class ExecutorCrawlGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }

@@ -52,7 +53,7 @@ public class ExecutorCrawlGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }

@@ -66,7 +67,7 @@ public class ExecutorCrawlGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }

@@ -80,7 +81,7 @@ public class ExecutorCrawlGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }

@@ -98,7 +99,7 @@ public class ExecutorCrawlGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }
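The switch from passing raw exceptions to `Status.INTERNAL.withCause(e).asRuntimeException()` matters because gRPC only transmits StatusRuntimeException cleanly; an arbitrary exception surfaces at the client as an opaque UNKNOWN status. A hedged client-side sketch (the stub and method name are illustrative, not from this changeset):

```java
import io.grpc.Status;
import io.grpc.StatusRuntimeException;

try {
    stub.triggerCrawl(request); // hypothetical blocking-stub call
} catch (StatusRuntimeException e) {
    if (e.getStatus().getCode() == Status.Code.INTERNAL) {
        // Server-side failure: the service wrapped the cause with
        // Status.INTERNAL, so the client can branch on the code here.
        // (The cause itself is not transmitted over the wire.)
    }
}
```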
ExecutorExportGrpcService.java:

@@ -2,6 +2,7 @@ package nu.marginalia.execution;

 import com.google.inject.Inject;
 import com.google.inject.Singleton;
+import io.grpc.Status;
 import io.grpc.stub.StreamObserver;
 import nu.marginalia.actor.ExecutorActor;
 import nu.marginalia.actor.ExecutorActorControlService;

@@ -38,7 +39,7 @@ public class ExecutorExportGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }

@@ -57,7 +58,7 @@ public class ExecutorExportGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }

@@ -73,7 +74,7 @@ public class ExecutorExportGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }

@@ -87,7 +88,7 @@ public class ExecutorExportGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }

@@ -99,7 +100,7 @@ public class ExecutorExportGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }

@@ -114,14 +115,14 @@ public class ExecutorExportGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }

 @Override
 public void exportAllAtags(Empty request, StreamObserver<Empty> responseObserver) {
     if (serviceConfiguration.node() != 1) {
-        responseObserver.onError(new IllegalArgumentException("Export all atags is only available on node 1"));
+        responseObserver.onError(Status.UNAVAILABLE.withDescription("Export all atags is only available on node 1").asRuntimeException());
     }
     try {
         actorControlService.startFrom(ExecutorActor.PREC_EXPORT_ALL,

@@ -131,7 +132,7 @@ public class ExecutorExportGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }

@@ -145,7 +146,7 @@ public class ExecutorExportGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }

@@ -159,7 +160,7 @@ public class ExecutorExportGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }
}
ExecutorGrpcService.java:

@@ -1,6 +1,7 @@
 package nu.marginalia.execution;

 import com.google.inject.Inject;
+import io.grpc.Status;
 import io.grpc.stub.StreamObserver;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.actor.ActorApi;

@@ -58,7 +59,7 @@ public class ExecutorGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }

@@ -70,7 +71,7 @@ public class ExecutorGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }

@@ -82,7 +83,7 @@ public class ExecutorGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }

@@ -96,7 +97,7 @@ public class ExecutorGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }

@@ -112,7 +113,7 @@ public class ExecutorGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }

@@ -128,7 +129,7 @@ public class ExecutorGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }

@@ -203,7 +204,7 @@ public class ExecutorGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }

@@ -229,7 +230,7 @@ public class ExecutorGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }

@@ -276,7 +277,7 @@ public class ExecutorGrpcService
     }
     catch (Exception e) {
         logger.error("Failed to update nsfw filters", e);
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }
}
ExecutorSideloadGrpcService.java:

@@ -1,6 +1,7 @@
 package nu.marginalia.execution;

 import com.google.inject.Inject;
+import io.grpc.Status;
 import io.grpc.stub.StreamObserver;
 import nu.marginalia.actor.ExecutorActor;
 import nu.marginalia.actor.ExecutorActorControlService;

@@ -33,7 +34,7 @@ public class ExecutorSideloadGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }

@@ -48,7 +49,7 @@ public class ExecutorSideloadGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }

@@ -63,7 +64,7 @@ public class ExecutorSideloadGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }

@@ -78,7 +79,7 @@ public class ExecutorSideloadGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }

@@ -93,7 +94,7 @@ public class ExecutorSideloadGrpcService
         responseObserver.onCompleted();
     }
     catch (Exception e) {
-        responseObserver.onError(e);
+        responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
     }
 }
ProcessSpawnerService.java:

@@ -5,7 +5,6 @@ import com.google.inject.Singleton;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.converting.ConverterMain;
 import nu.marginalia.crawl.CrawlerMain;
-import nu.marginalia.index.IndexConstructorMain;
 import nu.marginalia.livecrawler.LiveCrawlerMain;
 import nu.marginalia.loading.LoaderMain;
 import nu.marginalia.ndp.NdpMain;

@@ -57,7 +56,7 @@ public class ProcessSpawnerService {
     LIVE_CRAWLER(LiveCrawlerMain.class),
     CONVERTER(ConverterMain.class),
     LOADER(LoaderMain.class),
-    INDEX_CONSTRUCTOR(IndexConstructorMain.class),
+    INDEX_CONSTRUCTOR("nu.marginalia.index.IndexConstructorMain"),
     NDP(NdpMain.class),
     EXPORT_TASKS(ExportTasksMain.class),
     ;

@@ -66,6 +65,9 @@ public class ProcessSpawnerService {
     ProcessId(Class<? extends ProcessMainClass> mainClass) {
         this.mainClass = mainClass.getName();
     }
+    ProcessId(String mainClassFullName) {
+        this.mainClass = mainClassFullName;
+    }

     List<String> envOpts() {
         String variable = switch (this) {

@@ -118,6 +120,17 @@ public class ProcessSpawnerService {
             args.add("-Dsystem.serviceNode=" + System.getProperty("system.serviceNode"));
         }

+        // Add SOCKS proxy properties for crawler processes
+        if (System.getProperty("crawler.socksProxy.enabled") != null) {
+            args.add("-Dcrawler.socksProxy.enabled=" + System.getProperty("crawler.socksProxy.enabled"));
+        }
+        if (System.getProperty("crawler.socksProxy.list") != null) {
+            args.add("-Dcrawler.socksProxy.list=" + System.getProperty("crawler.socksProxy.list"));
+        }
+        if (System.getProperty("crawler.socksProxy.strategy") != null) {
+            args.add("-Dcrawler.socksProxy.strategy=" + System.getProperty("crawler.socksProxy.strategy"));
+        }
+
         if (Boolean.getBoolean("system.profile")) {
             // add jfr options
             args.add("-XX:+FlightRecorder");
BackupService.java:

@@ -5,6 +5,7 @@ import com.github.luben.zstd.ZstdOutputStream;
 import com.google.inject.Inject;
 import nu.marginalia.IndexLocations;
 import nu.marginalia.index.journal.IndexJournal;
+import nu.marginalia.language.config.LanguageConfiguration;
 import nu.marginalia.linkdb.LinkdbFileNames;
 import nu.marginalia.service.control.ServiceHeartbeat;
 import nu.marginalia.storage.FileStorageService;

@@ -13,18 +14,18 @@ import nu.marginalia.storage.model.FileStorageType;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;

-import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
 import java.time.LocalDateTime;
 import java.util.List;
-import java.util.Optional;
+import java.util.Map;

 public class BackupService {

     private final FileStorageService storageService;
+    private final LanguageConfiguration languageConfiguration;
     private final ServiceHeartbeat serviceHeartbeat;

     public enum BackupHeartbeatSteps {

@@ -36,8 +37,10 @@ public class BackupService {

     @Inject
     public BackupService(FileStorageService storageService,
+                         LanguageConfiguration languageConfiguration,
                          ServiceHeartbeat serviceHeartbeat) {
         this.storageService = storageService;
+        this.languageConfiguration = languageConfiguration;
         this.serviceHeartbeat = serviceHeartbeat;
     }

@@ -98,22 +101,25 @@ public class BackupService {
     }

-    private void backupJournal(Path inputStorage, Path backupStorage) throws IOException
-    {
-        Optional<IndexJournal> journal = IndexJournal.findJournal(inputStorage);
-        if (journal.isEmpty()) {
-            throw new FileNotFoundException("No journal found in input storage");
-        }
-
-        FileUtils.copyDirectory(journal.get().journalDir().toFile(), backupStorage.resolve(journal.get().journalDir().getFileName()).toFile());
+    private void backupJournal(Path inputStorage, Path backupStorage) throws IOException {
+        Map<String, IndexJournal> journals = IndexJournal.findJournals(inputStorage, languageConfiguration.languages());
+        for (IndexJournal journal : journals.values()) {
+            FileUtils.copyDirectory(journal.journalDir().toFile(), backupStorage.resolve(journal.journalDir().getFileName()).toFile());
+        }
     }

     private void restoreJournal(Path destStorage, Path backupStorage) throws IOException {
-        Optional<IndexJournal> journal = IndexJournal.findJournal(backupStorage);
-        if (journal.isEmpty()) {
-            throw new FileNotFoundException("No journal found in backup");
-        }
-        FileUtils.copyDirectory(backupStorage.resolve(journal.get().journalDir().getFileName()).toFile(), destStorage.toFile());
+        Map<String, IndexJournal> journals = IndexJournal.findJournals(backupStorage, languageConfiguration.languages());
+        for (IndexJournal journal : journals.values()) {
+            var journalFileName = journal.journalDir().getFileName();
+
+            // Ensure we delete any previous journal junk
+            if (Files.exists(destStorage.resolve(journalFileName))) {
+                FileUtils.deleteDirectory(destStorage.resolve(journalFileName).toFile());
+            }
+
+            FileUtils.copyDirectory(backupStorage.resolve(journalFileName).toFile(), destStorage.toFile());
+        }
     }

     private void backupFileCompressed(String fileName, Path inputStorage, Path backupStorage) throws IOException
@@ -1,4 +1,4 @@
-package nu.marginalia.executor;
+package nu.marginalia.svc;

 import com.google.inject.Inject;
 import nu.marginalia.storage.FileStorageService;
(execution subsystem README)

@@ -1,5 +1,5 @@
 The execution subsystem is responsible for the execution of long running tasks on each
-index node. It lives in the [executor-service](../services-core/executor-service) module.
+index node. It lives in the [index-service](../services-core/index-service) module.

 It accomplishes this using the [message queue and actor library](../libraries/message-queue/),
 which permits program state to survive crashes and reboots.
@@ -1,4 +1,4 @@
-package nu.marginalia.executor;
+package nu.marginalia.svc;

 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorage;
DomainInfoClient.java:

@@ -2,6 +2,8 @@ package nu.marginalia.api.domains;

 import com.google.inject.Inject;
 import com.google.inject.Singleton;
+import nu.marginalia.api.domains.model.DomainInformation;
+import nu.marginalia.api.domains.model.SimilarDomain;
 import nu.marginalia.service.client.GrpcChannelPoolFactory;
 import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
 import nu.marginalia.service.discovery.property.ServiceKey;

@@ -10,16 +12,19 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.util.List;
-import java.util.concurrent.*;
-
-import nu.marginalia.api.domains.model.*;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;

 @Singleton
 public class DomainInfoClient {
     private static final Logger logger = LoggerFactory.getLogger(DomainInfoClient.class);

     private final GrpcSingleNodeChannelPool<DomainInfoAPIGrpc.DomainInfoAPIBlockingStub> channelPool;
-    private final ExecutorService executor = Executors.newWorkStealingPool(8);
+
+    private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
+    private static final ExecutorService executor = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newWorkStealingPool(8);

     @Inject
     public DomainInfoClient(GrpcChannelPoolFactory factory) {
(language-processing module build.gradle)

@@ -1,8 +1,7 @@
 plugins {
     id 'java'
-
-
     id 'jvm-test-suite'
+    id 'gg.jte.gradle' version '3.1.15'
 }

 java {

@@ -14,18 +13,18 @@ java {
 apply from: "$rootProject.projectDir/srcsets.gradle"

 dependencies {
-    implementation project(':code:common:config')
-    implementation libs.bundles.slf4j
     implementation project(':third-party:rdrpostagger')
     implementation project(':third-party:porterstemmer')
     implementation project(':third-party:commons-codec')
     implementation project(':third-party:openzim')
     implementation project(':code:common:model')
+    implementation project(':code:common:config')
+    implementation project(':code:common:service')
     implementation project(':code:libraries:easy-lsh')
     implementation project(':code:libraries:array')
     implementation project(':code:libraries:blocking-thread-pool')

+    implementation libs.bundles.slf4j
     implementation project(':code:libraries:coded-sequence')
     implementation libs.notnull
+    implementation libs.bundles.jooby

     implementation libs.guava
     implementation dependencies.create(libs.guice.get()) {

@@ -42,3 +41,9 @@ dependencies {
     testImplementation libs.bundles.junit
     testImplementation libs.mockito
 }
+
+jte {
+    sourceDirectory = file('resources/ltt/jte').toPath()
+    targetDirectory = file('build/classes/jte-precompiled').toPath()
+    generate()
+}
@@ -1,4 +1,4 @@
package nu.marginalia.converting.processor.logic.dom;
package nu.marginalia.dom;

import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;

@@ -1,4 +1,4 @@
package nu.marginalia.converting.processor.logic.dom;
package nu.marginalia.dom;

import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
@@ -16,8 +16,6 @@ public class DocumentKeywordExtractor {

    private final TermFrequencyDict dict;

    private final KeywordExtractor keywordExtractor = new KeywordExtractor();
    private final DocumentPositionMapper positionMapper = new DocumentPositionMapper();

    @Inject
    public DocumentKeywordExtractor(TermFrequencyDict dict) {
@@ -37,35 +35,54 @@ public class DocumentKeywordExtractor {

    public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData dld, LinkTexts linkTexts, EdgeUrl url) {

        var tfIdfCounts = new WordsTfIdfCounts(dict, keywordExtractor, dld);
        if (dld.language().hasPosParsing()) {
            DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();

            var titleKeywords = new TitleKeywords(keywordExtractor, dld);
            var nameLikeKeywords = new NameLikeKeywords(keywordExtractor, dld, 2);
            var subjectLikeKeywords = new SubjectLikeKeywords(keywordExtractor, tfIdfCounts, dld);
            var artifactKeywords = new ArtifactKeywords(dld);
            var urlKeywords = new UrlKeywords(url);
            var artifactKeywords = new ArtifactKeywords(dld);
            var urlKeywords = new UrlKeywords(url);
            var positionMapper = new DocumentPositionMapper();

            var keywordMetadata = KeywordMetadata.builder()
                    .titleKeywords(titleKeywords)
                    .nameLikeKeywords(nameLikeKeywords)
                    .subjectLikeKeywords(subjectLikeKeywords)
                    .urlKeywords(urlKeywords)
                    .build();
            var tfIdfCounts = new WordsTfIdfCounts(dict, dld);

            DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
            var titleKeywords = new TitleKeywords(dld);
            var nameLikeKeywords = new NameLikeKeywords(dld, 2);
            var subjectLikeKeywords = new SubjectLikeKeywords(tfIdfCounts, dld);
            var keywordMetadata = KeywordMetadata.builder()
                    .titleKeywords(titleKeywords)
                    .nameLikeKeywords(nameLikeKeywords)
                    .subjectLikeKeywords(subjectLikeKeywords)
                    .urlKeywords(urlKeywords)
                    .build();

            positionMapper.mapPositionsAndExtractSimpleKeywords(wordsBuilder, keywordMetadata, dld, linkTexts);

            createNGramTermsFromSet(wordsBuilder, keywordMetadata, titleKeywords);
            createNGramTermsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords);
            createNGramTermsFromSet(wordsBuilder, keywordMetadata, nameLikeKeywords);
            positionMapper.mapPositionsAndExtractSimpleKeywords(wordsBuilder, keywordMetadata, dld, linkTexts);

            var importantWords = getImportantWords(tfIdfCounts, nameLikeKeywords, subjectLikeKeywords, wordsBuilder);
            createNGramTermsFromSet(wordsBuilder, keywordMetadata, titleKeywords);
            createNGramTermsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords);
            createNGramTermsFromSet(wordsBuilder, keywordMetadata, nameLikeKeywords);

            wordsBuilder.addImportantWords(importantWords);
            wordsBuilder.addAllSyntheticTerms(artifactKeywords.getWords());
            var importantWords = getImportantWords(tfIdfCounts, nameLikeKeywords, subjectLikeKeywords, wordsBuilder);

            return wordsBuilder;
            wordsBuilder.addImportantWords(importantWords);
            wordsBuilder.addAllSyntheticTerms(artifactKeywords.getWords());

            return wordsBuilder;
        }
        else {
            DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();

            var artifactKeywords = new ArtifactKeywords(dld);
            var urlKeywords = new UrlKeywords(url);
            var positionMapper = new DocumentPositionMapper();

            var keywordMetadata = KeywordMetadata.builder()
                    .urlKeywords(urlKeywords)
                    .build();

            positionMapper.mapPositionsAndExtractSimpleKeywords(wordsBuilder, keywordMetadata, dld, linkTexts);
            wordsBuilder.addAllSyntheticTerms(artifactKeywords.getWords());
            return wordsBuilder;
        }
    }

    private static Collection<String> getImportantWords(WordsTfIdfCounts tfIdfCounts, NameLikeKeywords nameLikeKeywords, SubjectLikeKeywords subjectLikeKeywords, DocumentKeywordsBuilder wordsBuilder) {

@@ -3,7 +3,9 @@ package nu.marginalia.keyword;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.pos.PosPatternCategory;
import nu.marginalia.language.sentence.tag.HtmlTag;

import java.util.ArrayList;
@@ -17,8 +19,6 @@ import static java.lang.Math.sqrt;
 */
public class DocumentPositionMapper {

    private final KeywordExtractor keywordExtractor = new KeywordExtractor();

    public void mapPositionsAndExtractSimpleKeywords(DocumentKeywordsBuilder wordsBuilder,
                                                     KeywordMetadata metadata,
                                                     DocumentLanguageData dld,
@@ -38,12 +38,14 @@ public class DocumentPositionMapper {
    }

    int mapDocumentPositions(DocumentKeywordsBuilder wordsBuilder,
    public int mapDocumentPositions(DocumentKeywordsBuilder wordsBuilder,
                             KeywordMetadata metadata,
                             DocumentLanguageData dld)

    {
        LanguageDefinition languageDefinition = dld.language();

        List<SpanRecorder> spanRecorders = new ArrayList<>();
        for (var htmlTag : HtmlTag.includedTags) {
            if (!htmlTag.exclude) {
@@ -80,7 +82,7 @@ public class DocumentPositionMapper {
            }
        }

        for (var names : keywordExtractor.getProperNames(sent)) {
        for (var names : languageDefinition.matchGrammarPattern(sent, PosPatternCategory.NAME)) {
            WordRep rep = new WordRep(sent, names);
            byte meta = metadata.getMetadataForWord(rep.stemmed);

@@ -161,11 +163,15 @@ public class DocumentPositionMapper {

        int i = 0;

        for (int run = 0; run < 15 && i < s.length(); run++, i++) {
            char c = s.charAt(i);
            if (c >= 'a' && c <= 'z') continue;
            if (c >= 'A' && c <= 'Z') continue;
            if (c >= '0' && c <= '9') continue;
        for (int run = 0; run < 15 && i < s.length(); run++) {
            int cp = s.codePointAt(i);

            if (Character.isAlphabetic(cp) || Character.isDigit(cp)) {
                i += Character.charCount(cp);
                continue;
            }

            break;
        }

@@ -175,17 +181,20 @@ public class DocumentPositionMapper {
        for (int j = 0; j < 8; j++) {
            if (i == s.length()) return true;

            if (wordPartSeparator.indexOf(s.charAt(i)) < 0) {
            if (wordPartSeparator.indexOf(s.codePointAt(i)) < 0) {
                return false;
            }

            i++;

            for (int run = 0; run < 10 && i < s.length(); run++, i++) {
                char c = s.charAt(i);
                if (c >= 'a' && c <= 'z') continue;
                if (c >= 'A' && c <= 'Z') continue;
                if (c >= '0' && c <= '9') continue;
            for (int run = 0; run < 10 && i < s.length(); run++) {
                int cp = s.codePointAt(i);

                if (Character.isAlphabetic(cp) || Character.isDigit(cp)) {
                    i += Character.charCount(cp);
                    continue;
                }

                break;
            }
        }
@@ -193,48 +202,4 @@ public class DocumentPositionMapper {
        return false;
    }

    /** Helper class to record spans of words */
    private static class SpanRecorder {
        private final List<DocumentKeywordsBuilder.DocumentWordSpan> spans = new ArrayList<>();
        private final HtmlTag htmlTag;
        private int start = 0;

        public SpanRecorder(HtmlTag htmlTag) {
            this.htmlTag = htmlTag;
        }

        public void update(DocumentSentence sentence, int pos) {
            assert pos > 0;

            if (sentence.htmlTags.contains(htmlTag)) {
                if (start <= 0) start = pos;
            }
            else if (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY)
            {
                // special case for body tag, we match against no tag on the sentence
                if (start <= 0) start = pos;
            }
            else {
                if (start > 0) {
                    spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
                    start = 0;
                }
            }
        }

        public void endCurrentSpan(int pos) {
            if (start > 0) {
                spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
                start = 0;
            }
        }

        public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) {
            if (start > 0) {
                spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length));
                start = 0;
            }
            return spans;
        }
    }
}
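The charAt-to-codePointAt rewrite above widens the word scanner from the ASCII ranges a-z, A-Z, 0-9 to any alphabetic or numeric code point, while stepping correctly over surrogate pairs. The iteration idiom, as a self-contained JDK-only sketch:

    class CodePointScan {
        // Count the leading letter/digit characters of a string, stepping by
        // whole code points so supplementary characters are never split in half.
        static int leadingWordChars(String s) {
            int i = 0;
            while (i < s.length()) {
                int cp = s.codePointAt(i);
                if (!Character.isAlphabetic(cp) && !Character.isDigit(cp))
                    break;
                i += Character.charCount(cp); // 1 for BMP chars, 2 for surrogate pairs
            }
            return i;
        }
    }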
@@ -6,18 +6,24 @@ import nu.marginalia.keyword.extractors.TitleKeywords;
import nu.marginalia.keyword.extractors.UrlKeywords;
import nu.marginalia.model.idx.WordFlags;

import javax.annotation.Nullable;

public class KeywordMetadata {

    @Nullable
    private final TitleKeywords titleKeywords;
    @Nullable
    private final NameLikeKeywords nameLikeKeywords;
    @Nullable
    private final SubjectLikeKeywords subjectLikeKeywords;
    @Nullable
    private final UrlKeywords urlKeywords;

    public KeywordMetadata(
            TitleKeywords titleKeywords,
            NameLikeKeywords nameLikeKeywords,
            SubjectLikeKeywords subjectLikeKeywords,
            UrlKeywords urlKeywords) {
            @Nullable TitleKeywords titleKeywords,
            @Nullable NameLikeKeywords nameLikeKeywords,
            @Nullable SubjectLikeKeywords subjectLikeKeywords,
            @Nullable UrlKeywords urlKeywords) {
        this.titleKeywords = titleKeywords;
        this.nameLikeKeywords = nameLikeKeywords;
        this.subjectLikeKeywords = subjectLikeKeywords;
@@ -32,23 +38,23 @@ public class KeywordMetadata {

        byte flags = 0;

        if (subjectLikeKeywords.contains(stemmed)) {
        if (subjectLikeKeywords != null && subjectLikeKeywords.contains(stemmed)) {
            flags |= WordFlags.Subjects.asBit();
        }

        if (nameLikeKeywords.contains(stemmed)) {
        if (nameLikeKeywords != null && nameLikeKeywords.contains(stemmed)) {
            flags |= WordFlags.NamesWords.asBit();
        }

        if (titleKeywords.contains(stemmed)) {
        if (titleKeywords != null && titleKeywords.contains(stemmed)) {
            flags |= WordFlags.Title.asBit();
        }

        if (urlKeywords.containsUrl(stemmed)) {
        if (urlKeywords != null && urlKeywords.containsUrl(stemmed)) {
            flags |= WordFlags.UrlPath.asBit();
        }

        if (urlKeywords.containsDomain(stemmed)) {
        if (urlKeywords != null && urlKeywords.containsDomain(stemmed)) {
            flags |= WordFlags.UrlDomain.asBit();
        }
@@ -0,0 +1,52 @@
package nu.marginalia.keyword;

import nu.marginalia.keyword.model.DocumentWordSpan;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.sentence.tag.HtmlTag;

import java.util.ArrayList;
import java.util.List;

/**
 * Helper class to record spans of words
 */
class SpanRecorder {
    private final List<DocumentWordSpan> spans = new ArrayList<>();
    private final HtmlTag htmlTag;
    private int start = 0;

    public SpanRecorder(HtmlTag htmlTag) {
        this.htmlTag = htmlTag;
    }

    public void update(DocumentSentence sentence, int pos) {
        assert pos > 0;

        if (sentence.htmlTags.contains(htmlTag)) {
            if (start <= 0) start = pos;
        } else if (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY) {
            // special case for body tag, we match against no tag on the sentence
            if (start <= 0) start = pos;
        } else {
            if (start > 0) {
                spans.add(new DocumentWordSpan(htmlTag, start, pos));
                start = 0;
            }
        }
    }

    public void endCurrentSpan(int pos) {
        if (start > 0) {
            spans.add(new DocumentWordSpan(htmlTag, start, pos));
            start = 0;
        }
    }

    public List<DocumentWordSpan> finish(int length) {
        if (start > 0) {
            spans.add(new DocumentWordSpan(htmlTag, start, length));
            start = 0;
        }
        return spans;
    }
}
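A sketch of how the extracted SpanRecorder is meant to be driven, assuming `sentences` came from the sentence extractor and word positions are 1-based (per the assert in update):

    // Hypothetical driver loop; `sentences` is assumed to be an extracted document.
    SpanRecorder recorder = new SpanRecorder(HtmlTag.TITLE);

    int pos = 1;
    for (DocumentSentence sentence : sentences) {
        recorder.update(sentence, pos); // opens or closes a span as TITLE membership changes
        pos += sentence.length();
    }
    List<DocumentWordSpan> titleSpans = recorder.finish(pos); // flushes any open span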
@@ -2,11 +2,11 @@ package nu.marginalia.keyword.extractors;

import it.unimi.dsi.fastutil.objects.Object2IntMap;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.keyword.WordReps;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.pos.PosPatternCategory;

import java.util.*;
import java.util.stream.Collectors;
@@ -16,12 +16,14 @@ public class NameLikeKeywords implements WordReps {
    private final List<WordRep> nameWords;
    private final Set<String> stemmed;

    public NameLikeKeywords(KeywordExtractor keywordExtractor, DocumentLanguageData dld, int minCount) {
        var counts = new Object2IntOpenHashMap<String>(100);
        var instances = new HashMap<String, HashSet<WordRep>>(100);
    public NameLikeKeywords(DocumentLanguageData dld, int minCount) {
        LanguageDefinition languageDefinition = dld.language();

        Object2IntOpenHashMap<String> counts = new Object2IntOpenHashMap<String>(100);
        HashMap<String, HashSet<WordRep>> instances = new HashMap<String, HashSet<WordRep>>(100);

        for (DocumentSentence sent : dld) {
            var keywords = keywordExtractor.getProperNames(sent);
            var keywords = languageDefinition.matchGrammarPattern(sent, PosPatternCategory.NAME);
            for (var span : keywords) {
                if (span.size() <= 1 && sent.isAllCaps(span.start))
                    continue;

@@ -1,11 +1,11 @@
package nu.marginalia.keyword.extractors;

import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.keyword.WordReps;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.model.WordSpan;
import nu.marginalia.language.pos.PosPatternCategory;
import org.apache.commons.lang3.StringUtils;

import java.util.*;
@@ -23,25 +23,18 @@ public class SubjectLikeKeywords implements WordReps {
    // Greeks bearing gifts -> Greeks
    // Steve McQueen drove fast | cars -> Steve McQueen

    public SubjectLikeKeywords(KeywordExtractor keywordExtractor,
                               WordsTfIdfCounts tfIdfCounts,
    public SubjectLikeKeywords(WordsTfIdfCounts tfIdfCounts,
                               DocumentLanguageData dld) {
        LanguageDefinition languageDefinition = dld.language();

        Map<String, Set<WordRep>> instances = new HashMap<>();

        for (var sentence : dld) {
            for (WordSpan kw : keywordExtractor.getNouns(sentence)) {

                if (kw.end + 2 >= sentence.length()) {
                    continue;
                }
                if (sentence.isSeparatorComma(kw.end) || sentence.isSeparatorComma(kw.end + 1))
            for (WordSpan kw : languageDefinition.matchGrammarPattern(sentence, PosPatternCategory.NOUN)) {
                if (sentence.nextCommaPos(kw.end - 1) <= kw.end)
                    continue;

                String nextTag = sentence.posTags[kw.end];
                String nextNextTag = sentence.posTags[kw.end+1];

                if (isVerb(nextTag) && isDetOrAdverbOrVerbOrNoun(nextNextTag)) {
                if (languageDefinition.matchGrammarPattern(sentence, PosPatternCategory.SUBJECT_SUFFIX, kw.end)) {
                    var span = new WordSpan(kw.start, kw.end);
                    var rep = new WordRep(sentence, span);

@@ -94,17 +87,4 @@
        return tfIdfCounts.getTfIdf(stemmed);
    }

    private boolean isDetOrAdverbOrVerbOrNoun(String posTag) {
        return "DT".equals(posTag) // determiner
                || posTag.startsWith("RB") // adverb
                || posTag.startsWith("VB") // verb
                || posTag.startsWith("JJ") // adjective
                || posTag.startsWith("P")
                || posTag.startsWith("NN");
    }

    boolean isVerb(String posTag) {
        return posTag.startsWith("VB")
                && !posTag.equals("VB"); // not interested in the infinitive
    }
}
@@ -1,8 +1,7 @@
package nu.marginalia.keyword.extractors;

import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.keyword.WordReps;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.sentence.tag.HtmlTag;

@@ -15,10 +14,12 @@ public class TitleKeywords implements WordReps {
    private final Set<WordRep> titleKeywords;
    private final Set<String> stemmed;

    public TitleKeywords(KeywordExtractor keywordExtractor, DocumentLanguageData documentLanguageData) {
        titleKeywords = documentLanguageData.findSentencesForTag(HtmlTag.TITLE).stream()
    public TitleKeywords(DocumentLanguageData dld) {
        LanguageDefinition languageDefinition = dld.language();

        titleKeywords = dld.findSentencesForTag(HtmlTag.TITLE).stream()
                .flatMap(sent ->
                        keywordExtractor.getWordsFromSentence(sent).stream().sorted().distinct().map(w -> new WordRep(sent, w)))
                        languageDefinition.getWordsFromSentence(sent).stream().sorted().distinct().map(w -> new WordRep(sent, w)))
                .limit(100)
                .collect(Collectors.toSet());

@@ -1,4 +1,4 @@
package nu.marginalia.keyword;
package nu.marginalia.keyword.extractors;

import nu.marginalia.language.model.WordRep;
@@ -1,10 +1,10 @@
package nu.marginalia.keyword.extractors;

import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.keyword.WordReps;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.pos.PosPatternCategory;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.apache.commons.lang3.StringUtils;

@@ -26,14 +26,13 @@ public class WordsTfIdfCounts implements WordReps, Comparator<WordRep> {
    private final Set<WordRep> tfIdfHigh;

    public WordsTfIdfCounts(TermFrequencyDict dict,
                            KeywordExtractor keywordExtractor,
                            DocumentLanguageData dld) {
        this.dict = dict;
        this.docCount = dict.docCount();

        this.tfIdf = new Object2IntOpenHashMap<>(10_000);
        this.tfIdfHigh = new HashSet<>(100);

        var counts = getCounts(keywordExtractor, dld);
        var counts = getCounts(dld);
        int maxVal = maxValue(counts);
        Set<String> highTfIdfInstances = new HashSet<>();

@@ -48,9 +47,10 @@ public class WordsTfIdfCounts implements WordReps, Comparator<WordRep> {

        // Collect words with a high TF-IDF so that they can be marked with a bit flag

        tfIdfHigh = new HashSet<>(100);
        LanguageDefinition languageDefinition = dld.language();

        for (var sent : dld) {
            var keywords = keywordExtractor.getKeywordsFromSentence(sent);
            var keywords = languageDefinition.matchGrammarPattern(sent, PosPatternCategory.KEYWORD);
            for (var span : keywords) {
                if (highTfIdfInstances.contains(sent.constructStemmedWordFromSpan(span))) {
                    tfIdfHigh.add(new WordRep(sent, span));
@@ -60,12 +60,14 @@ public class WordsTfIdfCounts implements WordReps, Comparator<WordRep> {

    }

    private Object2IntOpenHashMap<String> getCounts(KeywordExtractor keywordExtractor, DocumentLanguageData dld) {
    private Object2IntOpenHashMap<String> getCounts(DocumentLanguageData dld) {
        LanguageDefinition languageDefinition = dld.language();

        Object2IntOpenHashMap<String> counts = new Object2IntOpenHashMap<>(10_000, 0.7f);
        counts.defaultReturnValue(0);

        for (var sent : dld) {
            var keywords = keywordExtractor.getKeywordsFromSentence(sent);
            var keywords = languageDefinition.matchGrammarPattern(sent, PosPatternCategory.KEYWORD);
            for (var span : keywords) {
                counts.addTo(sent.constructStemmedWordFromSpan(span), 1);
            }
@@ -0,0 +1,23 @@
package nu.marginalia.keyword.model;

import nu.marginalia.sequence.VarintCodedSequence;

import java.util.List;

public record DocumentKeywords(List<String> keywords,
                               byte[] metadata,
                               List<VarintCodedSequence> positions,
                               byte[] spanCodes,
                               List<VarintCodedSequence> spanSequences) {

    public boolean isEmpty() {
        return keywords.isEmpty();
    }

    public int size() {
        return keywords.size();
    }

}
@@ -5,13 +5,11 @@ import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.objects.Object2ByteOpenHashMap;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.model.idx.CodedWordSpan;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.sequence.VarintCodedSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.ByteBuffer;
import java.util.*;

public class DocumentKeywordsBuilder {
@@ -29,6 +27,7 @@ public class DocumentKeywordsBuilder {
    // be plenty. The lexicon writer has another limit that's higher.
    private final int MAX_WORD_LENGTH = 64;
    private final int MAX_POSITIONS_PER_WORD = 512;
    private final int MAX_SPANS_PER_TYPE = 8192;

    private static final Logger logger = LoggerFactory.getLogger(DocumentKeywordsBuilder.class);

@@ -36,13 +35,22 @@ public class DocumentKeywordsBuilder {
        this(1600);
    }

    public DocumentKeywords build(ByteBuffer workArea) {
    public DocumentKeywordsBuilder(int capacity) {
        wordToMeta = new Object2ByteOpenHashMap<>(capacity);
        wordToPos = new HashMap<>(capacity);
    }

    public DocumentKeywords build() {
        final List<String> wordArray = new ArrayList<>(wordToMeta.size());
        final TByteArrayList meta = new TByteArrayList(wordToMeta.size());
        final List<VarintCodedSequence> positions = new ArrayList<>(wordToMeta.size());
        final List<VarintCodedSequence> spanSequences = new ArrayList<>(wordSpans.size());
        final byte[] spanCodes = new byte[wordSpans.size()];

        var iter = wordToMeta.object2ByteEntrySet().fastIterator();

        // Encode positions
        while (iter.hasNext()) {
            var entry = iter.next();

@@ -59,27 +67,26 @@ public class DocumentKeywordsBuilder {
        }

        // Encode spans
        List<CodedWordSpan> spans = new ArrayList<>(wordSpans.size());

        wordSpans.forEach((tag, spansForTag) -> {
            spansForTag.sort(Comparator.comparingInt(DocumentWordSpan::start));

            var positionsForTag = new IntArrayList(spansForTag.size() * 2);

            for (var span : spansForTag) {
                positionsForTag.add(span.start());
                positionsForTag.add(span.end());

                if (positionsForTag.size() >= MAX_SPANS_PER_TYPE)
                    break;
            }

            spans.add(new CodedWordSpan(tag.code, VarintCodedSequence.generate(positionsForTag)));
            spanCodes[spanSequences.size()] = tag.code;
            spanSequences.add(VarintCodedSequence.generate(positionsForTag));
        });

        return new DocumentKeywords(wordArray, meta.toArray(), positions, spans);
        return new DocumentKeywords(wordArray, meta.toArray(), positions, spanCodes, spanSequences);
    }

    public DocumentKeywordsBuilder(int capacity) {
        wordToMeta = new Object2ByteOpenHashMap<>(capacity);
        wordToPos = new HashMap<>(capacity);
    }

    public void addMeta(String word, byte meta) {
        if (word.length() > MAX_WORD_LENGTH)
@@ -113,6 +120,13 @@ public class DocumentKeywordsBuilder {
        newWords.forEach(word -> wordToMeta.putIfAbsent(word, meta));
    }

    public void addSyntheticTerm(String newWord) {
        byte meta = WordFlags.Synthetic.asBit();

        wordToMeta.putIfAbsent(newWord, meta);
    }


    public List<String> getWordsWithAnyFlag(long flags) {
        List<String> ret = new ArrayList<>();

@@ -167,6 +181,4 @@ public class DocumentKeywordsBuilder {
        return this.importantWords;
    }

    public record DocumentWordSpan(HtmlTag tag, int start, int end) {
    }
}
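The builder rework above drops the CodedWordSpan wrapper in favor of two parallel structures: a byte array of HTML tag codes and a list of varint-coded sequences holding interleaved start/end offsets. A small sketch of the flattening step, with a hand-made span list as assumed input:

    // Flatten the [start, end) spans for one tag into an interleaved position
    // list, then varint-code it, mirroring the build() loop above.
    List<DocumentWordSpan> spansForTag = List.of(
            new DocumentWordSpan(HtmlTag.TITLE, 1, 4),
            new DocumentWordSpan(HtmlTag.TITLE, 9, 12));

    IntArrayList positionsForTag = new IntArrayList(spansForTag.size() * 2);
    for (var span : spansForTag) {
        positionsForTag.add(span.start());
        positionsForTag.add(span.end());
    }
    VarintCodedSequence coded = VarintCodedSequence.generate(positionsForTag); // (1, 4, 9, 12)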
@@ -0,0 +1,6 @@
package nu.marginalia.keyword.model;

import nu.marginalia.language.sentence.tag.HtmlTag;

public record DocumentWordSpan(HtmlTag tag, int start, int end) {
}
@@ -0,0 +1,171 @@
package nu.marginalia.language;

import io.jooby.Context;
import io.jooby.Jooby;
import io.jooby.MapModelAndView;
import io.jooby.ModelAndView;
import nu.marginalia.LanguageModels;
import nu.marginalia.WmsaHome;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.keyword.extractors.*;
import nu.marginalia.language.config.LanguageConfigLocation;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;

public class LanguageProcessingTool extends Jooby {
    private static final Logger logger = LoggerFactory.getLogger(LanguageProcessingTool.class);
    private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
    private final TermFrequencyDict termFrequencyDict;

    static void main(String[] args) {
        Jooby.runApp(args, LanguageProcessingTool::new);
    }

    public LanguageProcessingTool() {
        try {
            LanguageModels languageModels = getLanguageModels();
            termFrequencyDict = new TermFrequencyDict(languageModels);

            sentenceExtractorProvider = new ThreadLocalSentenceExtractorProvider(
                    new LanguageConfiguration(languageModels, new LanguageConfigLocation.Experimental()),
                    languageModels
            );
            Path basePath = Path.of("code/functions/language-processing/").toAbsolutePath();
            System.out.println("Base path: " + basePath);

            if (Files.exists(basePath.resolve("resources/ltt/jte")))
                install(new nu.marginalia.service.server.jte.JteModule(basePath.resolve("resources/ltt/jte")));
            if (Files.exists(basePath.resolve("resources/ltt/static")))
                assets("/*", basePath.resolve("resources/ltt/static"));

            get("/", this::handleKeywords);
            post("/", this::handleKeywords);
        }
        catch (Exception ex) {
            logger.error("Failed to initialize LanguageProcessingTool", ex);
            throw new RuntimeException(ex);
        }
    }

    // Assign colors to the POS tags

    @NotNull
    private ModelAndView<?> handleKeywords(Context context) throws URISyntaxException {
        if ("GET".equals(context.getMethod())) {
            return new MapModelAndView("keywords.jte")
                    .put("textSample", "");
        }
        else if (!"POST".equals(context.getMethod())) {
            throw new IllegalArgumentException("Invalid method");
        }

        String textSample = context.form("textSample").value();

        // Run sentence extraction on the text as-is
        DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(textSample);

        // Run individual extraction logic
        var tfIdfCounts = new WordsTfIdfCounts(termFrequencyDict, dld);
        var titleKeywords = new TitleKeywords(dld);
        var nameLikeKeywords = new NameLikeKeywords(dld, 2);
        var subjectLikeKeywords = new SubjectLikeKeywords(tfIdfCounts, dld);
        var artifactKeywords = new ArtifactKeywords(dld);

        // Run full extraction logic to capture positioning etc
        var extractedKeywords = new DocumentKeywordExtractor(termFrequencyDict)
                .extractKeywords(dld, new LinkTexts(), new EdgeUrl("https://www.example.com/"));

        return new MapModelAndView("keywords.jte")
                .put("textSample", textSample)
                .put("language", dld.language())
                .put("tagColors", posTagStyles(dld))
                .put("sentences", dld.sentences())
                .put("tfIdfReps", tfIdfCounts.getReps())
                .put("titleReps", titleKeywords.getReps())
                .put("nameLikeReps", nameLikeKeywords.getReps())
                .put("subjectLikeReps", subjectLikeKeywords.getReps())
                .put("artifacts", artifactKeywords.getWords())
                .put("importantWords", extractedKeywords.importantWords)
                .put("positionedWords", extractedKeywords.wordToPos);
    }

    /**
     * Generate unique colors for each POS tag, to help the UI rendering
     */
    public static Map<Long, String> posTagStyles(DocumentLanguageData dld) {
        Map<Long, String> styles = new HashMap<>();

        // we sort them first to ensure the most common tags are guaranteed to have
        // the largest difference between colors

        Map<Long, Integer> counts = new HashMap<>();
        for (var sentence : dld.sentences()) {
            for (var tag : sentence.posTags) {
                counts.merge(tag, 1, Integer::sum);
            }
        }

        List<Long> posTagsByCount = counts
                .entrySet().stream()
                .sorted(Map.Entry.comparingByValue(Comparator.reverseOrder()))
                .map(Map.Entry::getKey)
                .toList();

        for (int i = 0; i < posTagsByCount.size(); i++) {
            String style = "text-" + switch (i&0x7) {
                case 0 -> "red";
                case 1 -> "green";
                case 2 -> "blue";
                case 3 -> "yellow";
                case 4 -> "purple";
                case 5 -> "cyan";
                case 6 -> "pink";
                default -> "gray";
            }+"-"+switch((i/8) & 3) {
                case 0 -> "900";
                case 3 -> "500";
                case 1 -> "750";
                case 2 -> "400";
                default -> "300";
            };
            styles.put(posTagsByCount.get(i), style);
        }
        return styles;
    }

    private static final Path LANGUAGE_MODELS_DEFAULT = WmsaHome.getHomePath().resolve("model");
    private static Path getLanguageModelsPath() {
        final Path languageModelsHome = Optional.ofNullable(System.getenv("LANGUAGE_MODELS_HOME"))
                .map(Path::of)
                .orElse(LANGUAGE_MODELS_DEFAULT);

        if (!Files.isDirectory(languageModelsHome)) {
            throw new IllegalStateException("Could not find $LANGUAGE_MODELS_HOME, see doc/language-models.md");
        }
        return languageModelsHome;
    }
    private static LanguageModels getLanguageModels() {
        var languageModelsHome = getLanguageModelsPath();

        return new LanguageModels(
                languageModelsHome.resolve("tfreq-new-algo3.bin"),
                languageModelsHome.resolve("opennlp-sentence.bin"),
                languageModelsHome.resolve("English.RDR"),
                languageModelsHome.resolve("English.DICT"),
                languageModelsHome.resolve("lid.176.ftz"),
                languageModelsHome.resolve("segments.bin")
        );
    }
}
@@ -0,0 +1,43 @@
package nu.marginalia.language.config;

import nu.marginalia.WmsaHome;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

sealed public interface LanguageConfigLocation {
    InputStream findLanguageConfiguration() throws IOException;

    final class Auto implements LanguageConfigLocation {
        @Override
        public InputStream findLanguageConfiguration() throws IOException {
            Path filesystemPath = WmsaHome.getLangugeConfig();
            if (Files.exists(filesystemPath)) {
                return Files.newInputStream(filesystemPath, StandardOpenOption.READ);
            }
            if (Boolean.getBoolean("language.experimental")) {
                return ClassLoader.getSystemResourceAsStream("languages-experimental.xml");
            } else {
                return ClassLoader.getSystemResourceAsStream("languages-default.xml");
            }
        }
    }

    final class Experimental implements LanguageConfigLocation {
        @Override
        public InputStream findLanguageConfiguration() throws IOException {
            return ClassLoader.getSystemResourceAsStream("languages-experimental.xml");
        }
    }

    final class Default implements LanguageConfigLocation {

        @Override
        public InputStream findLanguageConfiguration() throws IOException {
            return ClassLoader.getSystemResourceAsStream("languages-default.xml");
        }
    }
}
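For reference, the Auto strategy above resolves the configuration in this order: a language config file on disk under $WMSA_HOME, then languages-experimental.xml when the JVM runs with -Dlanguage.experimental=true, then languages-default.xml. A minimal usage sketch:

    // The stream is handed to LanguageConfiguration's XML parser.
    try (InputStream is = new LanguageConfigLocation.Auto().findLanguageConfiguration()) {
        // parse the languages XML from `is`
    }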
@@ -0,0 +1,405 @@
package nu.marginalia.language.config;

import com.github.jfasttext.JFastText;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.LanguageModels;
import nu.marginalia.WmsaHome;
import nu.marginalia.language.encoding.UnicodeNormalization;
import nu.marginalia.language.keywords.KeywordHasher;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.pos.PosPattern;
import nu.marginalia.language.pos.PosPatternCategory;
import nu.marginalia.language.pos.PosTagger;
import nu.marginalia.language.stemming.Stemmer;
import org.jsoup.nodes.TextNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import javax.annotation.Nullable;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.nio.file.StandardOpenOption;
import java.security.DigestInputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.*;

@Singleton
public class LanguageConfiguration {
    private static final Logger logger = LoggerFactory.getLogger(LanguageConfiguration.class);

    private final Map<String, Path> resources = new HashMap<>();
    private final Map<String, LanguageDefinition> languages = new LinkedHashMap<>();
    private final JFastText fastTextLanguageModel = new JFastText();

    public Optional<LanguageDefinition> identifyLanguage(org.jsoup.nodes.Document jsoupDoc) {
        StringBuilder sampleBuilder = new StringBuilder();
        jsoupDoc.body().traverse((node, _) -> {
            if (sampleBuilder.length() > 4096)
                return;
            if (!(node instanceof TextNode tn))
                return;

            sampleBuilder.append(' ').append(tn.text());
        });
        return identifyLanguage(sampleBuilder.toString());
    }

    public Optional<LanguageDefinition> identifyLanguage(String sample) {
        String prediction = fastTextLanguageModel.predict(sample);
        if (null == prediction)
            return Optional.empty();

        if (prediction.length() == "__label__??".length()) {
            String isoCode = prediction.substring("__label__".length());
            return Optional.ofNullable(getLanguage(isoCode));
        }

        return Optional.empty();
    }

    public Optional<LanguageDefinition> identifyLanguage(String sample, String fallbackIsoCode) {
        return identifyLanguage(sample).or(() -> Optional.ofNullable(getLanguage(fallbackIsoCode)));
    }

    public List<LanguageDefinition> languages() {
        return new ArrayList<>(this.languages.values());
    }
    public Map<String, LanguageDefinition> languagesMap() {
        return Collections.unmodifiableMap(languages);
    }
    @Nullable
    public LanguageDefinition getLanguage(String language) {
        return languages.get(language);
    }

    @Inject
    public LanguageConfiguration() throws IOException, ParserConfigurationException, SAXException {
        this(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Auto());
    }

    public LanguageConfiguration(LanguageConfigLocation languageFile) throws IOException, ParserConfigurationException, SAXException {
        this(WmsaHome.getLanguageModels(), languageFile);
    }

    public LanguageConfiguration(LanguageModels lm, LanguageConfigLocation languageFile)
            throws IOException, ParserConfigurationException, SAXException {
        fastTextLanguageModel.loadModel(lm.fasttextLanguageModel.toString());

        try (var languagesXmlStream = languageFile.findLanguageConfiguration()) {
            if (languagesXmlStream == null)
                throw new IllegalStateException("languages-default.xml resource not found in classpath");

            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            DocumentBuilder builder = factory.newDocumentBuilder();
            Document doc = builder.parse(languagesXmlStream);

            parseResources(doc);
            parseLanguages(doc);
        }

        logger.info("Loaded language configuration: {}", languages);
    }

    private void parseLanguages(Document doc) {
        NodeList languageNodes = doc.getElementsByTagName("language");

        for (int i = 0; i < languageNodes.getLength(); i++) {
            Element languageTag = (Element) languageNodes.item(i);

            boolean disabled = "TRUE".equalsIgnoreCase(languageTag.getAttribute("disabled"));
            if (disabled)
                continue;

            String isoCode = languageTag.getAttribute("isoCode").toLowerCase();
            String name = languageTag.getAttribute("name");

            try {
                PosTagger posTagger = parsePosTag(languageTag, isoCode);
                Stemmer stemmer = parseStemmerTag(languageTag, posTagger, isoCode);
                KeywordHasher keywordHasher = parseHasherTag(languageTag, isoCode);
                Map<PosPatternCategory, List<PosPattern>> posPatterns =
                        parsePosPatterns(posTagger, languageTag, isoCode);
                UnicodeNormalization unicodeNormalization = parseUnicodeNormalization(languageTag, isoCode);

                languages.put(isoCode,
                        new LanguageDefinition(isoCode, name, stemmer, unicodeNormalization, keywordHasher, posTagger, posPatterns));
            }
            catch (IOException ex) {
                logger.error("Failed to set up language " + isoCode, ex);
            }
        }
    }

    private UnicodeNormalization parseUnicodeNormalization(Element languageTag, String isoCode) {
        NodeList normalizationTags = languageTag.getElementsByTagName("unicodeNormalization");
        if (normalizationTags.getLength() == 0)
            return new UnicodeNormalization.JustNormalizeQuotes();
        Element normalizationTag = (Element) normalizationTags.item(0);
        String algorithm = normalizationTag.getAttribute("algorithm");

        return switch(algorithm) {
            case "minimal" -> new UnicodeNormalization.JustNormalizeQuotes();
            case "e-accents" -> new UnicodeNormalization.FlattenEAccents();
            case "german" -> new UnicodeNormalization.Flattenß();
            case "maximal-latin" -> new UnicodeNormalization.FlattenAllLatin();
            default -> throw new IllegalArgumentException("Invalid algorithm " + algorithm + " on language configuration for " + isoCode);
        };
    }

    private Map<PosPatternCategory, List<PosPattern>> parsePosPatterns(@Nullable PosTagger posTagger,
                                                                       Element languageTag, String isoCode) {
        if (null == posTagger)
            return Map.of();

        Map<PosPatternCategory, List<PosPattern>> ret = new HashMap<>();
        NodeList ngramsElements = languageTag.getElementsByTagName("ngrams");

        for (int i = 0; i < ngramsElements.getLength(); i++) {
            Element ngramsTag = (Element) ngramsElements.item(i);
            String type = ngramsTag.getAttribute("type");

            PosPatternCategory category = switch(type) {
                case "name" -> PosPatternCategory.NAME;
                case "noun" -> PosPatternCategory.NOUN;
                case "keyword" -> PosPatternCategory.KEYWORD;
                case "title" -> PosPatternCategory.TITLE;
                case "subject-suffix" -> PosPatternCategory.SUBJECT_SUFFIX;
                default -> throw new IllegalArgumentException("Invalid ngrams type in " + isoCode + ", what is '" + type + "'?");
            };

            NodeList posPatternsList = ngramsTag.getElementsByTagName("pospattern");
            for (int j = 0; j < posPatternsList.getLength(); j++) {
                Element posPatternTag = (Element) posPatternsList.item(j);
                ret.computeIfAbsent(category, (k) -> new ArrayList<>())
                        .add(new PosPattern(posTagger, posPatternTag.getTextContent()));
            }
        }

        return ret;
    }

    @Nullable
    private PosTagger parsePosTag(Element languageTag, String isoCode) throws IOException {
        NodeList rdrElements = languageTag.getElementsByTagName("rdrTagger");
        if (rdrElements.getLength() < 1) {
            return null;
        }
        else if (rdrElements.getLength() > 1) {
            throw new IllegalStateException("Multiple rdr taggers defined in " + isoCode);
        }
        Element rdrElement = (Element) rdrElements.item(0);

        String dictId = rdrElement.getAttribute("dictId");
        String rdrId = rdrElement.getAttribute("rdrId");

        Path dictPath = resources.get(dictId);
        Path rdrPath = resources.get(rdrId);

        if (null == dictPath)
            throw new IllegalArgumentException("language.xml: dictPath id " + dictId
                    + " does not map to a resource in " + isoCode);
        if (null == rdrPath)
            throw new IllegalArgumentException("language.xml: rdrPath id " + dictId
                    + " does not map to a resource in " + isoCode);

        return new PosTagger(isoCode, dictPath, rdrPath);
    }

    private KeywordHasher parseHasherTag(Element languageElement, String isoCode) {
        NodeList keywordHasherElements = languageElement.getElementsByTagName("keywordHash");
        if (keywordHasherElements.getLength() != 1) {
            throw new IllegalArgumentException(
                    "language.xml: No keywordHasher block for language element " + isoCode);
        }
        Element keywordHasheElement = (Element) keywordHasherElements.item(0);

        String hasherName = keywordHasheElement.getAttribute("algorithm");

        return switch (hasherName) {
            case "asciish" -> new KeywordHasher.AsciiIsh();
            case "utf8" -> new KeywordHasher.Utf8();
            default -> throw new IllegalArgumentException(
                    "language.xml: Unknown keywordHash name " + hasherName + " in " + isoCode);
        };
    }

    private Stemmer parseStemmerTag(Element languageElement, PosTagger posTagger, String isoCode) {
        NodeList stemmerElements = languageElement.getElementsByTagName("stemmer");
        if (stemmerElements.getLength() != 1) {
            throw new IllegalArgumentException(
                    "language.xml: No stemmer block for language element " + isoCode);
        }
        Element stemmerElement = (Element) stemmerElements.item(0);

        String stemmerName = stemmerElement.getAttribute("algorithm");
        String stemmerVariant = stemmerElement.getAttribute("variant");

        PosPattern inclusionPattern = null;
        NodeList posPatternList = stemmerElement.getElementsByTagName("pospattern");
        if (posPatternList.getLength() >= 1) {
            Element posElement = (Element) posPatternList.item(0);
            inclusionPattern = new PosPattern(posTagger, posElement.getTextContent());
        }

        return switch (stemmerName.toLowerCase()) {
            case "porter" -> new Stemmer.Porter(inclusionPattern);
            case "snowball" -> new Stemmer.Snowball(stemmerVariant, inclusionPattern);
            case "none" -> new Stemmer.NoOpStemmer();
            default -> throw new IllegalArgumentException(
                    "language.xml: Unknown stemmer name " + stemmerName + " in " + isoCode);
        };
    }

    private void parseResources(Document doc) throws IOException {
        NodeList resourceNodes = doc.getElementsByTagName("resource");
        for (int i = 0; i < resourceNodes.getLength(); i++) {
            Element resourceTag = (Element) resourceNodes.item(i);

            String resourceId = resourceTag.getAttribute("id");
            String resourceMd5 = resourceTag.getAttribute("md5");
            Path resourcePath = WmsaHome.getDataPath().resolve(resourceTag.getAttribute("path"));
            String resourceHref = resourceTag.getAttribute("href");

            if (!validateResource(resourcePath, resourceMd5)) {
                boolean success = false;
                try {
                    success = fetchResource(resourceHref, resourcePath, resourceMd5);
                } catch (URISyntaxException | IOException ex) {
                    logger.error(ex.getMessage(), ex);
                    success = false;
                }

                // It's likely that if we were to just explode here, a docker-compose
                // restart:always would put us in a loop that repeatedly fails to download
                // the same file. We'd like to avoid that by stalling and awaiting human
                // intervention.

                while (!success) {
                    logger.error("Stopping to prevent restart loop");
                    try {
                        Thread.sleep(1000);
                    } catch (InterruptedException e) {
                        throw new RuntimeException(e);
                    }
                }
            }
            if (resources.put(resourceId, resourcePath) != null)
                throw new IllegalStateException(
                        "Resource with id " + resourceId + " already exists");
        }
    }

    private boolean fetchResource(String resourceUrl, Path resourcePath, String resourceMd5)
            throws IOException, URISyntaxException {

        Path parentPath = resourcePath.getParent();
        if (!Files.isDirectory(parentPath)) {
            logger.info("Setting up directory {}", parentPath);
            Files.createDirectories(parentPath);
        }

        logger.info("Fetching {}", resourceUrl);

        URL url = new URI(resourceUrl).toURL();
        Path tempFile = Files.createTempFile("resource", "dat");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try (InputStream is = conn.getInputStream();
             OutputStream os = Files.newOutputStream(tempFile, StandardOpenOption.WRITE,
                     StandardOpenOption.TRUNCATE_EXISTING)) {
            is.transferTo(os);
            os.flush();

            String actualMd5 = getFileMD5(tempFile);
            if (!resourceMd5.isBlank() && !Objects.equals(resourceMd5, actualMd5)) {
                logger.error("Freshly downloaded resource {} does not match md5sum {}", resourceUrl,
                        resourceMd5);
                return false;
            } else {
                logger.info("Downloaded resource {} to {} ** md5sum {}", resourceUrl, resourcePath,
                        actualMd5);
                Files.move(tempFile, resourcePath, StandardCopyOption.REPLACE_EXISTING);
                return true;
            }
        } catch (IOException ex) {
            logger.error("IOException", ex);
            return false;
        } finally {
            conn.disconnect();
            Files.deleteIfExists(tempFile);
        }
    }

    private boolean validateResource(Path resourcePath, String providedMd5Sum) throws IOException {
        resourcePath = resourcePath.normalize();

        if (!resourcePath.normalize().startsWith(WmsaHome.getDataPath()))
            throw new IllegalArgumentException(
                    "Resource path has escaped $WMSA_HOME/data: " + resourcePath);
        if (!Files.exists(resourcePath)) {
            logger.info("Resource path does not exist: " + resourcePath);
            return false;
        }

        String actualMd5 = getFileMD5(resourcePath);
        if (providedMd5Sum.isBlank()) {
            logger.info("No md5sum provided for resource path: {}, but was calculated to {}",
                    resourcePath, actualMd5);
            return true;
        }

        if (Objects.equals(actualMd5, providedMd5Sum)) {
            return true;
        } else {
            logger.error("MD5 checksum mismatch for {} -- {}", resourcePath, providedMd5Sum);
            return false;
        }
    }

    public String getFileMD5(Path filePath) {
        try (InputStream fis = Files.newInputStream(filePath)) {
            MessageDigest md = MessageDigest.getInstance("MD5");
            DigestInputStream dis = new DigestInputStream(fis, md);

            // Read the file; reading through the digest stream updates the digest
            byte[] buffer = new byte[8192];
            while (dis.read(buffer) != -1) {
            }

            byte[] digest = md.digest();

            // Convert to hex
            StringBuilder hexString = new StringBuilder();
            for (byte b : digest) {
                String hex = Integer.toHexString(0xff & b);
                if (hex.length() == 1) {
                    hexString.append('0');
                }
                hexString.append(hex);
            }
            return hexString.toString();
        } catch (IOException | NoSuchAlgorithmException e) {
            throw new RuntimeException(e);
        }
    }
}
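A usage sketch for the configuration class above, assuming the language models and data files are already present under $WMSA_HOME (the constructors and lookup methods are the ones defined in this file):

    LanguageConfiguration config =
            new LanguageConfiguration(new LanguageConfigLocation.Default());

    // fastText-based detection, with an explicit fallback language code
    config.identifyLanguage("The quick brown fox jumps over the lazy dog", "en")
          .ifPresent(lang -> System.out.println("Detected: " + lang));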
@@ -0,0 +1,227 @@
|
||||
package nu.marginalia.language.encoding;
|
||||
|
||||
public interface UnicodeNormalization {
|
||||
|
||||
String flattenUnicode(String s);
|
||||
|
||||
static final boolean NO_FLATTEN_UNICODE =
|
||||
Boolean.getBoolean("system.noFlattenUnicode");
|
||||
|
||||
class JustNormalizeQuotes implements UnicodeNormalization {
|
||||
public String flattenUnicode(String s) {
|
||||
if (NO_FLATTEN_UNICODE)
|
||||
return s;
|
||||
|
||||
if (isPlainAscii(s)) {
|
||||
return s;
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder(s.length() + 10);
|
||||
|
||||
for (int i = 0; i < s.length(); ) {
|
||||
int c = s.codePointAt(i);
|
||||
i += Character.charCount(c);
|
||||
|
||||
if ("\u201C\u201D".indexOf(c) >= 0) {
|
||||
sb.append('"');
|
||||
}
|
||||
else {
|
||||
sb.appendCodePoint(c);
|
||||
}
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
class FlattenEAccents implements UnicodeNormalization {
|
||||
public String flattenUnicode(String s) {
|
||||
if (NO_FLATTEN_UNICODE)
|
||||
return s;
|
||||
|
||||
if (isPlainAscii(s)) {
|
||||
return s;
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder(s.length() + 10);
|
||||
|
||||
int numCp = s.codePointCount(0, s.length());
|
||||
|
||||
for (int i = 0; i < numCp;) {
|
||||
int c = s.codePointAt(i);
|
||||
i+=Character.charCount(c);
|
||||
|
||||
if ("\u201C\u201D".indexOf(c) >= 0) {
|
||||
sb.append('"');
|
||||
}
|
||||
else if ("é".indexOf(c) >= 0) {
|
||||
sb.append('e');
|
||||
}
|
||||
else {
|
||||
sb.appendCodePoint(c);
|
||||
}
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
class Flattenß implements UnicodeNormalization {
|
||||
public String flattenUnicode(String s) {
|
||||
if (NO_FLATTEN_UNICODE)
|
||||
return s;
|
||||
|
||||
if (isPlainAscii(s)) {
|
||||
return s;
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder(s.length() + 10);
|
||||
|
||||
for (int i = 0; i < s.length(); ) {
|
||||
int c = s.codePointAt(i);
|
||||
i += Character.charCount(c);
|
||||
|
||||
if ("\u201C\u201D".indexOf(c) >= 0) {
|
||||
sb.append('"');
|
||||
} else if ('ß' == c) {
|
||||
sb.append("ss");
|
||||
}
|
||||
else {
|
||||
sb.appendCodePoint(c);
|
||||
}
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
class FlattenAllLatin implements UnicodeNormalization {
|
||||
|
||||
public String flattenUnicode(String s) {
|
||||
if (NO_FLATTEN_UNICODE)
|
||||
return s;
|
||||
|
||||
if (isPlainAscii(s)) {
|
||||
return s;
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder(s.length() + 10);
|
||||
|
||||
// Falsehoods programmers believe about the latin alphabet ;-)
|
||||
for (int i = 0; i < s.length(); ) {
|
||||
int c = s.codePointAt(i);
|
||||
i += Character.charCount(c);
|
||||
|
||||
if ("\u201C\u201D".indexOf(c) >= 0) {
|
||||
sb.append('"');
|
||||
}
|
||||
else if ("áâàȁăåäāǟãąą̊ḁẚⱥ".indexOf(c) >= 0) {
|
||||
sb.append('a');
|
||||
}
|
||||
else if ("ḃḅḇƀɓ".indexOf(c) >= 0) {
|
||||
sb.append('b');
|
||||
}
|
||||
else if ("ćĉčçḉċƈȼ".indexOf(c) >= 0) {
|
||||
sb.append('c');
|
||||
}
|
||||
else if ("ɗḓďḋḍḏḑđðɖḏ".indexOf(c) >= 0) {
|
||||
sb.append('d');
|
||||
}
|
||||
else if ("éêèȅěëēẽĕęėẹȇḕḗḙḛḝɇ".indexOf(c) >= 0) {
|
||||
sb.append('e');
|
||||
}
|
||||
else if ("ḟƒ".indexOf(c) >= 0) {
|
||||
sb.append('f');
|
||||
}
|
||||
else if ("ǵĝǧğġģɠḡǥ".indexOf(c) >= 0) {
|
||||
sb.append('g');
|
||||
}
|
||||
else if ("ĥȟḧḣḥẖḩḫħⱨ".indexOf(c) >= 0) {
|
||||
sb.append('g');
|
||||
}
            else if ("iıíîìȉïḯīĩįịḭ".indexOf(c) >= 0) {
                sb.append('i');
            }
            else if ("ĵǰɉ".indexOf(c) >= 0) {
                sb.append('j');
            }
            else if ("ḱǩķḳḵƙⱪ".indexOf(c) >= 0) {
                sb.append('k');
            }
            else if ("ĺłḽľļḷḹḻƚɫⱡ".indexOf(c) >= 0) {
                sb.append('l');
            }
            else if ("ḿṁṃ".indexOf(c) >= 0) {
                sb.append('m');
            }
            else if ("ŋńǹñṋňṅṇṉʼnn̈ņ".indexOf(c) >= 0) {
                sb.append('n');
            }
            else if ("óőôòȍŏȯȱöȫōṓṑõṍṏȭøǿǫǭọȏơ".indexOf(c) >= 0) {
                sb.append('o');
            }
            else if ("ṕṗƥᵽ".indexOf(c) >= 0) {
                sb.append('p');
            }
            else if ("ꝗ".indexOf(c) >= 0) {
                sb.append('q');
            }
            else if ("ŕȑřŗṙṛṝṟɍɽ".indexOf(c) >= 0) {
                sb.append('r');
            }
            else if ("śṥŝšṧşșṡṣṩ".indexOf(c) >= 0) {
                sb.append('s');
            }
            else if ("ťṱẗţțŧṫṭṯⱦ".indexOf(c) >= 0) {
                sb.append('t');
            }
            else if ("úùûŭưűüūṻųůũṹụṳṵṷʉ".indexOf(c) >= 0) {
                sb.append('u');
            }
            else if ("ṽṿʋỽ".indexOf(c) >= 0) {
                sb.append('v');
            }
            else if ("ẃŵẁẅẘẇẉⱳ".indexOf(c) >= 0) {
                sb.append('w');
            }
            else if ("x̂ẍẋ".indexOf(c) >= 0) {
                sb.append('x');
            }
            else if ("ƴýŷỳÿȳỹẙẏy̨ɏỿ".indexOf(c) >= 0) {
                sb.append('y');
            }
            else if ("źẑžżẓẕƶȥ".indexOf(c) >= 0) {
                sb.append('z');
            }
            else if ("Þþ".indexOf(c) >= 0) {
                sb.append("th");
            }
            else if ('ß' == c) {
                sb.append("ss");
            }
            else if (isAscii(c)) {
                sb.append((char) c);
            }
        }

        return sb.toString();
    }

}

private static boolean isPlainAscii(String s) {
    for (int i = 0; i < s.length(); ) {
        int c = s.codePointAt(i);
        if (!isAscii(c))
            return false;
        i += Character.charCount(c);
    }
    return true;
}

private static boolean isAscii(int c) {
    return (c & ~0x7f) == 0;
}


}
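The three normalizers form a ladder of aggressiveness, and all of them share the smart-quote fix. A rough sketch of the expected behavior as plain assertions; this assumes the classes above are accessible from the same package and that NO_FLATTEN_UNICODE is unset:

    UnicodeNormalization eAccents = new FlattenEAccents();
    UnicodeNormalization allLatin = new FlattenAllLatin();

    // Smart quotes become plain ASCII quotes in every variant
    assert eAccents.flattenUnicode("\u201Chello\u201D").equals("\"hello\"");

    // The e-accents variant only touches 'é'; other diacritics pass through
    assert eAccents.flattenUnicode("café côte").equals("cafe côte");

    // The maximal-latin variant strips diacritics across the latin range,
    // expands 'Þ'/'þ' to "th" and 'ß' to "ss", and drops other non-ASCII
    assert allLatin.flattenUnicode("Þórr straße").equals("thorr strasse");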
@@ -0,0 +1,27 @@
package nu.marginalia.language.keywords;

import nu.marginalia.hash.MurmurHash3_128;

public sealed interface KeywordHasher {
    MurmurHash3_128 hasher = new MurmurHash3_128();

    long hashKeyword(String keyword);

    /** Hash algorithm that seeds a Murmur128 algorithm with Java's string hashCode(), but
     * then only looks at 7 bit ASCII for the Murmur calculations.  This works well for English
     * and similar languages, but falls apart completely for languages that are not dominated by
     * the 7 bit ASCII subset.
     */
    final class AsciiIsh implements KeywordHasher {
        public long hashKeyword(String keyword) {
            return hasher.hashNearlyASCII(keyword);
        }
    }

    /** Hash algorithm that is based on Murmur128 folded over on itself to make a 64 bit key */
    final class Utf8 implements KeywordHasher {
        public long hashKeyword(String keyword) {
            return hasher.hashUtf8(keyword);
        }
    }
}
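The two hashers are interchangeable behind the interface, so the per-language choice (the asciish/utf8 values in the language configuration further down) reduces to picking an implementation. A minimal sketch of what that selection might look like; the factory itself is an assumption, only hashNearlyASCII and hashUtf8 come from the code above:

    // Hypothetical factory mirroring the keywordHash algorithm attribute
    // in the language configuration XML
    static KeywordHasher forAlgorithm(String algorithm) {
        return switch (algorithm) {
            case "asciish" -> new KeywordHasher.AsciiIsh();
            case "utf8" -> new KeywordHasher.Utf8();
            default -> throw new IllegalArgumentException("Unknown keyword hasher: " + algorithm);
        };
    }

    KeywordHasher hasher = forAlgorithm("utf8");
    long termId = hasher.hashKeyword("marginalia"); // stable 64 bit identity for the keyword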
@@ -15,11 +15,13 @@ import java.util.stream.Stream;
 *
 * @see SentenceExtractor
 */
public record DocumentLanguageData(List<DocumentSentence> sentences, String text) implements Iterable<DocumentSentence> {
public record DocumentLanguageData(LanguageDefinition language,
                                   List<DocumentSentence> sentences,
                                   String text) implements Iterable<DocumentSentence> {

    public DocumentLanguageData(List<DocumentSentence> sentences,
                                String text)
    public DocumentLanguageData(LanguageDefinition language, List<DocumentSentence> sentences, String text)
    {
        this.language = language;
        this.sentences = Collections.unmodifiableList(sentences);
        this.text = text;
    }
@@ -19,13 +19,14 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>
    /** A span of words in a sentence */
    public final String[] wordsLowerCase;
    public final String[] stemmedWords;
    public final String[] posTags;
    public final long[] posTags;

    /** A set of HTML tags that surround the sentence */
    public final EnumSet<HtmlTag> htmlTags;

    /** A bitset indicating whether the word is a stop word */
    private final BitSet isStopWord;
    private final BitSet includeInStemming;

    /** A bitset indicating whether the word is capitalized */
    private final BitSet isCapitalized;
@@ -37,16 +38,16 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>
    // where false = COMMA, true = SPACE
    private final BitSet separators;


    public SoftReference<WordSpan[]> keywords;

    public DocumentSentence(BitSet separators,
                            String[] wordsLowerCase,
                            String[] posTags,
                            long[] posTags,
                            String[] stemmedWords,
                            EnumSet<HtmlTag> htmlTags,
                            BitSet isCapitalized,
                            BitSet isAllCaps
                            BitSet isAllCaps,
                            BitSet includeInStemming
    )
    {
        this.separators = separators;
@@ -56,6 +57,7 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>
        this.htmlTags = htmlTags;
        this.isCapitalized = isCapitalized;
        this.isAllCaps = isAllCaps;
        this.includeInStemming = includeInStemming;

        isStopWord = new BitSet(wordsLowerCase.length);

@@ -87,6 +89,16 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>
        return !separators.get(i);
    }

    /** Returns the position of the next comma in the sentence,
     * or sentence.length() if no remaining commas exist.
     */
    public int nextCommaPos(int pos) {
        int ret = separators.nextClearBit(pos);
        if (ret < 0)
            return separators.length();
        return ret;
    }

    public String constructWordFromSpan(WordSpan span) {
        if (span.size() == 1) {
            return trimJunkCharacters(wordsLowerCase[span.start]);
@@ -153,10 +165,7 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>
    }

    private boolean includeInStemming(int i) {
        if (posTags[i].equals("IN") || posTags[i].equals("TO") || posTags[i].equals("CC") || posTags[i].equals("DT")) {
            return false;
        }
        return true;
        return includeInStemming.get(i);
    }

    @Override
@@ -199,7 +208,7 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>
        }

        public String wordLowerCase() { return wordsLowerCase[pos]; }
        public String posTag() { return posTags[pos]; }
        public long posTag() { return posTags[pos]; }
        public String stemmed() { return stemmedWords[pos]; }
        public boolean isStopWord() { return DocumentSentence.this.isStopWord(pos); }
@@ -0,0 +1,145 @@
package nu.marginalia.language.model;

import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.encoding.UnicodeNormalization;
import nu.marginalia.language.keywords.KeywordHasher;
import nu.marginalia.language.pos.PosPattern;
import nu.marginalia.language.pos.PosPatternCategory;
import nu.marginalia.language.pos.PosTagger;
import nu.marginalia.language.stemming.Stemmer;

import javax.annotation.Nullable;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

public final class LanguageDefinition {
    private final String isoCode;
    private final String name;
    private final Stemmer stemmer;
    private final UnicodeNormalization unicodeNormalization;
    private final KeywordHasher keywordHasher;

    @Nullable
    private final PosTagger posTagger;

    private final Map<PosPatternCategory, List<PosPattern>> posPatterns;

    public LanguageDefinition(String isoCode,
                              String name,
                              Stemmer stemmer,
                              UnicodeNormalization unicodeNormalization,
                              KeywordHasher keywordHasher,
                              @Nullable PosTagger posTagger,
                              Map<PosPatternCategory, List<PosPattern>> posPatterns) {
        this.isoCode = isoCode;
        this.name = name;
        this.stemmer = stemmer;
        this.unicodeNormalization = unicodeNormalization;
        this.keywordHasher = keywordHasher;
        this.posTagger = posTagger;
        this.posPatterns = posPatterns;
    }

    public String isoCode() {
        return isoCode;
    }

    public String displayName() {
        return name;
    }

    public Stemmer stemmer() {
        return stemmer;
    }

    @Nullable
    public PosTagger posTagger() {
        return posTagger;
    }

    public KeywordHasher keywordHasher() {
        return keywordHasher;
    }

    public UnicodeNormalization unicodeNormalization() {
        return unicodeNormalization;
    }

    public long[] posTagSentence(String[] words) {
        if (posTagger == null) return new long[0];
        return posTagger.tagSentence(words);
    }

    public boolean hasPosParsing() {
        return posTagger != null;
    }

    public List<PosPattern> getPosPatterns(PosPatternCategory category) {
        return posPatterns.getOrDefault(category, List.of());
    }

    public String decodePosTagName(long tagName) {
        if (hasPosParsing())
            return posTagger.decodeTagName(tagName);
        return "";
    }

    public List<WordSpan> matchGrammarPattern(DocumentSentence sentence, PosPatternCategory category) {
        List<WordSpan> spans = new ArrayList<>(2 * sentence.length());

        for (PosPattern pattern : getPosPatterns(category)) {
            pattern.matchSentence(sentence, spans);
        }

        return spans;
    }

    public boolean matchGrammarPattern(DocumentSentence sentence, PosPatternCategory category, int pos) {
        for (var pattern : getPosPatterns(category)) {
            if (pattern.isMatch(sentence, pos))
                return true;
        }
        return false;
    }

    public boolean matchGrammarPattern(DocumentSentence sentence, PosPatternCategory category, WordSpan span) {
        for (var pattern : getPosPatterns(category)) {
            if (pattern.size() != span.size())
                continue;

            if (pattern.isMatch(sentence, span.start))
                return true;
        }
        return false;
    }

    public List<WordSpan> getWordsFromSentence(DocumentSentence sentence) {
        List<WordSpan> spans = new ArrayList<>();

        for (int k = 0; k < 4; k++) {
            for (int i = k; i < sentence.length(); i++) {
                var w = new WordSpan(i-k, i + 1);

                if (isViableSpanForWord(sentence, w)) {
                    spans.add(w);
                }
            }
        }

        return spans;
    }

    private boolean isViableSpanForWord(DocumentSentence sentence, WordSpan w) {

        if (sentence.nextCommaPos(w.start) < w.end - 1)
            return false;

        if (!matchGrammarPattern(sentence, PosPatternCategory.TITLE, w))
            return false;

        String word = sentence.constructWordFromSpan(w);
        return !word.isBlank() && WordPatterns.isNotJunkWord(word);
    }


}
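LanguageDefinition bundles everything the keyword extraction pipeline needs to know about a language. A hedged sketch of how a caller might walk a tagged sentence with it; the variable names are illustrative, not from the diff:

    // Assuming `language` and `sentence` were produced by the SentenceExtractor below
    for (WordSpan span : language.matchGrammarPattern(sentence, PosPatternCategory.NAME)) {
        String name = sentence.constructWordFromSpan(span);
        long id = language.keywordHasher().hashKeyword(name);
        // ... feed (name, id) into the keyword index
    }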
@@ -0,0 +1,4 @@
package nu.marginalia.language.model;

public class UnsupportedLanguageException extends Exception {
}
@@ -7,6 +7,8 @@ public class WordSpan implements Comparable<WordSpan> {
    public final int end;

    public WordSpan(int start, int end) {
        assert end >= start;

        this.start = start;
        this.end = end;
    }
@@ -0,0 +1,236 @@
package nu.marginalia.language.pos;

import it.unimi.dsi.fastutil.longs.LongArrayList;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordSpan;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.BitSet;
import java.util.Collections;
import java.util.List;

public class PosPattern {
    public final LongArrayList pattern = new LongArrayList();
    private static final Logger logger = LoggerFactory.getLogger(PosPattern.class);

    public long[] toArray() {
        return pattern.toLongArray();
    }

    public int size() {
        return pattern.size();
    }

    public PosPattern(PosTagger posTagger, String expression) {
        for (List<String> variants : PosTagPatternParser.parse(posTagger, expression)) {
            pattern.add(posTagger.encodeTagNames(variants));
        }

        if (pattern.isEmpty()) {
            throw new IllegalArgumentException("Zero length patterns are not allowed");
        }
    }

    public int matchSentence(DocumentSentence sentence, List<WordSpan> ret) {
        long first = pattern.getLong(0);
        int cnt = 0;

        // Fast case for 1-length patterns
        if (pattern.size() == 1) {
            for (int i = 0; i < sentence.length(); i++) {
                if (0L == (sentence.posTags[i] & first)) continue;
                ret.add(new WordSpan(i, i+1));
                cnt++;
            }

            return cnt;
        }

        pattern:
        for (int i = 0; i <= sentence.length() - pattern.size(); i++) {

            // Start by matching against the beginning of the pattern
            // as a fast path
            if (0L == (sentence.posTags[i] & first)) continue;


            int j;
            for (j = 1; j < pattern.size(); j++) {
                if (0L == (sentence.posTags[i + j] & pattern.getLong(j)))
                    continue pattern;
            }

            // Ensure no commas exist in the sentence except for the last word
            int nextCommaPos = sentence.nextCommaPos(i);
            if (nextCommaPos < i + pattern.size() - 1) {
                // note the i++ in the for loop will also be added here, so we're positioned after the next comma
                // beginning of the next iteration
                i = nextCommaPos;
                continue;
            }

            // Finally add the span
            ret.add(new WordSpan(i, i+j));
            cnt++;
        }

        return cnt;
    }

    public boolean isMatch(DocumentSentence sentence, int pos) {
        if (pos + pattern.size() > sentence.length()) {
            return false;
        }

        long first = pattern.getLong(0);
        if (0 == (sentence.posTags[pos] & first)) return false;
        else if (pattern.size() == 1) return true;

        int nextCommaPos = sentence.nextCommaPos(pos);
        if (nextCommaPos < pos + pattern.size() - 1) {
            return false;
        }

        for (int j = 1; j < pattern.size(); j++) {
            if (0L == (sentence.posTags[pos+j] & pattern.getLong(j)))
                return false;
        }
        return true;
    }

    /** Return a bit set for every position where this pattern matches the tag sequence provided */
    public BitSet matchTagPattern(long[] tags) {
        BitSet bs = new BitSet(tags.length);

        // Fast case for length = 1
        if (pattern.size() == 1) {
            long patternVal = pattern.getLong(0);

            for (int i = 0; i < tags.length; i++) {
                bs.set(i, (patternVal & tags[i]) != 0L);
            }

            return bs;
        }

        pattern:
        for (int i = 0; i <= tags.length - pattern.size(); i++) {
            int j;

            for (j = 0; j < pattern.size(); j++) {
                if (0L == (tags[i+j] & pattern.getLong(j)))
                    continue pattern;
            }

            bs.set(i);
        }

        return bs;
    }
}

class PosTagPatternParser {
    private boolean inverted;
    private boolean inParen;

    private final List<List<String>> variants = new ArrayList<>();
    private final List<String> allTags;

    public PosTagPatternParser(PosTagger posTagger) {
        allTags = Collections.unmodifiableList(posTagger.tags());
    }

    public static List<List<String>> parse(PosTagger posTagger, String expression) {

        PosTagPatternParser patternBuilder = new PosTagPatternParser(posTagger);

        for (String token : tokenize(expression)) {
            switch (token) {
                case "!" -> patternBuilder.invert();
                case "(" -> patternBuilder.parenOpen();
                case ")" -> patternBuilder.parenClose();
                default -> patternBuilder.addToken(token);
            }
        }

        return patternBuilder.variants;
    }

    private static List<String> tokenize(String expression) {
        List<String> tokens = new ArrayList<>();
        int pos = 0;

        while (pos < expression.length()) {
            char c = expression.charAt(pos);
            if ("()!".indexOf(c) >= 0) {
                tokens.add(expression.substring(pos, pos + 1));
                pos++;
            }
            else if (Character.isSpaceChar(c)) {
                pos++;
            }
            else {
                int end = pos + 1;
                while (end < expression.length()) {
                    int ce = expression.charAt(end);
                    if ("() ".indexOf(ce) >= 0) {
                        break;
                    }
                    else {
                        end++;
                    }
                }
                tokens.add(expression.substring(pos, end));
                pos = end;
            }
        }

        return tokens;

    }

    public void invert() {
        inverted = true;
    }

    public void parenOpen() {
        inParen = true;
        beginToken();
    }

    public void parenClose() {
        inParen = false;
        inverted = false;
    }

    private void beginToken() {
        variants.add(new ArrayList<>());
        if (inverted)
            variants.getLast().addAll(allTags);
    }

    public void addToken(String token) {
        if (!inParen) beginToken();

        List<String> tokensExpanded;
        if (token.endsWith("*")) {
            String prefix = token.substring(0, token.length() - 1);
            tokensExpanded = allTags.stream().filter(str -> prefix.isEmpty() || str.startsWith(prefix)).toList();
        }
        else {
            tokensExpanded = List.of(token);
        }

        if (inverted) {
            variants.getLast().removeAll(tokensExpanded);
        }
        else {
            variants.getLast().addAll(tokensExpanded);
        }

        if (!inParen) {
            inverted = false;
        }
    }
}
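The pattern grammar the parser accepts is small: whitespace-separated tokens, `*` as a tag-name prefix wildcard, parentheses for alternatives, and `!` for negation. Each position compiles down to a 64 bit mask of acceptable tags, so matching one sentence position is a single AND. A short illustration using the test constructor of PosTagger from the next file; the tag inventory is made up for the example:

    // Hypothetical tag inventory; PosTagger(String, List<String>) is the test constructor
    PosTagger tagger = new PosTagger("en", List.of("NN", "NNS", "NNP", "JJ", "IN", "DT"));

    // Position 1: any tag whose name starts with NN; position 2: anything except IN or DT
    PosPattern pattern = new PosPattern(tagger, "NN* !(IN DT)");

    // Each pattern element is a bit mask; sentence position i matches element j when
    // (sentence.posTags[i] & pattern.toArray()[j]) != 0
    long nounMask = pattern.toArray()[0]; // bits for NN, NNS and NNP OR:ed together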
@@ -0,0 +1,9 @@
package nu.marginalia.language.pos;

public enum PosPatternCategory {
    NAME,
    NOUN,
    KEYWORD,
    TITLE,
    SUBJECT_SUFFIX
}
@@ -0,0 +1,130 @@
package nu.marginalia.language.pos;


import com.github.datquocnguyen.RDRPOSTagger;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntList;
import org.apache.commons.lang3.StringUtils;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;

public class PosTagger {
    private final RDRPOSTagger rdrposTagger;
    public final Map<String, Integer> tagDict;
    public final List<String> tagNames;
    private final String isoCode;

    public PosTagger(String isoCode, Path dictFilePath, Path rdrFilePath) throws IOException {
        this.isoCode = isoCode;
        rdrposTagger = new RDRPOSTagger(dictFilePath, rdrFilePath);

        List<String> tagNames = new ArrayList<>();
        HashMap<String, Integer> tags = new HashMap<>();
        try (var linesStream = Files.lines(dictFilePath)) {
            linesStream.map(line -> StringUtils.split(line, " ", 2))
                    .filter(line -> line.length==2)
                    .map(line -> line[1])
                    .distinct()
                    .forEach(tag -> {
                        tags.putIfAbsent(tag, tagNames.size());
                        tagNames.add(tag);
                    });

        } catch (IOException e) {
            throw new RuntimeException(e);
        }

        this.tagDict = Collections.unmodifiableMap(tags);
        this.tagNames = Collections.unmodifiableList(tagNames);
    }

    /** Alternate constructor for tests */
    public PosTagger(String isoCode, List<String> tags) {
        this.isoCode = isoCode;
        this.tagNames = tags.stream().distinct().toList();
        this.tagDict = tags.stream().distinct().collect(Collectors.toMap(Function.identity(), tagNames::indexOf, (a,b)->a));
        this.rdrposTagger = null;
    }

    public long[] tagSentence(String[] words) {
        String[] tags;

        // Unclear if this is necessary, but the library does have a different function for tagging English
        if ("en".equalsIgnoreCase(isoCode)) {
            tags = rdrposTagger.tagsForEnSentence(words);
        }
        else {
            tags = rdrposTagger.tagSentence(words);
        }

        // Encode the tags as a bit mask.  These will just have one (or zero) bits set
        // but will match against more complex masks

        long[] encodedTags = new long[tags.length];
        for (int i = 0; i < encodedTags.length; i++) {
            encodedTags[i] = encodeTagName(tags[i]);
        }

        return encodedTags;
    }

    public long encodeTagName(String tagName) {
        Integer tag = tagDict.get(tagName);
        if (tag == null) {
            return 0L;
        }
        return 1L << tag;
    }

    public long encodeTagNames(List<String> tagNames) {
        long ret = 0;
        for (String tagName : tagNames) {
            ret |= encodeTagName(tagName);
        }
        return ret;
    }

    public String decodeTagName(long encodedTag) {
        if (encodedTag == 0)
            return "?";
        return tagName(Long.numberOfTrailingZeros(encodedTag));
    }

    public String tagName(int tagId) {
        if (tagId < 0 || tagId >= tagNames.size())
            return "?";
        return tagNames.get(tagId);
    }

    public OptionalInt tagId(String tagName) {
        Integer id = tagDict.get(tagName);
        if (id == null)
            return OptionalInt.empty();
        return OptionalInt.of(id);
    }

    public List<String> tags() {
        var ret = new ArrayList<>(tagDict.keySet());
        ret.sort(Comparator.naturalOrder());
        return ret;
    }

    public IntList tagIdsForPrefix(String tagNamePrefix) {
        IntArrayList ret = new IntArrayList();
        tagDict.entrySet().stream()
                .filter(tag -> tag.getKey().startsWith(tagNamePrefix))
                .mapToInt(Map.Entry::getValue)
                .forEach(ret::add);
        return ret;
    }

    @Override
    public String toString() {
        return "PosTaggingData{ tags=" + tagDict + '}';
    }
}
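Because each tag id maps to a single bit (1L << tag), a pattern element can OR several tags together and a membership test is one bitwise AND. A short worked example against the test constructor above:

    PosTagger tagger = new PosTagger("en", List.of("NN", "VB", "JJ"));

    long nn   = tagger.encodeTagName("NN");                  // 0b001
    long mask = tagger.encodeTagNames(List.of("NN", "JJ"));  // 0b101

    assert (nn & mask) != 0L;                 // NN is accepted by the NN|JJ mask
    assert tagger.decodeTagName(nn).equals("NN");
    assert tagger.encodeTagName("XYZ") == 0L; // unknown tags encode to zero bits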
@@ -1,17 +1,20 @@
package nu.marginalia.language.sentence;

import com.github.datquocnguyen.RDRPOSTagger;
import com.google.inject.Inject;
import nu.marginalia.LanguageModels;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.model.UnsupportedLanguageException;
import nu.marginalia.language.pos.PosPattern;
import nu.marginalia.language.sentence.tag.HtmlStringTagger;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.language.sentence.tag.HtmlTaggedString;
import nu.marginalia.language.stemming.Stemmer;
import nu.marginalia.segmentation.NgramLexicon;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.stemmer.PorterStemmer;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
@@ -29,12 +32,11 @@ import java.util.*;
 */
public class SentenceExtractor {

    private final LanguageConfiguration languageConfiguration;
    private SentenceDetectorME sentenceDetector;
    private static RDRPOSTagger rdrposTagger;

    private static NgramLexicon ngramLexicon = null;

    private final PorterStemmer porterStemmer = new PorterStemmer();
    private static final Logger logger = LoggerFactory.getLogger(SentenceExtractor.class);

    private static final SentencePreCleaner sentencePrecleaner = new SentencePreCleaner();
@@ -46,8 +48,10 @@ public class SentenceExtractor {
    static final int MAX_SENTENCE_COUNT = 1000;

    @Inject
    public SentenceExtractor(LanguageModels models)
    public SentenceExtractor(LanguageConfiguration languageConfiguration, LanguageModels models)
    {
        this.languageConfiguration = languageConfiguration;

        try (InputStream modelIn = new FileInputStream(models.openNLPSentenceDetectionData.toFile())) {
            var sentenceModel = new SentenceModel(modelIn);
            sentenceDetector = new SentenceDetectorME(sentenceModel);
@@ -61,21 +65,14 @@ public class SentenceExtractor {
        if (ngramLexicon == null) {
            ngramLexicon = new NgramLexicon(models);
        }

        if (rdrposTagger == null) {
            try {
                rdrposTagger = new RDRPOSTagger(models.posDict, models.posRules);
            } catch (Exception ex) {
                throw new IllegalStateException(ex);
            }
        }
    }

    }

    public DocumentLanguageData extractSentences(Document doc) {
    public DocumentLanguageData extractSentences(Document doc) throws UnsupportedLanguageException {
        var language = languageConfiguration.identifyLanguage(doc).orElseThrow(UnsupportedLanguageException::new);

        final List<DocumentSentence> textSentences = new ArrayList<>();

        final List<HtmlTaggedString> taggedStrings = HtmlStringTagger.tagDocumentStrings(doc);

        final int totalTextLength = taggedStrings.stream().mapToInt(HtmlTaggedString::length).sum();
@@ -85,7 +82,7 @@ public class SentenceExtractor {
            String text = taggedString.string();

            textSentences.addAll(
                    extractSentencesFromString(text, taggedString.tags())
                    extractSentencesFromString(language, text, taggedString.tags())
            );

            if (documentText.isEmpty()) {
@@ -96,32 +93,62 @@ public class SentenceExtractor {
            }
        }

        return new DocumentLanguageData(textSentences, documentText.toString());
        return new DocumentLanguageData(language, textSentences, documentText.toString());
    }

    public DocumentLanguageData extractSentences(String text, String title) {
        var textSentences = extractSentencesFromString(text, EnumSet.noneOf(HtmlTag.class));
        var titleSentences = extractSentencesFromString(title.toLowerCase(), EnumSet.of(HtmlTag.TITLE));
        LanguageDefinition language = languageConfiguration.identifyLanguage(text, "en")
                .orElseThrow(() -> new RuntimeException("Language not found for default isoCode 'en'"));

        var textSentences = extractSentencesFromString(language, text, EnumSet.noneOf(HtmlTag.class));
        var titleSentences = extractSentencesFromString(language, title.toLowerCase(), EnumSet.of(HtmlTag.TITLE));

        List<DocumentSentence> combined = new ArrayList<>(textSentences.size() + titleSentences.size());
        combined.addAll(titleSentences);
        combined.addAll(textSentences);

        return new DocumentLanguageData(
                language,
                combined,
                text);
    }

    public DocumentSentence extractSentence(String text, EnumSet<HtmlTag> htmlTags) {
        var wordsAndSeps = SentenceSegmentSplitter.splitSegment(text, MAX_SENTENCE_LENGTH);

    public DocumentLanguageData extractSentences(String text) {
        LanguageDefinition language = languageConfiguration.identifyLanguage(text, "en")
                .orElseThrow(() -> new RuntimeException("Language not found for default isoCode 'en'"));

        var sentences = extractSentencesFromString(language, text, EnumSet.noneOf(HtmlTag.class));

        return new DocumentLanguageData(language, sentences, text);
    }


    public DocumentSentence extractSentence(LanguageDefinition language,
                                            String text,
                                            EnumSet<HtmlTag> htmlTags) {
        final Stemmer stemmer = language.stemmer();

        var wordsAndSeps = new SentenceSegmentSplitter(language).splitSegment(text, MAX_SENTENCE_LENGTH);

        String[] words = wordsAndSeps.words();
        BitSet seps = wordsAndSeps.separators();
        String[] lc = new String[words.length];
        String[] stemmed = new String[words.length];
        long[] posTags = language.posTagSentence(words);

        BitSet isCapitalized = new BitSet(words.length);
        BitSet isAllCaps = new BitSet(words.length);
        BitSet includeInStemming;

        PosPattern inclusionPattern = stemmer.inclusionPatten();
        if (inclusionPattern == null) {
            includeInStemming = new BitSet(lc.length);
            includeInStemming.set(0, lc.length);
        }
        else {
            includeInStemming = inclusionPattern.matchTagPattern(posTags);
        }

        for (int i = 0; i < words.length; i++) {
            lc[i] = stripPossessive(words[i].toLowerCase());
@@ -134,7 +161,7 @@ public class SentenceExtractor {
            }

            try {
                stemmed[i] = porterStemmer.stem(lc[i]);
                stemmed[i] = stemmer.stem(lc[i]);
            }
            catch (Exception ex) {
                stemmed[i] = "NN"; // ???
@@ -144,16 +171,18 @@ public class SentenceExtractor {
        return new DocumentSentence(
                seps,
                lc,
                rdrposTagger.tagsForEnSentence(words),
                posTags,
                stemmed,
                htmlTags,
                isCapitalized,
                isAllCaps
                isAllCaps,
                includeInStemming
        );
    }

    public List<DocumentSentence> extractSentencesFromString(String text, EnumSet<HtmlTag> htmlTags) {
        String[] sentences;
    public List<DocumentSentence> extractSentencesFromString(LanguageDefinition language, String text, EnumSet<HtmlTag> htmlTags) {
        final Stemmer stemmer = language.stemmer();


        // Safety net against malformed data DoS attacks,
        // found 5+ MB <p>-tags in the wild that just break
@@ -167,7 +196,7 @@ public class SentenceExtractor {
        text = normalizeSpaces(text);

        // Split into sentences

        String[] sentences;
        try {
            sentences = sentenceDetector.sentDetect(text);
        }
@@ -189,22 +218,34 @@ public class SentenceExtractor {

        List<DocumentSentence> ret = new ArrayList<>(sentences.length);

        SentenceSegmentSplitter sentenceSegmentSplitter = new SentenceSegmentSplitter(language);

        if (isNaturalLanguage) {
            // Natural language text; do POS tagging and stemming

            for (String sent : sentences) {
                var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
                var wordsAndSeps = sentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
                var tokens = wordsAndSeps.words();
                var separators = wordsAndSeps.separators();
                var posTags = rdrposTagger.tagsForEnSentence(tokens);
                var posTags = language.posTagSentence(tokens);
                var tokensLc = new String[tokens.length];
                var stemmed = new String[tokens.length];

                BitSet isCapitalized = new BitSet(tokens.length);
                BitSet isAllCaps = new BitSet(tokens.length);
                BitSet includeInStemming;

                PosPattern inclusionPattern = stemmer.inclusionPatten();
                if (inclusionPattern == null) {
                    includeInStemming = new BitSet(tokens.length);
                    includeInStemming.set(0, tokens.length);
                }
                else {
                    includeInStemming = inclusionPattern.matchTagPattern(posTags);
                }

                for (int i = 0; i < tokens.length; i++) {
                    if (tokens[i].length() > 0 && Character.isUpperCase(tokens[i].charAt(0))) {
                    if (!tokens[i].isEmpty() && Character.isUpperCase(tokens[i].charAt(0))) {
                        isCapitalized.set(i);
                    }
                    if (StringUtils.isAllUpperCase(tokens[i])) {
@@ -221,13 +262,13 @@ public class SentenceExtractor {
                    }

                    try {
                        stemmed[i] = porterStemmer.stem(tokens[i]);
                        stemmed[i] = stemmer.stem(tokens[i]);
                    }
                    catch (Exception ex) {
                        stemmed[i] = "NN"; // ???
                    }
                }
                ret.add(new DocumentSentence(separators, tokensLc, posTags, stemmed, htmlTags, isCapitalized, isAllCaps));
                ret.add(new DocumentSentence(separators, tokensLc, posTags, stemmed, htmlTags, isCapitalized, isAllCaps, includeInStemming));
            }
        }
        else {
@@ -235,21 +276,22 @@ public class SentenceExtractor {
            // as this is not likely to be useful

            for (String sent : sentences) {
                var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
                var wordsAndSeps = sentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
                var tokens = wordsAndSeps.words();
                var separators = wordsAndSeps.separators();
                var posTags = new String[tokens.length];
                Arrays.fill(posTags, "X"); // Placeholder POS tag
                var posTags = new long[tokens.length];
                var tokensLc = new String[tokens.length];
                var stemmed = new String[tokens.length];

                BitSet isCapitalized = new BitSet(tokens.length);
                BitSet isAllCaps = new BitSet(tokens.length);
                BitSet includeInStemming = new BitSet(tokens.length);
                includeInStemming.set(0, tokens.length);

                for (int i = 0; i < tokensLc.length; i++) {
                    var originalVal = tokens[i];

                    if (tokens[i].length() > 0 && Character.isUpperCase(tokens[i].charAt(0))) {
                    if (!tokens[i].isEmpty() && Character.isUpperCase(tokens[i].charAt(0))) {
                        isCapitalized.set(i);
                    }
                    if (StringUtils.isAllUpperCase(tokens[i])) {
@@ -264,7 +306,7 @@ public class SentenceExtractor {
                    stemmed[i] = tokensLc[i]; // we don't stem non-language words
                }

                ret.add(new DocumentSentence(separators, tokensLc, posTags, stemmed, htmlTags, isAllCaps, isCapitalized));
                ret.add(new DocumentSentence(separators, tokensLc, posTags, stemmed, htmlTags, isCapitalized, isAllCaps, includeInStemming));
            }

        }
@@ -2,7 +2,8 @@ package nu.marginalia.language.sentence;
import com.google.common.base.CharMatcher;
import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.language.encoding.UnicodeNormalization;
import nu.marginalia.language.model.LanguageDefinition;

import java.util.ArrayList;
import java.util.BitSet;

@@ -13,10 +14,11 @@ import static nu.marginalia.language.WordPatterns.MAX_WORD_LENGTH;

public class SentenceSegmentSplitter {

    private final UnicodeNormalization unicodeNormalization;

    public record SeparatedSentence(String[] words, BitSet separators) { }

    private static final CharMatcher noiseCharacterMatcher = CharMatcher.anyOf("/*-");

    private static final Pattern wordBreakPattern;

    static {
@@ -31,13 +33,17 @@ public class SentenceSegmentSplitter {
        }
    }

    SentenceSegmentSplitter(LanguageDefinition languageDefinition) {
        this.unicodeNormalization = languageDefinition.unicodeNormalization();
    }

    /** Split a sentence into words and separators.
     *
     * @param segment The sentence to split
     * @return A list of words and separators
     */
    public static SeparatedSentence splitSegment(String segment, int maxLength) {
        String flatSegment = AsciiFlattener.flattenUnicode(segment);
    public SeparatedSentence splitSegment(String segment, int maxLength) {
        String flatSegment = unicodeNormalization.flattenUnicode(segment);

        var matcher = wordBreakPattern.matcher(flatSegment);

@@ -3,14 +3,15 @@ package nu.marginalia.language.sentence;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.LanguageModels;
import nu.marginalia.language.config.LanguageConfiguration;

@Singleton
public class ThreadLocalSentenceExtractorProvider {
    private final ThreadLocal<SentenceExtractor> sentenceExtractorThreadLocal;

    @Inject
    public ThreadLocalSentenceExtractorProvider(LanguageModels languageModels) {
        sentenceExtractorThreadLocal = ThreadLocal.withInitial(() -> new SentenceExtractor(languageModels));
    public ThreadLocalSentenceExtractorProvider(LanguageConfiguration languageConfiguration, LanguageModels languageModels) {
        sentenceExtractorThreadLocal = ThreadLocal.withInitial(() -> new SentenceExtractor(languageConfiguration, languageModels));
    }

    public SentenceExtractor get() {
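Since SentenceExtractor now throws UnsupportedLanguageException for documents in languages without a configuration, callers pulling a thread-local extractor from the provider need a skip path. A hedged usage sketch (variable names are illustrative):

    SentenceExtractor extractor = provider.get(); // one instance per thread
    try {
        DocumentLanguageData dld = extractor.extractSentences(jsoupDocument);
        // ... continue with keyword extraction
    } catch (UnsupportedLanguageException e) {
        // the document is in a language with no LanguageDefinition; skip it
    }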
@@ -0,0 +1,68 @@
package nu.marginalia.language.stemming;

import nu.marginalia.language.pos.PosPattern;
import opennlp.tools.stemmer.snowball.SnowballStemmer;

import javax.annotation.Nullable;

public sealed interface Stemmer {
    String stem(String input);
    @Nullable PosPattern inclusionPatten();

    final class Porter implements Stemmer {
        private static final ca.rmen.porterstemmer.PorterStemmer porterStemmerImpl = new ca.rmen.porterstemmer.PorterStemmer();
        @Nullable
        private final PosPattern inclusionPattern;

        public Porter(@Nullable PosPattern inclusionPattern) {
            this.inclusionPattern = inclusionPattern;
        }

        @Nullable
        public PosPattern inclusionPatten() {
            return inclusionPattern;
        }
        @Override
        public String stem(String input) {
            return porterStemmerImpl.stemWord(input);
        }
    }

    final class Snowball implements Stemmer {
        private final SnowballStemmer snowballStemmer;
        @Nullable
        private final PosPattern inclusionPattern;

        public Snowball(String algorithmName, @Nullable PosPattern inclusionPattern) {
            this.inclusionPattern = inclusionPattern;

            SnowballStemmer.ALGORITHM algorithm = SnowballStemmer.ALGORITHM.valueOf(algorithmName.toUpperCase());
            snowballStemmer = new SnowballStemmer(algorithm);
        }

        @Nullable
        public PosPattern inclusionPatten() {
            return inclusionPattern;
        }

        @Override
        public String stem(String input) {
            // Snowball impl declares return value as CharSequence,
            // but in practice always returns a String
            return (String) snowballStemmer.stem(input);
        }
    }

    final class NoOpStemmer implements Stemmer {

        @Nullable
        public PosPattern inclusionPatten() {
            return null;
        }

        @Override
        public String stem(String input) {
            return input;
        }
    }
}
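The sealed hierarchy mirrors the stemmer element of the language configuration files below: an algorithm of porter/snowball/none plus an optional pospattern child that becomes the inclusion pattern. A sketch of that mapping; the factory is an assumption, only the constructors come from the code above:

    // Hypothetical factory mirroring the <stemmer algorithm=... variant=...> element
    static Stemmer fromConfig(String algorithm, String variant, @Nullable PosPattern inclusionPattern) {
        return switch (algorithm) {
            case "porter" -> new Stemmer.Porter(inclusionPattern);
            case "snowball" -> new Stemmer.Snowball(variant, inclusionPattern);
            case "none" -> new Stemmer.NoOpStemmer();
            default -> throw new IllegalArgumentException("Unknown stemmer: " + algorithm);
        };
    }

    Stemmer swedish = fromConfig("snowball", "SWEDISH", null);
    String stem = swedish.stem("bilarna"); // reduces inflected forms toward a common stem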
@@ -1,7 +1,6 @@
package nu.marginalia.segmentation;

import it.unimi.dsi.fastutil.longs.*;
import nu.marginalia.util.SimpleBlockingThreadPool;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.openzim.ZIMTypes.ZIMFile;
@@ -11,7 +10,7 @@ import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.ForkJoinPool;

public class NgramExtractorMain {
    public static void main(String... args) throws IOException, InterruptedException {
@@ -112,50 +111,45 @@ public class NgramExtractorMain {

        var orderedHasher = HasherGroup.ordered();

        var pool = new SimpleBlockingThreadPool("ngram-extractor",
                Math.clamp(2, Runtime.getRuntime().availableProcessors(), 32),
                32
        );
        try (var pool = new ForkJoinPool(Math.clamp(2, Runtime.getRuntime().availableProcessors(), 32))) {

        reader.forEachTitles((title) -> {
            pool.submitQuietly(() -> {
                LongArrayList orderedHashesTitle = new LongArrayList();
            reader.forEachTitles((title) -> {
                pool.submit(() -> {
                    LongArrayList orderedHashesTitle = new LongArrayList();

                String normalizedTitle = title.replace('_', ' ');
                    String normalizedTitle = title.replace('_', ' ');

                for (var sent : getNgramTitleTerms(normalizedTitle)) {
                    String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
                    orderedHashesTitle.add(orderedHasher.rollingHash(terms));
                }
                synchronized (lexicon) {
                    for (var hash : orderedHashesTitle) {
                        lexicon.incOrderedTitle(hash);
                    for (var sent : getNgramTitleTerms(normalizedTitle)) {
                        String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
                        orderedHashesTitle.add(orderedHasher.rollingHash(terms));
                    }
                }
                    synchronized (lexicon) {
                        for (var hash : orderedHashesTitle) {
                            lexicon.incOrderedTitle(hash);
                        }
                    }
                });

        });

            });
            reader.forEachArticles((title, body) -> {
                pool.submit(() -> {
                    LongArrayList orderedHashesBody = new LongArrayList();

        reader.forEachArticles((title, body) -> {
            pool.submitQuietly(() -> {
                LongArrayList orderedHashesBody = new LongArrayList();

                for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
                    String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
                    orderedHashesBody.add(orderedHasher.rollingHash(terms));
                }

                synchronized (lexicon) {
                    for (var hash : orderedHashesBody) {
                        lexicon.incOrderedBody(hash);
                    for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
                        String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
                        orderedHashesBody.add(orderedHasher.rollingHash(terms));
                    }
                }
            });

        }, p -> true);
                    synchronized (lexicon) {
                        for (var hash : orderedHashesBody) {
                            lexicon.incOrderedBody(hash);
                        }
                    }
                });

        pool.shutDown();
        pool.awaitTermination(10, TimeUnit.DAYS);
            }, p -> true);
        }

        lexicon.saveCounts(countsOutputFile);
    }
@@ -5,16 +5,19 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import nu.marginalia.LanguageModels;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

/** Dictionary with term frequency information for (stemmed) words.
 *
@@ -38,15 +41,23 @@ public class TermFrequencyDict {
    }

    private static Long2IntOpenHashMap load(Path file) throws IOException {
        try (LongArray array = LongArrayFactory.mmapForReadingConfined(file)) {
        try (Arena arena = Arena.ofConfined();
             FileChannel fileChannel = (FileChannel) Files.newByteChannel(file, StandardOpenOption.READ)) {

            int size = (int) Files.size(file) / 16;
            long fileSizeBytes = Files.size(file);
            MemorySegment mappedFile = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileSizeBytes, arena);

            int size = (int) fileSizeBytes / 16;
            var ret = new Long2IntOpenHashMap(size, 0.5f);

            ret.defaultReturnValue(0);

            for (int i = 0; i < size; i++) {
                ret.put(array.get(2 * i), (int) array.get(2 * i + 1));

                long key = mappedFile.getAtIndex(ValueLayout.JAVA_LONG, 2 * i);
                long val = mappedFile.getAtIndex(ValueLayout.JAVA_LONG, 2 * i + 1);

                ret.put(key, (int) val);
            }

            return ret;
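The size = bytes / 16 arithmetic implies the on-disk format is a flat sequence of 16-byte records: a 64 bit term hash followed by a 64 bit count. A hedged sketch of the writing side under that assumption; the method name is made up, and the byte order follows MemorySegment.getAtIndex(), which reads longs in native order:

    static void save(Long2IntOpenHashMap counts, Path file) throws IOException {
        // One 16 byte record per term: long hash, then long count,
        // in native byte order to round-trip through load() above
        ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.nativeOrder());
        try (FileChannel ch = FileChannel.open(file,
                StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE)) {
            for (var e : counts.long2IntEntrySet()) {
                buf.clear();
                buf.putLong(e.getLongKey()).putLong(e.getIntValue());
                buf.flip();
                ch.write(buf);
            }
        }
    }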
@@ -0,0 +1,109 @@
<?xml version="1.0"?>
<!DOCTYPE languages [
    <!ELEMENT languages (language*,resource*)>
    <!ELEMENT language (keywordHash,unicodeNormalization,stemmer,sentenceDetector,rdrTagger?,ngrams*)>

    <!ELEMENT resource EMPTY>
    <!ATTLIST resource
        id ID #REQUIRED
        md5 CDATA #REQUIRED
        path CDATA #REQUIRED
        href CDATA #REQUIRED
        >

    <!ATTLIST language
        isoCode ID #REQUIRED
        name CDATA #REQUIRED
        display (rtl|ltr) #REQUIRED
        disabled (true|false) "false"
        >

    <!ELEMENT unicodeNormalization EMPTY>
    <!ATTLIST unicodeNormalization
        algorithm (minimal|e-accents|german|maximal-latin) #REQUIRED
        >

    <!ELEMENT stemmer (pospattern?)>
    <!ATTLIST stemmer
        algorithm (porter|snowball|none) #REQUIRED
        variant CDATA #IMPLIED
        >

    <!ELEMENT keywordHash (#PCDATA)>
    <!ATTLIST keywordHash
        algorithm (asciish|utf8) #REQUIRED
        >

    <!ELEMENT rdrTagger EMPTY>
    <!ATTLIST rdrTagger
        dictId IDREF #REQUIRED
        rdrId IDREF #REQUIRED
        >

    <!ELEMENT ngrams (pospattern*)>
    <!ATTLIST ngrams type (noun|name|subject-suffix|title|keyword) #REQUIRED>

    <!ELEMENT pospattern (#PCDATA)>

    <!ELEMENT sentenceDetector EMPTY>
    <!ATTLIST sentenceDetector algorithm (none|opennlp) #REQUIRED>
]>

<languages>
    <language isoCode="en" name="English" display="ltr">
        <keywordHash algorithm="asciish" />
        <stemmer algorithm="porter">
            <pospattern>!(IN TO CC DT)</pospattern>
        </stemmer>
        <sentenceDetector algorithm="opennlp"/>
        <unicodeNormalization algorithm="maximal-latin" />
        <rdrTagger dictId="pos-dict-en" rdrId="pos-rdr-en" />
        <ngrams type="name">
            <pospattern>NNP*</pospattern>
            <pospattern>NNP* NNP*</pospattern>
            <pospattern>NNP* (NNP* IN DT CC) NNP*</pospattern>
            <pospattern>NNP* (NNP* IN DT CC) (NNP* IN DT CC) NNP*</pospattern>
        </ngrams>
        <ngrams type="noun">
            <pospattern>VBG</pospattern>
            <pospattern>RB VBG</pospattern>
            <pospattern>(NNP* JJ)</pospattern>
            <pospattern>(NN* JJ) NN*</pospattern>
            <pospattern>(NN* JJ) (NN* JJ) NN*</pospattern>
            <pospattern>(NN* JJ) (NN* JJ) (NN* JJ) NN*</pospattern>
            <pospattern>(NNP* JJ) (NNP* IN TO CC) NNP*</pospattern>
            <pospattern>(NNP* JJ) (NNP* IN TO CC) DT NNP*</pospattern>
            <pospattern>(NNP* JJ) (NNP* IN TO CC) (NNP* IN TO CC) NNP*</pospattern>
        </ngrams>
        <ngrams type="subject-suffix">
            <pospattern>(VBD VBZ)</pospattern>
            <pospattern>MD VB</pospattern>
            <pospattern>VBZ DT</pospattern>
            <pospattern>(DT RB VBD VBP VBN JJ*) (VBD VBG VBP VBN VBZ)</pospattern>
        </ngrams>
        <ngrams type="title">
            <pospattern>!(CC IN DT TO)</pospattern>
            <pospattern>!CC !(IN DT TO)</pospattern>
            <pospattern>!CC * !(IN DT TO)</pospattern>
            <pospattern>!CC * * !(IN DT TO)</pospattern>
        </ngrams>
        <ngrams type="keyword">
            <!-- length = 1 -->
            <pospattern>(N* VBG VBN JJ* R* VBG)</pospattern>
            <!-- length = 2 -->
            <pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
            <pospattern>(N* VBG VBN) CD</pospattern>
            <!-- length = 3 -->
            <pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
            <pospattern>NNP* (IN TO CC NNP*) (N* VBG VBN)</pospattern>
            <pospattern>(N* VBG VBN) (N* VBG VBN) CD</pospattern>
            <!-- length = 4 -->
            <pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
            <pospattern>NNP* (DT IN TO CC) (IN TO CC) NNP*</pospattern>
        </ngrams>
    </language>

    <resource id="pos-dict-en" md5="" path="rdr/English.DICT" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.DICT" />
    <resource id="pos-rdr-en" md5="" path="rdr/English.RDR" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.RDR" />

</languages>
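A language element maps one-to-one onto the LanguageDefinition fields introduced earlier. A minimal sketch of pulling the per-language attributes out with the JDK's DOM parser, non-validating since the sample entries don't strictly follow the DTD's declared child order; the real LanguageConfiguration loader is not part of this diff:

    import javax.xml.parsers.DocumentBuilderFactory;
    import org.w3c.dom.Element;
    import org.w3c.dom.NodeList;

    var doc = DocumentBuilderFactory.newInstance()
            .newDocumentBuilder()
            .parse("languages.xml");

    NodeList languages = doc.getElementsByTagName("language");
    for (int i = 0; i < languages.getLength(); i++) {
        Element lang = (Element) languages.item(i);
        Element stemmer = (Element) lang.getElementsByTagName("stemmer").item(0);
        System.out.printf("%s (%s): stemmer=%s%n",
                lang.getAttribute("isoCode"),
                lang.getAttribute("name"),
                stemmer.getAttribute("algorithm"));
    }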
@@ -0,0 +1,135 @@
<?xml version="1.0"?>
<!DOCTYPE languages [
    <!ELEMENT languages (language*,resource*)>
    <!ELEMENT language (keywordHash,unicodeNormalization,stemmer,sentenceDetector,rdrTagger?,ngrams*)>

    <!ELEMENT resource EMPTY>
    <!ATTLIST resource
        id ID #REQUIRED
        md5 CDATA #REQUIRED
        path CDATA #REQUIRED
        href CDATA #REQUIRED
        >

    <!ATTLIST language
        isoCode ID #REQUIRED
        name CDATA #REQUIRED
        display (rtl|ltr) #REQUIRED
        disabled (true|false) "false"
        >

    <!ELEMENT unicodeNormalization EMPTY>
    <!ATTLIST unicodeNormalization
        algorithm (minimal|e-accents|german|maximal-latin) #REQUIRED
        >

    <!ELEMENT stemmer (pospattern?)>
    <!ATTLIST stemmer
        algorithm (porter|snowball|none) #REQUIRED
        variant CDATA #IMPLIED
        >

    <!ELEMENT keywordHash (#PCDATA)>
    <!ATTLIST keywordHash
        algorithm (asciish|utf8) #REQUIRED
        >

    <!ELEMENT rdrTagger EMPTY>
    <!ATTLIST rdrTagger
        dictId IDREF #REQUIRED
        rdrId IDREF #REQUIRED
        >

    <!ELEMENT ngrams (pospattern*)>
    <!ATTLIST ngrams type (noun|name|subject-suffix|title|keyword) #REQUIRED>

    <!ELEMENT pospattern (#PCDATA)>

    <!ELEMENT sentenceDetector EMPTY>
    <!ATTLIST sentenceDetector algorithm (none|opennlp) #REQUIRED>
]>

<languages>
    <language isoCode="en" name="English" display="ltr">
        <keywordHash algorithm="asciish" />
        <stemmer algorithm="porter">
            <pospattern>!(IN TO CC DT)</pospattern>
        </stemmer>
        <sentenceDetector algorithm="opennlp"/>
        <unicodeNormalization algorithm="maximal-latin" />
        <rdrTagger dictId="pos-dict-en" rdrId="pos-rdr-en" />
        <ngrams type="name">
            <pospattern>NNP*</pospattern>
            <pospattern>NNP* NNP*</pospattern>
            <pospattern>NNP* (NNP* IN DT CC) NNP*</pospattern>
            <pospattern>NNP* (NNP* IN DT CC) (NNP* IN DT CC) NNP*</pospattern>
        </ngrams>
        <ngrams type="noun">
            <pospattern>VBG</pospattern>
            <pospattern>RB VBG</pospattern>
            <pospattern>(NNP* JJ)</pospattern>
            <pospattern>(NN* JJ) NN*</pospattern>
            <pospattern>(NN* JJ) (NN* JJ) NN*</pospattern>
            <pospattern>(NN* JJ) (NN* JJ) (NN* JJ) NN*</pospattern>
            <pospattern>(NNP* JJ) (NNP* IN TO CC) NNP*</pospattern>
            <pospattern>(NNP* JJ) (NNP* IN TO CC) DT NNP*</pospattern>
            <pospattern>(NNP* JJ) (NNP* IN TO CC) (NNP* IN TO CC) NNP*</pospattern>
        </ngrams>
        <ngrams type="subject-suffix">
            <pospattern>(VBD VBZ)</pospattern>
            <pospattern>MD VB</pospattern>
            <pospattern>VBZ DT</pospattern>
            <pospattern>(DT RB VBD VBP VBN JJ*) (VBD VBG VBP VBN VBZ)</pospattern>
        </ngrams>
        <ngrams type="title">
            <pospattern>!(CC IN DT TO)</pospattern>
            <pospattern>!CC !(IN DT TO)</pospattern>
            <pospattern>!CC * !(IN DT TO)</pospattern>
            <pospattern>!CC * * !(IN DT TO)</pospattern>
        </ngrams>
        <ngrams type="keyword">
            <!-- length = 1 -->
            <pospattern>(N* VBG VBN JJ* R* VBG)</pospattern>
            <!-- length = 2 -->
            <pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
            <pospattern>(N* VBG VBN) CD</pospattern>
            <!-- length = 3 -->
            <pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
            <pospattern>NNP* (IN TO CC NNP*) (N* VBG VBN)</pospattern>
            <pospattern>(N* VBG VBN) (N* VBG VBN) CD</pospattern>
            <!-- length = 4 -->
            <pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
            <pospattern>NNP* (DT IN TO CC) (IN TO CC) NNP*</pospattern>
        </ngrams>

    </language>
    <language isoCode="sv" name="Swedish" display="ltr">
        <keywordHash algorithm="asciish" />
        <stemmer algorithm="snowball" variant="SWEDISH" />
        <sentenceDetector algorithm="opennlp"/>
        <unicodeNormalization algorithm="e-accents" />
        <rdrTagger dictId="pos-dict-sv" rdrId="pos-rdr-sv" />
        <ngrams type="name">
            <pospattern>PROPN</pospattern>
            <pospattern>PROPN PROPN</pospattern>
            <pospattern>PROPN PROPN PROPN</pospattern>
            <pospattern>PROPN PROPN PROPN PROPN</pospattern>
        </ngrams>
    </language>
    <language isoCode="fr" name="French" display="ltr">
        <keywordHash algorithm="asciish" />
        <stemmer algorithm="snowball" variant="FRENCH" />
        <sentenceDetector algorithm="opennlp"/>
    </language>
    <language isoCode="de" name="German" display="ltr">
        <keywordHash algorithm="asciish" />
        <stemmer algorithm="snowball" variant="GERMAN" />
        <sentenceDetector algorithm="opennlp"/>
        <unicodeNormalization algorithm="german" />
    </language>
    <resource id="pos-dict-en" md5="" path="rdr/English.DICT" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.DICT" />
    <resource id="pos-rdr-en" md5="" path="rdr/English.RDR" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.RDR" />
    <resource id="pos-dict-sv" md5="" path="rdr/Swedish.DICT" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/ud-treebanks-v2.4/UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu.UPOS.DICT" />
    <resource id="pos-rdr-sv" md5="" path="rdr/Swedish.RDR" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/ud-treebanks-v2.4/UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu.UPOS.RDR" />

</languages>
@@ -0,0 +1,277 @@
@import it.unimi.dsi.fastutil.ints.IntList
@import nu.marginalia.language.model.WordRep
@import nu.marginalia.language.model.DocumentSentence
@import nu.marginalia.language.model.LanguageDefinition
@import java.util.*
@import java.util.stream.Collectors
@import java.util.stream.IntStream

@param String textSample
@param LanguageDefinition language
@param List<DocumentSentence> sentences
@param Map<Long, String> tagColors
@param Collection<WordRep> tfIdfReps
@param Collection<WordRep> titleReps
@param Collection<WordRep> nameLikeReps
@param Collection<WordRep> subjectLikeReps
@param Collection<String> artifacts
@param Collection<String> importantWords
@param Map<String, IntList> positionedWords


<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>NLP Debug Tool</title>
    <script src="https://cdn.tailwindcss.com"></script>
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
    <style>
        .sentence-boundary {
            border-left: 3px solid #3b82f6;
        }
        ruby rt {
            font-size: 0.65em;
            color: #6b7280;
        }
    </style>
</head>
<body class="bg-gray-50 min-h-screen">
<div class="container mx-auto px-4 py-8 max-w-6xl">
    <!-- Header -->
    <div class="mb-8">
        <h1 class="text-3xl font-bold text-gray-900 mb-2">
            <i class="fas fa-microscope text-blue-600 mr-3"></i>
            Language Processing Debug Tool
        </h1>
        <p class="text-gray-600">Inspect and debug text processing pipeline components</p>
    </div>

    <!-- Input Section -->
    <div class="bg-white rounded-lg shadow-sm border border-gray-200 mb-6">
        <form method="post">
            <div class="p-4 border-b border-gray-200">
                <h2 class="text-lg font-semibold text-gray-900 mb-3">
                    <i class="fas fa-edit text-green-600 mr-2"></i>
                    Input Text
                </h2>

                <textarea name="textSample"
                          class="w-full p-4 border border-gray-300 rounded-md focus:ring-2 focus:ring-blue-500 focus:border-blue-500 resize-none"
                          rows="4"
                          placeholder="Enter your text here to analyze...">${textSample}</textarea>
                <div class="flex justify-between items-center mt-3">
                    <button class="px-4 py-2 bg-blue-600 text-white rounded-md hover:bg-blue-700 transition-colors">
                        <i class="fas fa-cog mr-2"></i>Analyze
                    </button>
                </div>
            </div>
        </form>
    </div>

    <!-- Results Grid -->
    <div class="space-y-6">

        <!-- Sentence Breakdown with POS Tags -->
        <div class="bg-white rounded-lg shadow-sm border border-gray-200">
            <div class="p-4 border-b border-gray-200">
                <h2 class="text-lg font-semibold text-gray-900">
                    <i class="fas fa-list-ol text-purple-600 mr-2"></i>
                    Sentence Breakdown & POS Tags
                </h2>
                @if (language != null)
                    <div class="text-sm text-gray-500 mt-1">Auto-detected: ${language.displayName()} (${language.isoCode()})</div>
                @endif
            </div>
            @if (sentences != null)
                @for (DocumentSentence sentence : sentences)
                    <div class="p-4 space-y-4">
                        <div class="sentence-boundary pl-4 py-4 rounded">
                            @for (int pos : IntStream.range(0, sentence.length()).toArray())
                                <ruby class="p-4">
                                    @if (language.hasPosParsing())
                                        <span class="text-xl font-serif ${tagColors.get(sentence.posTags[pos])}">
                                            ${sentence.wordsLowerCase[pos]}
                                        </span>
                                        <rt>
                                            ${language.decodePosTagName(sentence.posTags[pos])}

                                            @if (sentence.isAllCaps(pos))
                                                <i class="fa-solid fa-angles-up"></i>
                                            @elseif (sentence.isCapitalized(pos))
                                                <i class="fa-solid fa-arrow-up"></i>
                                            @endif
                                        </rt>
                                    @else <!-- pos tags disabled -->
                                        <span class="text-xl font-serif">
                                            ${sentence.wordsLowerCase[pos]}
                                        </span>
                                        <rt>
                                            @if (sentence.isAllCaps(pos))
                                                <i class="fa-solid fa-angles-up"></i>
                                            @elseif (sentence.isCapitalized(pos))
                                                <i class="fa-solid fa-arrow-up"></i>
                                            @endif
                                        </rt>
                                    @endif
                                </ruby>
                                @if (sentence.isSeparatorComma(pos))
                                    <i class="fa-regular fa-circle"></i>
                                @endif
                            @endfor
                        </div>
                    </div>
                @endfor
            @endif
        </div>

        <!-- Keywords & N-grams -->
        <div class="bg-white rounded-lg shadow-sm border border-gray-200">
            <div class="p-4 border-b border-gray-200">
                <h2 class="text-lg font-semibold text-gray-900">
                    <i class="fas fa-key text-indigo-600 mr-2"></i>
                    Keywords & N-grams
                </h2>
            </div>
            <div class="p-4">
                <div class="grid grid-cols-1 md:grid-cols-3 gap-6">
                    <!-- Keywords -->
                    @if (tfIdfReps != null && !tfIdfReps.isEmpty())
                        <div>
                            <h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
                                <i class="fas fa-star text-yellow-500 mr-2"></i>
                                Keywords (TF-IDF)
                            </h3>
                            <div class="space-y-2">
                                @for (WordRep rep : tfIdfReps)
                                    <div class="flex justify-between items-center p-2 bg-gray-50 rounded">
                                        <span class="text-sm font-medium">${rep.word}</span>
                                        @if (rep.length > 1)
                                            <span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">${rep.length}</span>
                                        @endif
                                    </div>
                                @endfor
                            </div>
                        </div>
                    @endif
                    @if (nameLikeReps != null && !nameLikeReps.isEmpty())
                        <div>
                            <h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
                                <i class="fas fa-star text-yellow-500 mr-2"></i>
                                Name-Like
                            </h3>
                            <div class="space-y-2">
                                @for (WordRep rep : nameLikeReps)
                                    <div class="flex justify-between items-center p-2 bg-gray-50 rounded">
                                        <span class="text-sm font-medium">${rep.word}</span>
                                        @if (rep.length > 1)
                                            <span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">${rep.length}</span>
                                        @endif
                                    </div>
                                @endfor
                            </div>
                        </div>
                    @endif
                    @if (subjectLikeReps != null && !subjectLikeReps.isEmpty())
                        <div>
                            <h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
                                <i class="fas fa-star text-yellow-500 mr-2"></i>
                                Subject-Like
                            </h3>
                            <div class="space-y-2">
                                @for (WordRep rep : subjectLikeReps)
                                    <div class="flex justify-between items-center p-2 bg-gray-50 rounded">
                                        <span class="text-sm font-medium">${rep.word}</span>
                                        @if (rep.length > 1)
                                            <span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">${rep.length}</span>
                                        @endif
                                    </div>
                                @endfor
                            </div>
                        </div>
                    @endif
                    @if (titleReps != null && !titleReps.isEmpty())
                        <div>
                            <h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
                                <i class="fas fa-star text-yellow-500 mr-2"></i>
                                Title
                            </h3>
                            <div class="space-y-2">
                                @for (WordRep rep : titleReps)
                                    <div class="flex justify-between items-center p-2 bg-gray-50 rounded">
                                        <span class="text-sm font-medium">${rep.word}</span>
                                        @if (rep.length > 1)
                                            <span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">${rep.length}</span>
                                        @endif
                                    </div>
                                @endfor
                            </div>
                        </div>
                    @endif
                    @if (artifacts != null && !artifacts.isEmpty())
                        <div>
                            <h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
                                <i class="fas fa-star text-yellow-500 mr-2"></i>
                                Artifacts
                            </h3>
                            <div class="space-y-2">
                                @for (String word : artifacts)
                                    <div class="flex justify-between items-center p-2 bg-gray-50 rounded">
                                        <span class="text-sm font-medium">${word}</span>
                                    </div>
                                @endfor
                            </div>
                        </div>
                    @endif

                    @if (importantWords != null && !importantWords.isEmpty())
                        <div>
                            <h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
                                <i class="fas fa-star text-yellow-500 mr-2"></i>
                                Important Words
                            </h3>
                            <div class="space-y-2">
                                @for (String word : importantWords)
                                    <div class="flex justify-between items-center p-2 bg-gray-50 rounded">
                                        <span class="text-sm font-medium">${word}</span>
                                    </div>
                                @endfor
                            </div>
                        </div>
                    @endif
                </div>
            </div>
        </div>
        <!-- Full simulation outcome from keyword extraction -->
        <div class="bg-white rounded-lg shadow-sm border border-gray-200">
            <div class="p-4 border-b border-gray-200">
                <h2 class="text-lg font-semibold text-gray-900">
                    <i class="fas fa-list-ol text-purple-600 mr-2"></i>
                    Outcome
                </h2>
            </div>
            <div class="p-4">
                @if (positionedWords != null && !positionedWords.isEmpty())
                    <div>
                        <h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
                            <i class="fas fa-star text-yellow-500 mr-2"></i>
                            Positioned Words
                        </h3>
                        <div class="space-y-2">
                            @for (String word : positionedWords.keySet())
                                <div class="flex justify-between items-center p-2 bg-gray-50 rounded">
                                    <span class="text-sm font-medium">${word}</span>
                                    <span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">${positionedWords.get(word).stream().map(Object::toString).collect(Collectors.joining(", "))}</span>
                                </div>
                            @endfor
                        </div>
                    </div>
                @endif
            </div>
        </div>

    </div>
</div>
</body>
</html>
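The file above is a jte template. As a hedged sketch of how such a template can be rendered standalone — the engine setup, the template directory, and the file name `nlp-debug.jte` are assumptions for illustration, not taken from this diff:

```java
import gg.jte.ContentType;
import gg.jte.TemplateEngine;
import gg.jte.output.StringOutput;
import gg.jte.resolve.DirectoryCodeResolver;

import java.nio.file.Path;
import java.util.Map;

class RenderDebugViewSketch {
    public static void main(String[] args) {
        // Resolve templates from a source directory and compile them on the
        // fly; ContentType.Html enables HTML-aware escaping of ${...} output.
        var resolver = new DirectoryCodeResolver(Path.of("resources/templates")); // assumed path
        TemplateEngine engine = TemplateEngine.create(resolver, ContentType.Html);

        var output = new StringOutput();
        // Map keys must match the template's @param names; parameters left
        // out of the map arrive as null, which the template's @if null-checks
        // are written to tolerate.
        engine.render("nlp-debug.jte", Map.of("textSample", "Hello world"), output);

        System.out.println(output.toString());
    }
}
```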
@@ -1,19 +1,23 @@
package nu.marginalia.keyword;

import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.dom.DomPruningFilter;
import nu.marginalia.language.config.LanguageConfigLocation;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.UnsupportedLanguageException;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;

import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
@@ -23,10 +27,16 @@ import java.util.Set;
class DocumentKeywordExtractorTest {

    static DocumentKeywordExtractor extractor = new DocumentKeywordExtractor();
    static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
    static SentenceExtractor se;


    @BeforeAll
    public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
        se = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental()), WmsaHome.getLanguageModels());
    }

    @Test
    public void testKeyboards2() throws IOException, URISyntaxException {
    public void testKeyboards2() throws IOException, URISyntaxException, UnsupportedLanguageException {
        var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
                "Could not load word frequency table");
        String html = new String(resource.readAllBytes(), Charset.defaultCharset());
@@ -44,7 +54,7 @@ class DocumentKeywordExtractorTest {


    @Test
    public void testMadonna() throws IOException, URISyntaxException {
    public void testMadonna() throws IOException, URISyntaxException, UnsupportedLanguageException {
        var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/madonna.html"),
                "Could not load word frequency table");
        String html = new String(resource.readAllBytes(), Charset.defaultCharset());
@@ -56,19 +66,19 @@ class DocumentKeywordExtractorTest {
                new LinkTexts(), new EdgeUrl("https://encyclopedia.marginalia.nu/article/Don't_Tell_Me_(Madonna_song)")
        );

        var keywordsBuilt = keywords.build(ByteBuffer.allocate(1024));
        var keywordsBuilt = keywords.build();

        Map<String, Byte> flags = new HashMap<>();
        Map<String, CodedSequence> positions = new HashMap<>();

        for (int i = 0; i < keywordsBuilt.size(); i++) {
            String keyword = keywordsBuilt.keywords.get(i);
            byte metadata = keywordsBuilt.metadata[i]
            String keyword = keywordsBuilt.keywords().get(i);
            byte metadata = keywordsBuilt.metadata()[i]
                    ;

            if (Set.of("dirty", "blues").contains(keyword)) {
                flags.put(keyword, metadata);
                positions.put(keyword, keywordsBuilt.positions.get(i));
                positions.put(keyword, keywordsBuilt.positions().get(i));

            }
        }
@@ -81,17 +91,4 @@ class DocumentKeywordExtractorTest {
        );
    }

    @Test
    public void testSpam() throws IOException, URISyntaxException {
        var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/spam.html"),
                "Could not load word frequency table");
        String html = new String(resource.readAllBytes(), Charset.defaultCharset());
        var doc = Jsoup.parse(html);
        doc.filter(new DomPruningFilter(0.5));

        DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(
                new TermFrequencyDict(WmsaHome.getLanguageModels()));
        SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());

    }
}
@@ -5,14 +5,23 @@ import gnu.trove.list.array.TIntArrayList;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.WmsaHome;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.keyword.model.DocumentWordSpan;
import nu.marginalia.language.config.LanguageConfigLocation;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.sentence.tag.HtmlTag;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
import org.xml.sax.SAXException;

import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.List;
@@ -20,8 +29,21 @@ import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;

class DocumentPositionMapperTest {
    private final DocumentPositionMapper positionMapper = new DocumentPositionMapper();
    static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
    private static LanguageDefinition english;
    private DocumentPositionMapper positionMapper;
    static SentenceExtractor se;

    @BeforeAll
    public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
        var config = new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental());
        se = new SentenceExtractor(config, WmsaHome.getLanguageModels());
        english = config.getLanguage("en");
    }

    @BeforeEach
    public void setUp() {
        positionMapper = new DocumentPositionMapper();
    }

    @Test
    public void testWordPattern() {
@@ -43,8 +65,8 @@ class DocumentPositionMapperTest {
    @Test
    public void testBasic() {
        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
        DocumentLanguageData dld = new DocumentLanguageData(
                se.extractSentencesFromString("I am a teapot, short and stout", EnumSet.of(HtmlTag.CODE)),
        DocumentLanguageData dld = new DocumentLanguageData(english,
                se.extractSentencesFromString(english, "I am a teapot, short and stout", EnumSet.of(HtmlTag.CODE)),
                "I am a teapot"
        );

@@ -72,7 +94,7 @@ class DocumentPositionMapperTest {
    public void testLinksSingleWord1Rep() {
        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();

        var sentences = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
        var sentences = se.extractSentencesFromString(english, "Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
        assertEquals(1, sentences.size());
        TIntList counts = new TIntArrayList(new int[] { 1 });

@@ -93,7 +115,7 @@ class DocumentPositionMapperTest {
    public void testLinksSingleWord2Reps() {
        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();

        var sentences = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
        var sentences = se.extractSentencesFromString(english, "Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
        assertEquals(1, sentences.size());
        TIntList counts = new TIntArrayList(new int[] { 4 }); // This will become 2 repetitions, formula is ~ sqrt(counts)

@@ -105,7 +127,7 @@ class DocumentPositionMapperTest {
        var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
        assertEquals(2, linkTextSpans.size());

        DocumentKeywordsBuilder.DocumentWordSpan span;
        DocumentWordSpan span;
        span = linkTextSpans.get(0);

        assertEquals(6, span.start());
@@ -121,7 +143,7 @@ class DocumentPositionMapperTest {
    public void testLinksTwoWords2Reps() {
        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();

        var sentences = se.extractSentencesFromString("Zelda II", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
        var sentences = se.extractSentencesFromString(english, "Zelda II", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
        assertEquals(1, sentences.size());
        TIntList counts = new TIntArrayList(new int[] { 4 });

@@ -134,7 +156,7 @@ class DocumentPositionMapperTest {
        var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
        assertEquals(2, linkTextSpans.size());

        DocumentKeywordsBuilder.DocumentWordSpan span;
        DocumentWordSpan span;
        span = linkTextSpans.get(0);

        assertEquals(6, span.start());
@@ -151,8 +173,8 @@ class DocumentPositionMapperTest {
    public void testLinksTwoSent1Word1Rep() {
        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();

        var sentences1 = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
        var sentences2 = se.extractSentencesFromString("Link", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
        var sentences1 = se.extractSentencesFromString(english, "Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
        var sentences2 = se.extractSentencesFromString(english, "Link", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
        assertEquals(1, sentences1.size());
        assertEquals(1, sentences2.size());
        TIntList counts = new TIntArrayList(new int[] { 1, 1 });
@@ -170,7 +192,7 @@ class DocumentPositionMapperTest {
        var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
        assertEquals(2, linkTextSpans.size());

        DocumentKeywordsBuilder.DocumentWordSpan span;
        DocumentWordSpan span;
        span = linkTextSpans.get(0);

        assertEquals(6, span.start());
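On the repetition formula mentioned in the comment in `testLinksSingleWord2Reps` above — a link-text count is compressed to roughly the square root of the observed count, so a count of 4 becomes 2 repetitions. A tiny illustrative sketch of that relationship, not the project's actual implementation:

```java
class RepetitionFormulaSketch {
    // The test comment states the compression is "~ sqrt(counts)":
    // a count of 4 yields 2 repetitions, a count of 1 stays at 1.
    static int approximateRepetitions(int counts) {
        return (int) Math.round(Math.sqrt(counts));
    }

    public static void main(String[] args) {
        System.out.println(approximateRepetitions(1)); // 1
        System.out.println(approximateRepetitions(4)); // 2
    }
}
```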
Some files were not shown because too many files have changed in this diff.