Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-10-05 21:22:39 +02:00

Compare commits: deploy-029...f1a71e9033 (339 commits)
.gitignore (vendored, 9 changes)

@@ -7,4 +7,11 @@ build/
 lombok.config
 Dockerfile
 run
 jte-classes
+.classpath
+.project
+.settings
+.factorypath
+bin/
+*.log
+*.hprof
build.gradle (12 changes)

@@ -1,11 +1,12 @@
 plugins {
     id 'java'
     id("org.jetbrains.gradle.plugin.idea-ext") version "1.0"
-    id "me.champeau.jmh" version "0.6.6"
+    id "me.champeau.jmh" version "0.7.3"

     // This is a workaround for a bug in the Jib plugin that causes it to stall randomly
     // https://github.com/GoogleContainerTools/jib/issues/3347
     id 'com.google.cloud.tools.jib' version '3.4.5' apply(false)
+    id 'com.adarshr.test-logger' version '4.0.0'
 }

 group 'marginalia'

@@ -31,7 +32,10 @@ subprojects.forEach {it ->
         jvmArgs += ['--enable-preview']
     }
     it.tasks.withType(Test).configureEach {
-        jvmArgs += ['--enable-preview']
+        jvmArgs += ['--enable-preview',
+                    '--enable-native-access=ALL-UNNAMED',
+                    '--sun-misc-unsafe-memory-access=allow',
+                    '-Dsystem.uringQueueCount=1']
     }

     // Enable reproducible builds for the entire project

@@ -43,8 +47,8 @@ subprojects.forEach {it ->
 }

 ext {
-    jvmVersion = 24
-    dockerImageBase='container-registry.oracle.com/graalvm/jdk:24'
+    jvmVersion = 25
+    dockerImageBase='container-registry.oracle.com/graalvm/jdk:25'
     dockerImageTag='latest'
     dockerImageRegistry='marginalia'
     jibVersion = '3.4.5'
@@ -19,6 +19,7 @@ dependencies {

     implementation libs.bundles.slf4j
     implementation libs.bundles.mariadb
+    implementation libs.bundles.httpcomponents
     implementation libs.mockito
     implementation libs.guava
     implementation dependencies.create(libs.guice.get()) {
@@ -114,4 +114,7 @@ public class WmsaHome {
     }

+    public static Path getLangugeConfig() {
+        return getHomePath().resolve("conf/languages.xml");
+    }
 }
@@ -0,0 +1,141 @@ (new file)
package nu.marginalia.proxy;

import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;

/**
 * Configuration for SOCKS proxy settings used by crawlers to distribute IP footprint.
 */
public class SocksProxyConfiguration {

    private final boolean enabled;
    private final List<SocksProxy> proxies;
    private final ProxySelectionStrategy strategy;

    public SocksProxyConfiguration() {
        this.enabled = Boolean.parseBoolean(System.getProperty("crawler.socksProxy.enabled", "false"));
        this.strategy = ProxySelectionStrategy.valueOf(
                System.getProperty("crawler.socksProxy.strategy", "ROUND_ROBIN")
        );
        this.proxies = parseProxies();
    }

    private List<SocksProxy> parseProxies() {
        String proxyList = System.getProperty("crawler.socksProxy.list", "");
        if (proxyList.isEmpty()) {
            return List.of();
        }

        return Arrays.stream(proxyList.split(","))
                .map(String::trim)
                .filter(s -> !s.isEmpty())
                .map(this::parseProxy)
                .filter(Objects::nonNull)
                .collect(Collectors.toList());
    }

    private SocksProxy parseProxy(String proxyString) {
        try {
            // Expected format: "host:port" or "host:port:username:password"
            String[] parts = proxyString.split(":");
            if (parts.length < 2) {
                return null;
            }

            String host = parts[0];
            int port = Integer.parseInt(parts[1]);

            if (parts.length >= 4) {
                String username = parts[2];
                String password = parts[3];
                return new SocksProxy(host, port, username, password);
            } else {
                return new SocksProxy(host, port);
            }
        } catch (Exception e) {
            return null;
        }
    }

    public boolean isEnabled() {
        return enabled && !proxies.isEmpty();
    }

    public List<SocksProxy> getProxies() {
        return proxies;
    }

    public ProxySelectionStrategy getStrategy() {
        return strategy;
    }

    public enum ProxySelectionStrategy {
        ROUND_ROBIN,
        RANDOM
    }

    public static class SocksProxy {
        private final String host;
        private final int port;
        private final String username;
        private final String password;

        public SocksProxy(String host, int port) {
            this(host, port, null, null);
        }

        public SocksProxy(String host, int port, String username, String password) {
            this.host = host;
            this.port = port;
            this.username = username;
            this.password = password;
        }

        public String getHost() {
            return host;
        }

        public int getPort() {
            return port;
        }

        public String getUsername() {
            return username;
        }

        public String getPassword() {
            return password;
        }

        public boolean hasAuthentication() {
            return username != null && password != null;
        }

        @Override
        public String toString() {
            if (hasAuthentication()) {
                return String.format("%s:%d (auth: %s)", host, port, username);
            } else {
                return String.format("%s:%d", host, port);
            }
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if (o == null || getClass() != o.getClass()) return false;
            SocksProxy that = (SocksProxy) o;
            return port == that.port &&
                    Objects.equals(host, that.host) &&
                    Objects.equals(username, that.username) &&
                    Objects.equals(password, that.password);
        }

        @Override
        public int hashCode() {
            return Objects.hash(host, port, username, password);
        }
    }
}
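The class is configured purely through JVM system properties. As a rough usage sketch (not part of the diff; the hostnames are invented placeholders, while the property names and the host:port[:username:password] format come from the class above):

    // Hypothetical example configuration; the proxy hosts are placeholders
    System.setProperty("crawler.socksProxy.enabled", "true");
    System.setProperty("crawler.socksProxy.list",
            "proxy1.example.com:1080, proxy2.example.com:1080:user:secret");
    System.setProperty("crawler.socksProxy.strategy", "ROUND_ROBIN");

    SocksProxyConfiguration config = new SocksProxyConfiguration();

    // Both entries parse; the second one carries username/password auth
    assert config.isEnabled();
    assert config.getProxies().size() == 2;
    assert config.getProxies().get(1).hasAuthentication();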
@@ -0,0 +1,79 @@ (new file)
package nu.marginalia.proxy;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nonnull;
import java.util.List;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Manages SOCKS proxy selection and rotation for crawler requests.
 */
public class SocksProxyManager {
    private static final Logger logger = LoggerFactory.getLogger(SocksProxyManager.class);

    private final SocksProxyConfiguration config;
    private final AtomicInteger roundRobinIndex = new AtomicInteger(0);

    public SocksProxyManager(SocksProxyConfiguration config) {
        this.config = config;

        if (config.isEnabled()) {
            logger.info("SOCKS proxy support enabled with {} proxies using {} strategy",
                    config.getProxies().size(), config.getStrategy());
            for (SocksProxyConfiguration.SocksProxy proxy : config.getProxies()) {
                logger.info("  - {}", proxy);
            }
        } else {
            logger.info("SOCKS proxy support disabled");
        }
    }

    /**
     * Selects the next proxy to use based on the configured strategy.
     */
    @Nonnull
    public SocksProxyConfiguration.SocksProxy selectProxy() {
        if (!config.isEnabled()) {
            throw new IllegalStateException("Proxies not configured");
        }

        List<SocksProxyConfiguration.SocksProxy> proxies = config.getProxies();
        if (proxies.isEmpty()) {
            throw new IllegalStateException("Proxies not configured");
        }

        SocksProxyConfiguration.SocksProxy selectedProxy;
        switch (config.getStrategy()) {
            case ROUND_ROBIN:
                int index = roundRobinIndex.getAndIncrement() % proxies.size();
                selectedProxy = proxies.get(index);
                break;
            case RANDOM:
                int randomIndex = ThreadLocalRandom.current().nextInt(proxies.size());
                selectedProxy = proxies.get(randomIndex);
                break;
            default:
                selectedProxy = proxies.get(0);
                break;
        }

        return selectedProxy;
    }

    /**
     * Gets the current proxy configuration.
     */
    public SocksProxyConfiguration getConfiguration() {
        return config;
    }

    /**
     * Checks if proxy support is enabled and proxies are available.
     */
    public boolean isProxyEnabled() {
        return config.isEnabled() && !config.getProxies().isEmpty();
    }
}
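Continuing the sketch above (same hypothetical two-proxy ROUND_ROBIN configuration, not part of the diff), selection simply walks the list with the atomic counter:

    SocksProxyManager manager = new SocksProxyManager(new SocksProxyConfiguration());

    if (manager.isProxyEnabled()) {
        var first  = manager.selectProxy(); // index 0
        var second = manager.selectProxy(); // index 1
        var third  = manager.selectProxy(); // wraps back to index 0
        assert first.equals(third) && !first.equals(second);
    }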
@@ -16,7 +16,7 @@
             <MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
         </Filters>
     </Console>
-    <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
+    <RollingFile name="LogToFileService" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
                  ignoreExceptions="false">
         <JSONLayout compact="true" eventEol="true" properties="true" stacktraceAsString="true" includeTimeMillis="true"/>
         <Filters>

@@ -28,7 +28,7 @@
         </Filters>
         <SizeBasedTriggeringPolicy size="10MB" />
     </RollingFile>
-    <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
+    <RollingFile name="LogToFileCrawler" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
                  ignoreExceptions="false">
         <PatternLayout>
             <Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>

@@ -38,7 +38,7 @@
             <MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
         </Filters>
     </RollingFile>
-    <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/converter-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/converter-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
+    <RollingFile name="LogToFileConverter" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/converter-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/converter-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
                  ignoreExceptions="false">
         <PatternLayout>
             <Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>

@@ -56,7 +56,9 @@
         <Root level="info">
             <AppenderRef ref="Console"/>
             <AppenderRef ref="ProcessConsole"/>
-            <AppenderRef ref="LogToFile"/>
+            <AppenderRef ref="LogToFileService"/>
+            <AppenderRef ref="LogToFileCrawler"/>
+            <AppenderRef ref="LogToFileConverter"/>
         </Root>
     </Loggers>
 </Configuration>
@@ -50,7 +50,7 @@
             <MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
         </Filters>
     </Console>
-    <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
+    <RollingFile name="LogToFileService" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
                  ignoreExceptions="false">
         <PatternLayout>
             <Pattern>%-5level %d{yyyy-MM-dd HH:mm:ss,SSS} %-20t %-20c{1}: %msg{nolookups}%n</Pattern>

@@ -64,7 +64,7 @@
             <MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
         </Filters>
     </RollingFile>
-    <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
+    <RollingFile name="LogToFileCrawler" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
                  ignoreExceptions="false">
         <PatternLayout>
             <Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>

@@ -74,7 +74,7 @@
             <MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
         </Filters>
     </RollingFile>
-    <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/converter-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/converter-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
+    <RollingFile name="LogToFileConverter" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/converter-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/converter-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
                  ignoreExceptions="false">
         <PatternLayout>
             <Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>

@@ -95,7 +95,9 @@
             <AppenderRef ref="ConsoleError"/>
             <AppenderRef ref="ConsoleFatal"/>
             <AppenderRef ref="ProcessConsole"/>
-            <AppenderRef ref="LogToFile"/>
+            <AppenderRef ref="LogToFileService"/>
+            <AppenderRef ref="LogToFileConverer"/>
+            <AppenderRef ref="LogToFileCrawler"/>
         </Root>
     </Loggers>
 </Configuration>
@@ -6,7 +6,6 @@ import com.google.inject.name.Named;
 import gnu.trove.list.TLongList;
 import nu.marginalia.linkdb.model.DocdbUrlDetail;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.model.id.UrlIdCodec;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -14,7 +13,6 @@ import java.io.IOException;
 import java.net.URISyntaxException;
 import java.nio.file.Files;
 import java.nio.file.Path;
-
 import java.nio.file.StandardCopyOption;
 import java.sql.Connection;
 import java.sql.DriverManager;

@@ -104,7 +102,7 @@ public class DocumentDbReader {
     }

     try (var stmt = connection.prepareStatement("""
-            SELECT ID, URL, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR
+            SELECT ID, URL, TITLE, DESCRIPTION, LANGUAGE, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR
             FROM DOCUMENT WHERE ID = ?
             """)) {
         for (int i = 0; i < ids.size(); i++) {

@@ -118,6 +116,7 @@ public class DocumentDbReader {
                 url,
                 rs.getString("TITLE"),
                 rs.getString("DESCRIPTION"),
+                rs.getString("LANGUAGE"),
                 rs.getDouble("QUALITY"),
                 rs.getString("FORMAT"),
                 rs.getInt("FEATURES"),
@@ -41,8 +41,8 @@ public class DocumentDbWriter {
     public void add(List<DocdbUrlDetail> docdbUrlDetail) throws SQLException {

         try (var stmt = connection.prepareStatement("""
-                INSERT OR IGNORE INTO DOCUMENT(ID, URL, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR)
-                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                INSERT OR IGNORE INTO DOCUMENT(ID, URL, TITLE, DESCRIPTION, LANGUAGE, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                 """)) {

             int i = 0;

@@ -54,15 +54,16 @@ public class DocumentDbWriter {

                 stmt.setString(3, document.title());
                 stmt.setString(4, document.description());
-                stmt.setInt(5, document.wordsTotal());
-                stmt.setString(6, document.format());
-                stmt.setInt(7, document.features());
-                stmt.setLong(8, document.dataHash());
-                stmt.setDouble(9, document.urlQuality());
+                stmt.setString(5, document.language());
+                stmt.setInt(6, document.wordsTotal());
+                stmt.setString(7, document.format());
+                stmt.setInt(8, document.features());
+                stmt.setLong(9, document.dataHash());
+                stmt.setDouble(10, document.urlQuality());
                 if (document.pubYear() == null) {
-                    stmt.setInt(10, 0);
+                    stmt.setInt(11, 0);
                 } else {
-                    stmt.setInt(10, document.pubYear());
+                    stmt.setInt(11, document.pubYear());
                 }

                 stmt.addBatch();
@@ -6,6 +6,7 @@ public record DocdbUrlDetail(long urlId,
                              EdgeUrl url,
                              String title,
                              String description,
+                             String language,
                              double urlQuality,
                              String format,
                              int features,
@@ -6,6 +6,7 @@ CREATE TABLE DOCUMENT (
     STATE INT,
     TITLE TEXT NOT NULL,
     DESCRIPTION TEXT NOT NULL,
+    LANGUAGE TEXT NOT NULL,

     WORDS_TOTAL INTEGER NOT NULL,
     FORMAT TEXT NOT NULL,
@@ -23,6 +23,7 @@ public class DocumentDbWriterTest {
         new nu.marginalia.model.EdgeUrl("http", new EdgeDomain("example.com"), null, "/", null),
         "Test",
         "This is a test",
+        "en",
         -4.,
         "XHTML",
         5,
@@ -1,13 +1,12 @@
 package nu.marginalia.model;

 import javax.annotation.Nonnull;
-import java.io.Serializable;
 import java.util.Objects;
 import java.util.Optional;
 import java.util.function.Predicate;
 import java.util.regex.Pattern;

-public class EdgeDomain implements Serializable {
+public class EdgeDomain {

     @Nonnull
     public final String subDomain;
@@ -4,13 +4,12 @@ import nu.marginalia.util.QueryParams;
 import org.apache.commons.lang3.StringUtils;

 import javax.annotation.Nullable;
-import java.io.Serializable;
 import java.net.*;
 import java.nio.charset.StandardCharsets;
 import java.util.Objects;
 import java.util.Optional;

-public class EdgeUrl implements Serializable {
+public class EdgeUrl {
     public final String proto;
     public final EdgeDomain domain;
     public final Integer port;
@@ -95,18 +95,24 @@ public enum HtmlFeature {
     public static int encode(Collection<HtmlFeature> featuresAll) {
         int ret = 0;
         for (var feature : featuresAll) {
+            if (feature.ordinal() >= 32) continue;
+
             ret |= (1 << (feature.ordinal()));
         }
         return ret;
     }

     public static boolean hasFeature(int value, HtmlFeature feature) {
-        return (value & (1<< feature.ordinal())) != 0;
+        int ord = feature.ordinal();
+        if (ord >= 32) return false;
+
+        return (value & (1<<ord)) != 0;
     }

     public int getFeatureBit() {
-        if (getClass().desiredAssertionStatus() && ordinal() >= 32)
-            throw new IllegalStateException("Attempting to extract feature bit of " + name() + ", with ordinal " + ordinal());
-        return (1<< ordinal());
+        int ord = ordinal();
+        if (ord >= 32) return 0;
+
+        return (1<<ord);
     }
 }
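The reason for the new guards: Java's shift operators take the shift count modulo 32 for ints, so 1 << 35 is the same as 1 << 3, and a feature with ordinal 35 used to silently collide with the bit of ordinal 3 (or, in getFeatureBit, throw when assertions were enabled). With the change, out-of-range features are consistently invisible to the mask on both the encode and decode side. A small illustrative check, assuming the repo's classes on the classpath and java.util.EnumSet imported:

    int mask = HtmlFeature.encode(EnumSet.allOf(HtmlFeature.class));

    for (HtmlFeature feature : HtmlFeature.values()) {
        // A feature reads back as present exactly when its bit fits in the mask
        assert HtmlFeature.hasFeature(mask, feature) == (feature.ordinal() < 32);
    }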
@@ -2,7 +2,6 @@ package nu.marginalia.model.idx;

 import nu.marginalia.model.crawl.PubDate;

-import java.io.Serializable;
 import java.util.EnumSet;
 import java.util.Set;

@@ -28,7 +27,6 @@ public record DocumentMetadata(int avgSentLength,
                                int sets,
                                int quality,
                                byte flags)
-    implements Serializable
 {

     public String toString() {
@@ -7,7 +7,6 @@ public enum ServiceId {
     Search("search-service"),
     Index("index-service"),
     Query("query-service"),
-    Executor("executor-service"),

     Control("control-service"),
@@ -66,7 +66,7 @@ public class NodeStatusWatcher {
     fileStorageService.createStorageBase("Crawl Data", Path.of("/storage"), nodeId, FileStorageBaseType.STORAGE);
     fileStorageService.createStorageBase("Work Area", Path.of("/work"), nodeId, FileStorageBaseType.WORK);

-    persistence.sendNewMessage("executor-service:"+nodeId,
+    persistence.sendNewMessage("index-service:"+nodeId,
             null,
             null,
             "FIRST-BOOT",
@@ -189,7 +189,7 @@ public class ExecutorClient {
     String uriPath = "/transfer/file/" + fileStorage.id();
     String uriQuery = "path=" + URLEncoder.encode(path, StandardCharsets.UTF_8);

-    var endpoints = registry.getEndpoints(ServiceKey.forRest(ServiceId.Executor, fileStorage.node()));
+    var endpoints = registry.getEndpoints(ServiceKey.forRest(ServiceId.Index, fileStorage.node()));
     if (endpoints.isEmpty()) {
         throw new RuntimeException("No endpoints for node " + fileStorage.node());
     }
@@ -22,7 +22,6 @@ dependencies {
     implementation project(':code:processes:ping-process')
     implementation project(':code:processes:new-domain-process')
     implementation project(':code:processes:converting-process')
-    implementation project(':code:processes:index-constructor-process')

     implementation project(':code:common:config')
     implementation project(':code:common:model')

@@ -34,7 +33,7 @@ dependencies {
     implementation project(':third-party:commons-codec')

     implementation project(':code:libraries:message-queue')
-    implementation project(':code:libraries:term-frequency-dict')
+    implementation project(':code:functions:language-processing')

     implementation project(':code:functions:link-graph:api')
     implementation project(':code:functions:live-capture:api')
@@ -2,9 +2,8 @@ package nu.marginalia.execution;

 import com.google.inject.Inject;
 import com.google.inject.Singleton;
-
-import nu.marginalia.actor.ExecutorActorControlService;
 import nu.marginalia.actor.ExecutorActor;
+import nu.marginalia.actor.ExecutorActorControlService;

 @Singleton
 public class ExecutionInit {

@@ -22,5 +21,8 @@ public class ExecutionInit {
         actorControlService.start(ExecutorActor.PROC_CRAWLER_SPAWNER);
         actorControlService.start(ExecutorActor.PROC_INDEX_CONSTRUCTOR_SPAWNER);
         actorControlService.start(ExecutorActor.PROC_LOADER_SPAWNER);
+        actorControlService.start(ExecutorActor.PROC_EXPORT_TASKS_SPAWNER);
+        actorControlService.stop(ExecutorActor.PROC_NDP_SPAWNER);
+        actorControlService.stop(ExecutorActor.PROC_PING_SPAWNER);
     }
 }
@@ -5,7 +5,6 @@ import com.google.inject.Singleton;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.converting.ConverterMain;
 import nu.marginalia.crawl.CrawlerMain;
-import nu.marginalia.index.IndexConstructorMain;
 import nu.marginalia.livecrawler.LiveCrawlerMain;
 import nu.marginalia.loading.LoaderMain;
 import nu.marginalia.ndp.NdpMain;

@@ -57,7 +56,7 @@ public class ProcessSpawnerService {
     LIVE_CRAWLER(LiveCrawlerMain.class),
     CONVERTER(ConverterMain.class),
     LOADER(LoaderMain.class),
-    INDEX_CONSTRUCTOR(IndexConstructorMain.class),
+    INDEX_CONSTRUCTOR("nu.marginalia.index.IndexConstructorMain"),
     NDP(NdpMain.class),
     EXPORT_TASKS(ExportTasksMain.class),
     ;

@@ -66,6 +65,9 @@ public class ProcessSpawnerService {
     ProcessId(Class<? extends ProcessMainClass> mainClass) {
         this.mainClass = mainClass.getName();
     }
+    ProcessId(String mainClassFullName) {
+        this.mainClass = mainClassFullName;
+    }

     List<String> envOpts() {
         String variable = switch (this) {

@@ -118,6 +120,17 @@ public class ProcessSpawnerService {
         args.add("-Dsystem.serviceNode=" + System.getProperty("system.serviceNode"));
     }

+    // Add SOCKS proxy properties for crawler processes
+    if (System.getProperty("crawler.socksProxy.enabled") != null) {
+        args.add("-Dcrawler.socksProxy.enabled=" + System.getProperty("crawler.socksProxy.enabled"));
+    }
+    if (System.getProperty("crawler.socksProxy.list") != null) {
+        args.add("-Dcrawler.socksProxy.list=" + System.getProperty("crawler.socksProxy.list"));
+    }
+    if (System.getProperty("crawler.socksProxy.strategy") != null) {
+        args.add("-Dcrawler.socksProxy.strategy=" + System.getProperty("crawler.socksProxy.strategy"));
+    }
+
     if (Boolean.getBoolean("system.profile")) {
         // add jfr options
         args.add("-XX:+FlightRecorder");
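For readability, the three added blocks are equivalent to a single loop over the property names; this compact form is only an illustration of what gets forwarded from the parent service JVM to the spawned crawler process, not a proposed change:

    for (String key : List.of("crawler.socksProxy.enabled",
                              "crawler.socksProxy.list",
                              "crawler.socksProxy.strategy")) {
        String value = System.getProperty(key);
        if (value != null) {
            args.add("-D" + key + "=" + value);
        }
    }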
@@ -5,6 +5,7 @@ import com.github.luben.zstd.ZstdOutputStream;
 import com.google.inject.Inject;
 import nu.marginalia.IndexLocations;
 import nu.marginalia.index.journal.IndexJournal;
+import nu.marginalia.language.config.LanguageConfiguration;
 import nu.marginalia.linkdb.LinkdbFileNames;
 import nu.marginalia.service.control.ServiceHeartbeat;
 import nu.marginalia.storage.FileStorageService;

@@ -13,18 +14,18 @@ import nu.marginalia.storage.model.FileStorageType;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;

-import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
 import java.time.LocalDateTime;
 import java.util.List;
-import java.util.Optional;
+import java.util.Map;

 public class BackupService {

     private final FileStorageService storageService;
+    private final LanguageConfiguration languageConfiguration;
     private final ServiceHeartbeat serviceHeartbeat;

     public enum BackupHeartbeatSteps {

@@ -36,8 +37,10 @@ public class BackupService {

     @Inject
     public BackupService(FileStorageService storageService,
+                         LanguageConfiguration languageConfiguration,
                          ServiceHeartbeat serviceHeartbeat) {
         this.storageService = storageService;
+        this.languageConfiguration = languageConfiguration;
         this.serviceHeartbeat = serviceHeartbeat;
     }

@@ -98,22 +101,25 @@ public class BackupService {
     }

-    private void backupJournal(Path inputStorage, Path backupStorage) throws IOException
-    {
-        Optional<IndexJournal> journal = IndexJournal.findJournal(inputStorage);
-        if (journal.isEmpty()) {
-            throw new FileNotFoundException("No journal found in input storage");
-        }
-
-        FileUtils.copyDirectory(journal.get().journalDir().toFile(), backupStorage.resolve(journal.get().journalDir().getFileName()).toFile());
-    }
+    private void backupJournal(Path inputStorage, Path backupStorage) throws IOException {
+        Map<String, IndexJournal> journals = IndexJournal.findJournals(inputStorage, languageConfiguration.languages());
+        for (IndexJournal journal : journals.values()) {
+            FileUtils.copyDirectory(journal.journalDir().toFile(), backupStorage.resolve(journal.journalDir().getFileName()).toFile());
+        }
+    }

     private void restoreJournal(Path destStorage, Path backupStorage) throws IOException {
-        Optional<IndexJournal> journal = IndexJournal.findJournal(backupStorage);
-        if (journal.isEmpty()) {
-            throw new FileNotFoundException("No journal found in backup");
-        }
-        FileUtils.copyDirectory(backupStorage.resolve(journal.get().journalDir().getFileName()).toFile(), destStorage.toFile());
+        Map<String, IndexJournal> journals = IndexJournal.findJournals(backupStorage, languageConfiguration.languages());
+        for (IndexJournal journal : journals.values()) {
+            var journalFileName = journal.journalDir().getFileName();
+
+            // Ensure we delete any previous journal junk
+            if (Files.exists(destStorage.resolve(journalFileName))) {
+                FileUtils.deleteDirectory(destStorage.resolve(journalFileName).toFile());
+            }
+
+            FileUtils.copyDirectory(backupStorage.resolve(journalFileName).toFile(), destStorage.toFile());
+        }
     }

     private void backupFileCompressed(String fileName, Path inputStorage, Path backupStorage) throws IOException
@@ -1,4 +1,4 @@
-package nu.marginalia.executor;
+package nu.marginalia.svc;

 import com.google.inject.Inject;
 import nu.marginalia.storage.FileStorageService;
@@ -1,5 +1,5 @@
 The execution subsystem is responsible for the execution of long running tasks on each
-index node. It lives in the [executor-service](../services-core/executor-service) module.
+index node. It lives in the [index-service](../services-core/index-service) module.

 It accomplishes this using the [message queue and actor library](../libraries/message-queue/),
 which permits program state to survive crashes and reboots.
@@ -1,4 +1,4 @@
-package nu.marginalia.executor;
+package nu.marginalia.svc;

 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorage;
@@ -1,8 +1,8 @@
 plugins {
     id 'java'
-
-
     id 'jvm-test-suite'
+    id 'gg.jte.gradle' version '3.1.15'
+    id 'application'
 }

 java {

@@ -10,22 +10,26 @@ java {
         languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
     }
 }
+application {
+    mainClass = 'nu.marginalia.language.LanguageProcessingTool'
+    applicationName = 'language-processing-tool'
+}

 apply from: "$rootProject.projectDir/srcsets.gradle"

 dependencies {
     implementation project(':code:common:config')
-    implementation libs.bundles.slf4j
     implementation project(':third-party:rdrpostagger')
     implementation project(':third-party:porterstemmer')
     implementation project(':third-party:commons-codec')
     implementation project(':third-party:openzim')
     implementation project(':code:common:model')
     implementation project(':code:common:config')
     implementation project(':code:common:service')
     implementation project(':code:libraries:easy-lsh')
     implementation project(':code:libraries:array')
     implementation project(':code:libraries:blocking-thread-pool')

+    implementation libs.bundles.slf4j
     implementation project(':code:libraries:coded-sequence')
     implementation libs.notnull
     implementation libs.bundles.jooby

     implementation libs.guava
     implementation dependencies.create(libs.guice.get()) {

@@ -42,3 +46,9 @@ dependencies {
     testImplementation libs.bundles.junit
     testImplementation libs.mockito
 }
+
+jte {
+    sourceDirectory = file('resources/ltt/jte').toPath()
+    targetDirectory = file('build/classes/jte-precompiled').toPath()
+    generate()
+}
@@ -1,4 +1,4 @@
-package nu.marginalia.converting.processor.logic.dom;
+package nu.marginalia.dom;

 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
@@ -1,4 +1,4 @@
-package nu.marginalia.converting.processor.logic.dom;
+package nu.marginalia.dom;

 import org.jsoup.nodes.Node;
 import org.jsoup.nodes.TextNode;
@@ -16,8 +16,6 @@ public class DocumentKeywordExtractor {

     private final TermFrequencyDict dict;

-    private final KeywordExtractor keywordExtractor = new KeywordExtractor();
-    private final DocumentPositionMapper positionMapper = new DocumentPositionMapper();

     @Inject
     public DocumentKeywordExtractor(TermFrequencyDict dict) {

@@ -37,35 +35,54 @@ public class DocumentKeywordExtractor {

     public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData dld, LinkTexts linkTexts, EdgeUrl url) {

-        var tfIdfCounts = new WordsTfIdfCounts(dict, keywordExtractor, dld);
-
-        var titleKeywords = new TitleKeywords(keywordExtractor, dld);
-        var nameLikeKeywords = new NameLikeKeywords(keywordExtractor, dld, 2);
-        var subjectLikeKeywords = new SubjectLikeKeywords(keywordExtractor, tfIdfCounts, dld);
-        var artifactKeywords = new ArtifactKeywords(dld);
-        var urlKeywords = new UrlKeywords(url);
-
-        var keywordMetadata = KeywordMetadata.builder()
-                .titleKeywords(titleKeywords)
-                .nameLikeKeywords(nameLikeKeywords)
-                .subjectLikeKeywords(subjectLikeKeywords)
-                .urlKeywords(urlKeywords)
-                .build();
-
-        DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
-
-        positionMapper.mapPositionsAndExtractSimpleKeywords(wordsBuilder, keywordMetadata, dld, linkTexts);
-
-        createNGramTermsFromSet(wordsBuilder, keywordMetadata, titleKeywords);
-        createNGramTermsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords);
-        createNGramTermsFromSet(wordsBuilder, keywordMetadata, nameLikeKeywords);
-
-        var importantWords = getImportantWords(tfIdfCounts, nameLikeKeywords, subjectLikeKeywords, wordsBuilder);
-
-        wordsBuilder.addImportantWords(importantWords);
-        wordsBuilder.addAllSyntheticTerms(artifactKeywords.getWords());
-
-        return wordsBuilder;
+        if (dld.language().hasPosParsing()) {
+            DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
+
+            var artifactKeywords = new ArtifactKeywords(dld);
+            var urlKeywords = new UrlKeywords(url);
+            var positionMapper = new DocumentPositionMapper();
+            var tfIdfCounts = new WordsTfIdfCounts(dict, dld);
+
+            var titleKeywords = new TitleKeywords(dld);
+            var nameLikeKeywords = new NameLikeKeywords(dld, 2);
+            var subjectLikeKeywords = new SubjectLikeKeywords(tfIdfCounts, dld);
+            var keywordMetadata = KeywordMetadata.builder()
+                    .titleKeywords(titleKeywords)
+                    .nameLikeKeywords(nameLikeKeywords)
+                    .subjectLikeKeywords(subjectLikeKeywords)
+                    .urlKeywords(urlKeywords)
+                    .build();
+
+            positionMapper.mapPositionsAndExtractSimpleKeywords(wordsBuilder, keywordMetadata, dld, linkTexts);
+
+            createNGramTermsFromSet(wordsBuilder, keywordMetadata, titleKeywords);
+            createNGramTermsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords);
+            createNGramTermsFromSet(wordsBuilder, keywordMetadata, nameLikeKeywords);
+
+            var importantWords = getImportantWords(tfIdfCounts, nameLikeKeywords, subjectLikeKeywords, wordsBuilder);
+
+            wordsBuilder.addImportantWords(importantWords);
+            wordsBuilder.addAllSyntheticTerms(artifactKeywords.getWords());
+
+            return wordsBuilder;
+        }
+        else {
+            DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
+
+            var artifactKeywords = new ArtifactKeywords(dld);
+            var urlKeywords = new UrlKeywords(url);
+            var positionMapper = new DocumentPositionMapper();
+
+            var keywordMetadata = KeywordMetadata.builder()
+                    .urlKeywords(urlKeywords)
+                    .build();
+
+            positionMapper.mapPositionsAndExtractSimpleKeywords(wordsBuilder, keywordMetadata, dld, linkTexts);
+            wordsBuilder.addAllSyntheticTerms(artifactKeywords.getWords());
+            return wordsBuilder;
+        }
     }

     private static Collection<String> getImportantWords(WordsTfIdfCounts tfIdfCounts, NameLikeKeywords nameLikeKeywords, SubjectLikeKeywords subjectLikeKeywords, DocumentKeywordsBuilder wordsBuilder) {
@@ -3,7 +3,9 @@ package nu.marginalia.keyword;

 import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.DocumentSentence;
+import nu.marginalia.language.model.LanguageDefinition;
 import nu.marginalia.language.model.WordRep;
+import nu.marginalia.language.pos.PosPatternCategory;
 import nu.marginalia.language.sentence.tag.HtmlTag;

 import java.util.ArrayList;

@@ -17,8 +19,6 @@ import static java.lang.Math.sqrt;
  */
 public class DocumentPositionMapper {

-    private final KeywordExtractor keywordExtractor = new KeywordExtractor();
-
     public void mapPositionsAndExtractSimpleKeywords(DocumentKeywordsBuilder wordsBuilder,
                                                      KeywordMetadata metadata,
                                                      DocumentLanguageData dld,

@@ -38,12 +38,14 @@ public class DocumentPositionMapper {
     }

-    int mapDocumentPositions(DocumentKeywordsBuilder wordsBuilder,
+    public int mapDocumentPositions(DocumentKeywordsBuilder wordsBuilder,
                              KeywordMetadata metadata,
                              DocumentLanguageData dld)
     {
+        LanguageDefinition languageDefinition = dld.language();
+
         List<SpanRecorder> spanRecorders = new ArrayList<>();
         for (var htmlTag : HtmlTag.includedTags) {
             if (!htmlTag.exclude) {

@@ -80,7 +82,7 @@ public class DocumentPositionMapper {
         }
     }

-    for (var names : keywordExtractor.getProperNames(sent)) {
+    for (var names : languageDefinition.matchGrammarPattern(sent, PosPatternCategory.NAME)) {
         WordRep rep = new WordRep(sent, names);
         byte meta = metadata.getMetadataForWord(rep.stemmed);

@@ -161,11 +163,15 @@

     int i = 0;

-    for (int run = 0; run < 15 && i < s.length(); run++, i++) {
-        char c = s.charAt(i);
-        if (c >= 'a' && c <= 'z') continue;
-        if (c >= 'A' && c <= 'Z') continue;
-        if (c >= '0' && c <= '9') continue;
+    for (int run = 0; run < 15 && i < s.length(); run++) {
+        int cp = s.codePointAt(i);
+
+        if (Character.isAlphabetic(cp) || Character.isDigit(cp)) {
+            i += Character.charCount(cp);
+            continue;
+        }
+
         break;
     }

@@ -175,17 +181,20 @@
     for (int j = 0; j < 8; j++) {
         if (i == s.length()) return true;

-        if (wordPartSeparator.indexOf(s.charAt(i)) < 0) {
+        if (wordPartSeparator.indexOf(s.codePointAt(i)) < 0) {
             return false;
         }

         i++;

-        for (int run = 0; run < 10 && i < s.length(); run++, i++) {
-            char c = s.charAt(i);
-            if (c >= 'a' && c <= 'z') continue;
-            if (c >= 'A' && c <= 'Z') continue;
-            if (c >= '0' && c <= '9') continue;
+        for (int run = 0; run < 10 && i < s.length(); run++) {
+            int cp = s.codePointAt(i);
+
+            if (Character.isAlphabetic(cp) || Character.isDigit(cp)) {
+                i += Character.charCount(cp);
+                continue;
+            }
+
             break;
         }
     }

@@ -193,48 +202,4 @@
         return false;
     }

-    /** Helper class to record spans of words */
-    private static class SpanRecorder {
-        private final List<DocumentKeywordsBuilder.DocumentWordSpan> spans = new ArrayList<>();
-        private final HtmlTag htmlTag;
-        private int start = 0;
-
-        public SpanRecorder(HtmlTag htmlTag) {
-            this.htmlTag = htmlTag;
-        }
-
-        public void update(DocumentSentence sentence, int pos) {
-            assert pos > 0;
-
-            if (sentence.htmlTags.contains(htmlTag)) {
-                if (start <= 0) start = pos;
-            }
-            else if (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY)
-            {
-                // special case for body tag, we match against no tag on the sentence
-                if (start <= 0) start = pos;
-            }
-            else {
-                if (start > 0) {
-                    spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
-                    start = 0;
-                }
-            }
-        }
-
-        public void endCurrentSpan(int pos) {
-            if (start > 0) {
-                spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
-                start = 0;
-            }
-        }
-
-        public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) {
-            if (start > 0) {
-                spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length));
-                start = 0;
-            }
-            return spans;
-        }
-    }
 }
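The motivation for swapping charAt for codePointAt: letters outside the Basic Multilingual Plane occupy two chars, so a char-based scan sees two surrogate halves instead of one alphabetic code point and mis-measures runs. A standalone, JDK-only sketch of the difference:

    String s = "ab\uD835\uDC00cd"; // contains U+1D400 (mathematical bold A), stored as two chars

    int cp = s.codePointAt(2);
    assert Character.isAlphabetic(cp);            // the whole code point is a letter
    assert Character.charCount(cp) == 2;          // so the index must advance by two
    assert !Character.isAlphabetic(s.charAt(2));  // a lone surrogate half is not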
@@ -6,18 +6,24 @@ import nu.marginalia.keyword.extractors.TitleKeywords;
 import nu.marginalia.keyword.extractors.UrlKeywords;
 import nu.marginalia.model.idx.WordFlags;

+import javax.annotation.Nullable;
+
 public class KeywordMetadata {

+    @Nullable
     private final TitleKeywords titleKeywords;
+    @Nullable
     private final NameLikeKeywords nameLikeKeywords;
+    @Nullable
     private final SubjectLikeKeywords subjectLikeKeywords;
+    @Nullable
     private final UrlKeywords urlKeywords;

     public KeywordMetadata(
-            TitleKeywords titleKeywords,
-            NameLikeKeywords nameLikeKeywords,
-            SubjectLikeKeywords subjectLikeKeywords,
-            UrlKeywords urlKeywords) {
+            @Nullable TitleKeywords titleKeywords,
+            @Nullable NameLikeKeywords nameLikeKeywords,
+            @Nullable SubjectLikeKeywords subjectLikeKeywords,
+            @Nullable UrlKeywords urlKeywords) {
         this.titleKeywords = titleKeywords;
         this.nameLikeKeywords = nameLikeKeywords;
         this.subjectLikeKeywords = subjectLikeKeywords;

@@ -32,23 +38,23 @@ public class KeywordMetadata {

         byte flags = 0;

-        if (subjectLikeKeywords.contains(stemmed)) {
+        if (subjectLikeKeywords != null && subjectLikeKeywords.contains(stemmed)) {
             flags |= WordFlags.Subjects.asBit();
         }

-        if (nameLikeKeywords.contains(stemmed)) {
+        if (nameLikeKeywords != null && nameLikeKeywords.contains(stemmed)) {
             flags |= WordFlags.NamesWords.asBit();
         }

-        if (titleKeywords.contains(stemmed)) {
+        if (titleKeywords != null && titleKeywords.contains(stemmed)) {
             flags |= WordFlags.Title.asBit();
         }

-        if (urlKeywords.containsUrl(stemmed)) {
+        if (urlKeywords != null && urlKeywords.containsUrl(stemmed)) {
             flags |= WordFlags.UrlPath.asBit();
         }

-        if (urlKeywords.containsDomain(stemmed)) {
+        if (urlKeywords != null && urlKeywords.containsDomain(stemmed)) {
             flags |= WordFlags.UrlDomain.asBit();
         }
@@ -0,0 +1,52 @@ (new file)
package nu.marginalia.keyword;

import nu.marginalia.keyword.model.DocumentWordSpan;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.sentence.tag.HtmlTag;

import java.util.ArrayList;
import java.util.List;

/**
 * Helper class to record spans of words
 */
class SpanRecorder {
    private final List<DocumentWordSpan> spans = new ArrayList<>();
    private final HtmlTag htmlTag;
    private int start = 0;

    public SpanRecorder(HtmlTag htmlTag) {
        this.htmlTag = htmlTag;
    }

    public void update(DocumentSentence sentence, int pos) {
        assert pos > 0;

        if (sentence.htmlTags.contains(htmlTag)) {
            if (start <= 0) start = pos;
        } else if (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY) {
            // special case for body tag, we match against no tag on the sentence
            if (start <= 0) start = pos;
        } else {
            if (start > 0) {
                spans.add(new DocumentWordSpan(htmlTag, start, pos));
                start = 0;
            }
        }
    }

    public void endCurrentSpan(int pos) {
        if (start > 0) {
            spans.add(new DocumentWordSpan(htmlTag, start, pos));
            start = 0;
        }
    }

    public List<DocumentWordSpan> finish(int length) {
        if (start > 0) {
            spans.add(new DocumentWordSpan(htmlTag, start, length));
            start = 0;
        }
        return spans;
    }
}
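A minimal usage sketch for the extracted class (the sentence variables are hypothetical; positions are 1-based, as the assert in update() requires): a span opens at the first position whose sentence carries the recorder's tag and closes at the first position that does not.

    SpanRecorder recorder = new SpanRecorder(HtmlTag.TITLE);

    recorder.update(titleSentence, 1); // tagged TITLE: opens a span at position 1
    recorder.update(titleSentence, 2); // still inside the same span
    recorder.update(bodySentence, 3);  // no TITLE tag: emits the span [1, 3)

    List<DocumentWordSpan> spans = recorder.finish(10); // returns the recorded spans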
@@ -2,11 +2,11 @@ package nu.marginalia.keyword.extractors;

 import it.unimi.dsi.fastutil.objects.Object2IntMap;
 import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
-import nu.marginalia.keyword.KeywordExtractor;
 import nu.marginalia.keyword.WordReps;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.DocumentSentence;
+import nu.marginalia.language.model.LanguageDefinition;
 import nu.marginalia.language.model.WordRep;
+import nu.marginalia.language.pos.PosPatternCategory;

 import java.util.*;
 import java.util.stream.Collectors;

@@ -16,12 +16,14 @@ public class NameLikeKeywords implements WordReps {
     private final List<WordRep> nameWords;
     private final Set<String> stemmed;

-    public NameLikeKeywords(KeywordExtractor keywordExtractor, DocumentLanguageData dld, int minCount) {
-        var counts = new Object2IntOpenHashMap<String>(100);
-        var instances = new HashMap<String, HashSet<WordRep>>(100);
+    public NameLikeKeywords(DocumentLanguageData dld, int minCount) {
+        LanguageDefinition languageDefinition = dld.language();
+
+        Object2IntOpenHashMap<String> counts = new Object2IntOpenHashMap<String>(100);
+        HashMap<String, HashSet<WordRep>> instances = new HashMap<String, HashSet<WordRep>>(100);

         for (DocumentSentence sent : dld) {
-            var keywords = keywordExtractor.getProperNames(sent);
+            var keywords = languageDefinition.matchGrammarPattern(sent, PosPatternCategory.NAME);
             for (var span : keywords) {
                 if (span.size() <= 1 && sent.isAllCaps(span.start))
                     continue;
@@ -1,11 +1,11 @@
 package nu.marginalia.keyword.extractors;

 import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
-import nu.marginalia.keyword.KeywordExtractor;
 import nu.marginalia.keyword.WordReps;
 import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.language.model.LanguageDefinition;
 import nu.marginalia.language.model.WordRep;
 import nu.marginalia.language.model.WordSpan;
+import nu.marginalia.language.pos.PosPatternCategory;
 import org.apache.commons.lang3.StringUtils;

 import java.util.*;

@@ -23,25 +23,18 @@ public class SubjectLikeKeywords implements WordReps {
     //  Greeks bearing gifts -> Greeks
     //  Steve McQueen drove fast | cars -> Steve McQueen

-    public SubjectLikeKeywords(KeywordExtractor keywordExtractor,
-                               WordsTfIdfCounts tfIdfCounts,
+    public SubjectLikeKeywords(WordsTfIdfCounts tfIdfCounts,
                                DocumentLanguageData dld) {
+        LanguageDefinition languageDefinition = dld.language();
+
         Map<String, Set<WordRep>> instances = new HashMap<>();

         for (var sentence : dld) {
-            for (WordSpan kw : keywordExtractor.getNouns(sentence)) {
-
-                if (kw.end + 2 >= sentence.length()) {
-                    continue;
-                }
-                if (sentence.isSeparatorComma(kw.end) || sentence.isSeparatorComma(kw.end + 1))
+            for (WordSpan kw : languageDefinition.matchGrammarPattern(sentence, PosPatternCategory.NOUN)) {
+                if (sentence.nextCommaPos(kw.end - 1) <= kw.end)
                     continue;

-                String nextTag = sentence.posTags[kw.end];
-                String nextNextTag = sentence.posTags[kw.end+1];
-
-                if (isVerb(nextTag) && isDetOrAdverbOrVerbOrNoun(nextNextTag)) {
+                if (languageDefinition.matchGrammarPattern(sentence, PosPatternCategory.SUBJECT_SUFFIX, kw.end)) {
                     var span = new WordSpan(kw.start, kw.end);
                     var rep = new WordRep(sentence, span);

@@ -94,17 +87,4 @@ public class SubjectLikeKeywords implements WordReps {
         return tfIdfCounts.getTfIdf(stemmed);
     }

-    private boolean isDetOrAdverbOrVerbOrNoun(String posTag) {
-        return "DT".equals(posTag) // determinant
-                || posTag.startsWith("RB") // adverb
-                || posTag.startsWith("VB") // verb
-                || posTag.startsWith("JJ") // adjective
-                || posTag.startsWith("P")
-                || posTag.startsWith("NN");
-    }
-
-    boolean isVerb(String posTag) {
-        return posTag.startsWith("VB")
-                && !posTag.equals("VB"); // not interested in the infinitive
-    }
 }
@@ -1,8 +1,7 @@
 package nu.marginalia.keyword.extractors;

-import nu.marginalia.keyword.KeywordExtractor;
 import nu.marginalia.keyword.WordReps;
 import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.language.model.LanguageDefinition;
 import nu.marginalia.language.model.WordRep;
 import nu.marginalia.language.sentence.tag.HtmlTag;

@@ -15,10 +14,12 @@ public class TitleKeywords implements WordReps {
     private final Set<WordRep> titleKeywords;
     private final Set<String> stemmed;

-    public TitleKeywords(KeywordExtractor keywordExtractor, DocumentLanguageData documentLanguageData) {
-        titleKeywords = documentLanguageData.findSentencesForTag(HtmlTag.TITLE).stream()
+    public TitleKeywords(DocumentLanguageData dld) {
+        LanguageDefinition languageDefinition = dld.language();
+
+        titleKeywords = dld.findSentencesForTag(HtmlTag.TITLE).stream()
                 .flatMap(sent ->
-                        keywordExtractor.getWordsFromSentence(sent).stream().sorted().distinct().map(w -> new WordRep(sent, w)))
+                        languageDefinition.getWordsFromSentence(sent).stream().sorted().distinct().map(w -> new WordRep(sent, w)))
                 .limit(100)
                 .collect(Collectors.toSet());
@@ -1,4 +1,4 @@
-package nu.marginalia.keyword;
+package nu.marginalia.keyword.extractors;

 import nu.marginalia.language.model.WordRep;
@@ -1,10 +1,10 @@
 package nu.marginalia.keyword.extractors;

 import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
-import nu.marginalia.keyword.KeywordExtractor;
 import nu.marginalia.keyword.WordReps;
 import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.language.model.LanguageDefinition;
 import nu.marginalia.language.model.WordRep;
+import nu.marginalia.language.pos.PosPatternCategory;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import org.apache.commons.lang3.StringUtils;

 import java.util.*;

@@ -26,14 +26,13 @@ public class WordsTfIdfCounts implements WordReps, Comparator<WordRep> {
     private final Set<WordRep> tfIdfHigh;

     public WordsTfIdfCounts(TermFrequencyDict dict,
-                            KeywordExtractor keywordExtractor,
                             DocumentLanguageData dld) {
         this.dict = dict;
         this.docCount = dict.docCount();

         this.tfIdf = new Object2IntOpenHashMap<>(10_000);
+        this.tfIdfHigh = new HashSet<>(100);

-        var counts = getCounts(keywordExtractor, dld);
+        var counts = getCounts(dld);
         int maxVal = maxValue(counts);
         Set<String> highTfIdfInstances = new HashSet<>();

@@ -48,9 +47,10 @@ public class WordsTfIdfCounts implements WordReps, Comparator<WordRep> {

         // Collect words with a high TF-IDF so that they can be marked with a bit flag

-        tfIdfHigh = new HashSet<>(100);
+        LanguageDefinition languageDefinition = dld.language();
+
         for (var sent : dld) {
-            var keywords = keywordExtractor.getKeywordsFromSentence(sent);
+            var keywords = languageDefinition.matchGrammarPattern(sent, PosPatternCategory.KEYWORD);
             for (var span : keywords) {
                 if (highTfIdfInstances.contains(sent.constructStemmedWordFromSpan(span))) {
                     tfIdfHigh.add(new WordRep(sent, span));

@@ -60,12 +60,14 @@ public class WordsTfIdfCounts implements WordReps, Comparator<WordRep> {
     }

-    private Object2IntOpenHashMap<String> getCounts(KeywordExtractor keywordExtractor, DocumentLanguageData dld) {
+    private Object2IntOpenHashMap<String> getCounts(DocumentLanguageData dld) {
+        LanguageDefinition languageDefinition = dld.language();
+
         Object2IntOpenHashMap<String> counts = new Object2IntOpenHashMap<>(10_000, 0.7f);
         counts.defaultReturnValue(0);

         for (var sent : dld) {
-            var keywords = keywordExtractor.getKeywordsFromSentence(sent);
+            var keywords = languageDefinition.matchGrammarPattern(sent, PosPatternCategory.KEYWORD);
             for (var span : keywords) {
                 counts.addTo(sent.constructStemmedWordFromSpan(span), 1);
             }
@@ -0,0 +1,23 @@
package nu.marginalia.keyword.model;

import nu.marginalia.sequence.VarintCodedSequence;

import java.util.List;

public record DocumentKeywords(List<String> keywords,
                               byte[] metadata,
                               List<VarintCodedSequence> positions,
                               byte[] spanCodes,
                               List<VarintCodedSequence> spanSequences) {

    public boolean isEmpty() {
        return keywords.isEmpty();
    }

    public int size() {
        return keywords.size();
    }

}

@@ -5,13 +5,11 @@ import it.unimi.dsi.fastutil.ints.IntArrayList;
 import it.unimi.dsi.fastutil.ints.IntList;
 import it.unimi.dsi.fastutil.objects.Object2ByteOpenHashMap;
 import nu.marginalia.language.sentence.tag.HtmlTag;
-import nu.marginalia.model.idx.CodedWordSpan;
 import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.sequence.VarintCodedSequence;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import java.nio.ByteBuffer;
 import java.util.*;

 public class DocumentKeywordsBuilder {
@@ -29,6 +27,7 @@ public class DocumentKeywordsBuilder {
     // be plenty. The lexicon writer has another limit that's higher.
     private final int MAX_WORD_LENGTH = 64;
     private final int MAX_POSITIONS_PER_WORD = 512;
+    private final int MAX_SPANS_PER_TYPE = 8192;

     private static final Logger logger = LoggerFactory.getLogger(DocumentKeywordsBuilder.class);

@@ -36,13 +35,22 @@ public class DocumentKeywordsBuilder {
         this(1600);
     }

-    public DocumentKeywords build(ByteBuffer workArea) {
+    public DocumentKeywordsBuilder(int capacity) {
+        wordToMeta = new Object2ByteOpenHashMap<>(capacity);
+        wordToPos = new HashMap<>(capacity);
+    }
+
+
+    public DocumentKeywords build() {
         final List<String> wordArray = new ArrayList<>(wordToMeta.size());
         final TByteArrayList meta = new TByteArrayList(wordToMeta.size());
         final List<VarintCodedSequence> positions = new ArrayList<>(wordToMeta.size());
+        final List<VarintCodedSequence> spanSequences = new ArrayList<>(wordSpans.size());
+        final byte[] spanCodes = new byte[wordSpans.size()];

         var iter = wordToMeta.object2ByteEntrySet().fastIterator();

         // Encode positions
         while (iter.hasNext()) {
             var entry = iter.next();

@@ -59,27 +67,26 @@ public class DocumentKeywordsBuilder {
         }

         // Encode spans
-        List<CodedWordSpan> spans = new ArrayList<>(wordSpans.size());
-
         wordSpans.forEach((tag, spansForTag) -> {
             spansForTag.sort(Comparator.comparingInt(DocumentWordSpan::start));

             var positionsForTag = new IntArrayList(spansForTag.size() * 2);

             for (var span : spansForTag) {
                 positionsForTag.add(span.start());
                 positionsForTag.add(span.end());

+                if (positionsForTag.size() >= MAX_SPANS_PER_TYPE)
+                    break;
             }

-            spans.add(new CodedWordSpan(tag.code, VarintCodedSequence.generate(positionsForTag)));
+            spanCodes[spanSequences.size()] = tag.code;
+            spanSequences.add(VarintCodedSequence.generate(positionsForTag));
         });

-        return new DocumentKeywords(wordArray, meta.toArray(), positions, spans);
+        return new DocumentKeywords(wordArray, meta.toArray(), positions, spanCodes, spanSequences);
     }

-    public DocumentKeywordsBuilder(int capacity) {
-        wordToMeta = new Object2ByteOpenHashMap<>(capacity);
-        wordToPos = new HashMap<>(capacity);
-    }
-
     public void addMeta(String word, byte meta) {
         if (word.length() > MAX_WORD_LENGTH)
@@ -174,6 +181,4 @@ public class DocumentKeywordsBuilder {
         return this.importantWords;
     }

-    public record DocumentWordSpan(HtmlTag tag, int start, int end) {
-    }
 }
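A quick sketch of how the reworked builder is meant to be driven after this change, assuming only the signatures visible in the diff above; the keyword string and flag byte are illustrative values, not taken from the repo:

// Illustrative only: any metadata byte works here
DocumentKeywordsBuilder builder = new DocumentKeywordsBuilder(1600);
builder.addMeta("marginalia", (byte) 1);

DocumentKeywords keywords = builder.build();   // no ByteBuffer work area needed any more

// spanCodes and spanSequences are parallel: spanCodes[i] holds the
// HtmlTag code belonging to spanSequences.get(i)
assert keywords.spanCodes().length == keywords.spanSequences().size();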
@@ -0,0 +1,6 @@
package nu.marginalia.keyword.model;

import nu.marginalia.language.sentence.tag.HtmlTag;

public record DocumentWordSpan(HtmlTag tag, int start, int end) {
}
@@ -0,0 +1,179 @@
package nu.marginalia.language;

import io.jooby.Context;
import io.jooby.Jooby;
import io.jooby.MapModelAndView;
import io.jooby.ModelAndView;
import nu.marginalia.LanguageModels;
import nu.marginalia.WmsaHome;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.keyword.extractors.*;
import nu.marginalia.language.config.LanguageConfigLocation;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;

public class LanguageProcessingTool extends Jooby {
    private static final Logger logger = LoggerFactory.getLogger(LanguageProcessingTool.class);
    private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
    private final TermFrequencyDict termFrequencyDict;

    static void main(String[] args) {
        Jooby.runApp(args, LanguageProcessingTool::new);
    }

    public LanguageProcessingTool() {
        try {
            LanguageModels languageModels = getLanguageModels();
            termFrequencyDict = new TermFrequencyDict(languageModels);

            sentenceExtractorProvider = new ThreadLocalSentenceExtractorProvider(
                    new LanguageConfiguration(languageModels, new LanguageConfigLocation.Experimental()),
                    languageModels
            );

            // Depending on how the tool is started, we may be in the project root, or the module root;
            // so here's some guesswork to try to suss out which one it is...
            Path basePath = Path.of("code/functions/language-processing/").toAbsolutePath();
            if (!Files.exists(basePath)) {
                basePath = Path.of(".").toAbsolutePath();
            }

            System.out.println("Base path: " + basePath);

            if (Files.exists(basePath.resolve("resources/ltt/jte")))
                install(new nu.marginalia.service.server.jte.JteModule(basePath.resolve("resources/ltt/jte")));
            if (Files.exists(basePath.resolve("resources/ltt/static")))
                assets("/*", basePath.resolve("resources/ltt/static"));

            get("/", this::handleKeywords);
            post("/", this::handleKeywords);
        }
        catch (Exception ex) {
            logger.error("Failed to initialize LanguageProcessingTool", ex);
            throw new RuntimeException(ex);
        }
    }

    @NotNull
    private ModelAndView<?> handleKeywords(Context context) throws URISyntaxException {
        if ("GET".equals(context.getMethod())) {
            return new MapModelAndView("keywords.jte")
                    .put("textSample", "");
        }
        else if (!"POST".equals(context.getMethod())) {
            throw new IllegalArgumentException("Invalid method");
        }

        String textSample = context.form("textSample").value();

        // Run sentence extraction on the text as-is
        DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(textSample);

        // Run individual extraction logic
        var tfIdfCounts = new WordsTfIdfCounts(termFrequencyDict, dld);
        var titleKeywords = new TitleKeywords(dld);
        var nameLikeKeywords = new NameLikeKeywords(dld, 2);
        var subjectLikeKeywords = new SubjectLikeKeywords(tfIdfCounts, dld);
        var artifactKeywords = new ArtifactKeywords(dld);

        // Run full extraction logic to capture positioning etc
        var extractedKeywords = new DocumentKeywordExtractor(termFrequencyDict)
                .extractKeywords(dld, new LinkTexts(), new EdgeUrl("https://www.example.com/"));

        return new MapModelAndView("keywords.jte")
                .put("textSample", textSample)
                .put("language", dld.language())
                .put("tagColors", posTagStyles(dld))
                .put("sentences", dld.sentences())
                .put("tfIdfReps", tfIdfCounts.getReps())
                .put("titleReps", titleKeywords.getReps())
                .put("nameLikeReps", nameLikeKeywords.getReps())
                .put("subjectLikeReps", subjectLikeKeywords.getReps())
                .put("artifacts", artifactKeywords.getWords())
                .put("importantWords", extractedKeywords.importantWords)
                .put("positionedWords", extractedKeywords.wordToPos);
    }

    /**
     * Generate unique colors for each POS tag, to help the UI rendering
     */
    public static Map<Long, String> posTagStyles(DocumentLanguageData dld) {
        Map<Long, String> styles = new HashMap<>();

        // we sort them first to ensure the most common tags are guaranteed to have
        // the largest difference between colors
        Map<Long, Integer> counts = new HashMap<>();
        for (var sentence : dld.sentences()) {
            for (var tag : sentence.posTags) {
                counts.merge(tag, 1, Integer::sum);
            }
        }

        List<Long> posTagsByCount = counts
                .entrySet().stream()
                .sorted(Map.Entry.comparingByValue(Comparator.reverseOrder()))
                .map(Map.Entry::getKey)
                .toList();

        for (int i = 0; i < posTagsByCount.size(); i++) {
            String style = "text-" + switch (i & 0x7) {
                case 0 -> "red";
                case 1 -> "green";
                case 2 -> "blue";
                case 3 -> "yellow";
                case 4 -> "purple";
                case 5 -> "cyan";
                case 6 -> "pink";
                default -> "gray";
            } + "-" + switch ((i / 8) & 3) {
                case 0 -> "900";
                case 3 -> "500";
                case 1 -> "750";
                case 2 -> "400";
                default -> "300";
            };
            styles.put(posTagsByCount.get(i), style);
        }
        return styles;
    }

    private static final Path LANGUAGE_MODELS_DEFAULT = WmsaHome.getHomePath().resolve("model");

    private static Path getLanguageModelsPath() {
        final Path languageModelsHome = Optional.ofNullable(System.getenv("LANGUAGE_MODELS_HOME"))
                .map(Path::of)
                .orElse(LANGUAGE_MODELS_DEFAULT);

        if (!Files.isDirectory(languageModelsHome)) {
            throw new IllegalStateException("Could not find $LANGUAGE_MODELS_HOME, see doc/language-models.md");
        }
        return languageModelsHome;
    }

    private static LanguageModels getLanguageModels() {
        var languageModelsHome = getLanguageModelsPath();

        return new LanguageModels(
                languageModelsHome.resolve("tfreq-new-algo3.bin"),
                languageModelsHome.resolve("opennlp-sentence.bin"),
                languageModelsHome.resolve("English.RDR"),
                languageModelsHome.resolve("English.DICT"),
                languageModelsHome.resolve("lid.176.ftz"),
                languageModelsHome.resolve("segments.bin")
        );
    }
}
@@ -0,0 +1,43 @@
package nu.marginalia.language.config;

import nu.marginalia.WmsaHome;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

sealed public interface LanguageConfigLocation {
    InputStream findLanguageConfiguration() throws IOException;

    final class Auto implements LanguageConfigLocation {
        @Override
        public InputStream findLanguageConfiguration() throws IOException {
            Path filesystemPath = WmsaHome.getLangugeConfig();
            if (Files.exists(filesystemPath)) {
                return Files.newInputStream(filesystemPath, StandardOpenOption.READ);
            }
            if (Boolean.getBoolean("language.experimental")) {
                return ClassLoader.getSystemResourceAsStream("languages-experimental.xml");
            } else {
                return ClassLoader.getSystemResourceAsStream("languages-default.xml");
            }
        }
    }

    final class Experimental implements LanguageConfigLocation {
        @Override
        public InputStream findLanguageConfiguration() throws IOException {
            return ClassLoader.getSystemResourceAsStream("languages-experimental.xml");
        }
    }

    final class Default implements LanguageConfigLocation {
        @Override
        public InputStream findLanguageConfiguration() throws IOException {
            return ClassLoader.getSystemResourceAsStream("languages-default.xml");
        }
    }
}
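How the three implementations resolve, as a small usage sketch based only on the Auto logic above (the system property name language.experimental comes straight from it):

// Auto: prefer an on-disk override, else fall back to the bundled XML;
// running with -Dlanguage.experimental=true flips the bundled default
// to the experimental language set.
LanguageConfigLocation location = new LanguageConfigLocation.Auto();
try (InputStream is = location.findLanguageConfiguration()) {
    // hand the stream to LanguageConfiguration for parsing
}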
@@ -0,0 +1,405 @@
package nu.marginalia.language.config;

import com.github.jfasttext.JFastText;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.LanguageModels;
import nu.marginalia.WmsaHome;
import nu.marginalia.language.encoding.UnicodeNormalization;
import nu.marginalia.language.keywords.KeywordHasher;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.pos.PosPattern;
import nu.marginalia.language.pos.PosPatternCategory;
import nu.marginalia.language.pos.PosTagger;
import nu.marginalia.language.stemming.Stemmer;
import org.jsoup.nodes.TextNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import javax.annotation.Nullable;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.nio.file.StandardOpenOption;
import java.security.DigestInputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.*;

@Singleton
public class LanguageConfiguration {
    private static final Logger logger = LoggerFactory.getLogger(LanguageConfiguration.class);

    private final Map<String, Path> resources = new HashMap<>();
    private final Map<String, LanguageDefinition> languages = new LinkedHashMap<>();
    private final JFastText fastTextLanguageModel = new JFastText();

    public Optional<LanguageDefinition> identifyLanguage(org.jsoup.nodes.Document jsoupDoc) {
        StringBuilder sampleBuilder = new StringBuilder();
        jsoupDoc.body().traverse((node, _) -> {
            if (sampleBuilder.length() > 4096)
                return;
            if (!(node instanceof TextNode tn))
                return;

            sampleBuilder.append(' ').append(tn.text());
        });
        return identifyLanguage(sampleBuilder.toString());
    }

    public Optional<LanguageDefinition> identifyLanguage(String sample) {
        String prediction = fastTextLanguageModel.predict(sample);
        if (null == prediction)
            return Optional.empty();

        if (prediction.length() == "__label__??".length()) {
            String isoCode = prediction.substring("__label__".length());
            return Optional.ofNullable(getLanguage(isoCode));
        }

        return Optional.empty();
    }

    public Optional<LanguageDefinition> identifyLanguage(String sample, String fallbackIsoCode) {
        return identifyLanguage(sample).or(() -> Optional.ofNullable(getLanguage(fallbackIsoCode)));
    }

    public List<LanguageDefinition> languages() {
        return new ArrayList<>(this.languages.values());
    }

    public Map<String, LanguageDefinition> languagesMap() {
        return Collections.unmodifiableMap(languages);
    }

    @Nullable
    public LanguageDefinition getLanguage(String language) {
        return languages.get(language);
    }

    @Inject
    public LanguageConfiguration() throws IOException, ParserConfigurationException, SAXException {
        this(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Auto());
    }

    public LanguageConfiguration(LanguageConfigLocation languageFile) throws IOException, ParserConfigurationException, SAXException {
        this(WmsaHome.getLanguageModels(), languageFile);
    }

    public LanguageConfiguration(LanguageModels lm, LanguageConfigLocation languageFile)
            throws IOException, ParserConfigurationException, SAXException {
        fastTextLanguageModel.loadModel(lm.fasttextLanguageModel.toString());

        try (var languagesXmlStream = languageFile.findLanguageConfiguration()) {
            if (languagesXmlStream == null)
                throw new IllegalStateException("languages-default.xml resource not found in classpath");

            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            DocumentBuilder builder = factory.newDocumentBuilder();
            Document doc = builder.parse(languagesXmlStream);

            parseResources(doc);
            parseLanguages(doc);
        }

        logger.info("Loaded language configuration: {}", languages);
    }

    private void parseLanguages(Document doc) {
        NodeList languageNodes = doc.getElementsByTagName("language");

        for (int i = 0; i < languageNodes.getLength(); i++) {
            Element languageTag = (Element) languageNodes.item(i);

            boolean disabled = "TRUE".equalsIgnoreCase(languageTag.getAttribute("disabled"));
            if (disabled)
                continue;

            String isoCode = languageTag.getAttribute("isoCode").toLowerCase();
            String name = languageTag.getAttribute("name");

            try {
                PosTagger posTagger = parsePosTag(languageTag, isoCode);
                Stemmer stemmer = parseStemmerTag(languageTag, posTagger, isoCode);
                KeywordHasher keywordHasher = parseHasherTag(languageTag, isoCode);
                Map<PosPatternCategory, List<PosPattern>> posPatterns =
                        parsePosPatterns(posTagger, languageTag, isoCode);
                UnicodeNormalization unicodeNormalization = parseUnicodeNormalization(languageTag, isoCode);

                languages.put(isoCode,
                        new LanguageDefinition(isoCode, name, stemmer, unicodeNormalization, keywordHasher, posTagger, posPatterns));
            }
            catch (IOException ex) {
                logger.error("Failed to set up language " + isoCode, ex);
            }
        }
    }

    private UnicodeNormalization parseUnicodeNormalization(Element languageTag, String isoCode) {
        NodeList normalizationTags = languageTag.getElementsByTagName("unicodeNormalization");
        if (normalizationTags.getLength() == 0)
            return new UnicodeNormalization.JustNormalizeQuotes();
        Element normalizationTag = (Element) normalizationTags.item(0);
        String algorithm = normalizationTag.getAttribute("algorithm");

        return switch (algorithm) {
            case "minimal" -> new UnicodeNormalization.JustNormalizeQuotes();
            case "e-accents" -> new UnicodeNormalization.FlattenEAccents();
            case "german" -> new UnicodeNormalization.Flattenß();
            case "maximal-latin" -> new UnicodeNormalization.FlattenAllLatin();
            default -> throw new IllegalArgumentException("Invalid algorithm " + algorithm + " on language configuration for " + isoCode);
        };
    }

    private Map<PosPatternCategory, List<PosPattern>> parsePosPatterns(@Nullable PosTagger posTagger,
                                                                       Element languageTag, String isoCode) {
        if (null == posTagger)
            return Map.of();

        Map<PosPatternCategory, List<PosPattern>> ret = new HashMap<>();
        NodeList ngramsElements = languageTag.getElementsByTagName("ngrams");

        for (int i = 0; i < ngramsElements.getLength(); i++) {
            Element ngramsTag = (Element) ngramsElements.item(i);
            String type = ngramsTag.getAttribute("type");

            PosPatternCategory category = switch (type) {
                case "name" -> PosPatternCategory.NAME;
                case "noun" -> PosPatternCategory.NOUN;
                case "keyword" -> PosPatternCategory.KEYWORD;
                case "title" -> PosPatternCategory.TITLE;
                case "subject-suffix" -> PosPatternCategory.SUBJECT_SUFFIX;
                default -> throw new IllegalArgumentException("Invalid ngrams type in " + isoCode + ", what is '" + type + "'?");
            };

            NodeList posPatternsList = ngramsTag.getElementsByTagName("pospattern");
            for (int j = 0; j < posPatternsList.getLength(); j++) {
                Element posPatternTag = (Element) posPatternsList.item(j);
                ret.computeIfAbsent(category, (k) -> new ArrayList<>())
                        .add(new PosPattern(posTagger, posPatternTag.getTextContent()));
            }
        }

        return ret;
    }

    @Nullable
    private PosTagger parsePosTag(Element languageTag, String isoCode) throws IOException {
        NodeList rdrElements = languageTag.getElementsByTagName("rdrTagger");
        if (rdrElements.getLength() < 1) {
            return null;
        }
        else if (rdrElements.getLength() > 1) {
            throw new IllegalStateException("Multiple rdr taggers defined in " + isoCode);
        }
        Element rdrElement = (Element) rdrElements.item(0);

        String dictId = rdrElement.getAttribute("dictId");
        String rdrId = rdrElement.getAttribute("rdrId");

        Path dictPath = resources.get(dictId);
        Path rdrPath = resources.get(rdrId);

        if (null == dictPath)
            throw new IllegalArgumentException("language.xml: dictPath id " + dictId
                    + " does not map to a resource in " + isoCode);
        if (null == rdrPath)
            throw new IllegalArgumentException("language.xml: rdrPath id " + rdrId
                    + " does not map to a resource in " + isoCode);

        return new PosTagger(isoCode, dictPath, rdrPath);
    }

    private KeywordHasher parseHasherTag(Element languageElement, String isoCode) {
        NodeList keywordHasherElements = languageElement.getElementsByTagName("keywordHash");
        if (keywordHasherElements.getLength() != 1) {
            throw new IllegalArgumentException(
                    "language.xml: No keywordHasher block for language element " + isoCode);
        }
        Element keywordHasheElement = (Element) keywordHasherElements.item(0);

        String hasherName = keywordHasheElement.getAttribute("algorithm");

        return switch (hasherName) {
            case "asciish" -> new KeywordHasher.AsciiIsh();
            case "utf8" -> new KeywordHasher.Utf8();
            default -> throw new IllegalArgumentException(
                    "language.xml: Unknown keywordHash name " + hasherName + " in " + isoCode);
        };
    }

    private Stemmer parseStemmerTag(Element languageElement, PosTagger posTagger, String isoCode) {
        NodeList stemmerElements = languageElement.getElementsByTagName("stemmer");
        if (stemmerElements.getLength() != 1) {
            throw new IllegalArgumentException(
                    "language.xml: No stemmer block for language element " + isoCode);
        }
        Element stemmerElement = (Element) stemmerElements.item(0);

        String stemmerName = stemmerElement.getAttribute("algorithm");
        String stemmerVariant = stemmerElement.getAttribute("variant");

        PosPattern inclusionPattern = null;
        NodeList posPatternList = stemmerElement.getElementsByTagName("pospattern");
        if (posPatternList.getLength() >= 1) {
            Element posElement = (Element) posPatternList.item(0);
            inclusionPattern = new PosPattern(posTagger, posElement.getTextContent());
        }

        return switch (stemmerName.toLowerCase()) {
            case "porter" -> new Stemmer.Porter(inclusionPattern);
            case "snowball" -> new Stemmer.Snowball(stemmerVariant, inclusionPattern);
            case "none" -> new Stemmer.NoOpStemmer();
            default -> throw new IllegalArgumentException(
                    "language.xml: Unknown stemmer name " + stemmerName + " in " + isoCode);
        };
    }

    private void parseResources(Document doc) throws IOException {
        NodeList resourceNodes = doc.getElementsByTagName("resource");
        for (int i = 0; i < resourceNodes.getLength(); i++) {
            Element resourceTag = (Element) resourceNodes.item(i);

            String resourceId = resourceTag.getAttribute("id");
            String resourceMd5 = resourceTag.getAttribute("md5");
            Path resourcePath = WmsaHome.getDataPath().resolve(resourceTag.getAttribute("path"));
            String resourceHref = resourceTag.getAttribute("href");

            if (!validateResource(resourcePath, resourceMd5)) {
                boolean success = false;
                try {
                    success = fetchResource(resourceHref, resourcePath, resourceMd5);
                } catch (URISyntaxException | IOException ex) {
                    logger.error(ex.getMessage(), ex);
                    success = false;
                }

                // It's likely that if we were to just explode here, a docker-compose restart:always
                // policy would put us in a loop that repeatedly fails to download the same file.
                // We'd like to avoid that by stalling and awaiting human intervention.
                while (!success) {
                    logger.error("Stopping to prevent restart loop");
                    try {
                        Thread.sleep(1000);
                    } catch (InterruptedException e) {
                        throw new RuntimeException(e);
                    }
                }
            }
            if (resources.put(resourceId, resourcePath) != null)
                throw new IllegalStateException(
                        "Resource with id " + resourceId + " already exists");
        }
    }

    private boolean fetchResource(String resourceUrl, Path resourcePath, String resourceMd5)
            throws IOException, URISyntaxException {

        Path parentPath = resourcePath.getParent();
        if (!Files.isDirectory(parentPath)) {
            logger.info("Setting up directory {}", parentPath);
            Files.createDirectories(parentPath);
        }

        logger.info("Fetching {}", resourceUrl);

        URL url = new URI(resourceUrl).toURL();
        Path tempFile = Files.createTempFile("resource", "dat");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try (InputStream is = conn.getInputStream();
             OutputStream os = Files.newOutputStream(tempFile, StandardOpenOption.WRITE,
                     StandardOpenOption.TRUNCATE_EXISTING)) {
            is.transferTo(os);
            os.flush();

            String actualMd5 = getFileMD5(tempFile);
            if (!resourceMd5.isBlank() && !Objects.equals(resourceMd5, actualMd5)) {
                logger.error("Freshly downloaded resource {} does not match md5sum {}", resourceUrl,
                        resourceMd5);
                return false;
            } else {
                logger.info("Downloaded resource {} to {} ** md5sum {}", resourceUrl, resourcePath,
                        actualMd5);
                Files.move(tempFile, resourcePath, StandardCopyOption.REPLACE_EXISTING);
                return true;
            }
        } catch (IOException ex) {
            logger.error("IOException", ex);
            return false;
        } finally {
            conn.disconnect();
            Files.deleteIfExists(tempFile);
        }
    }

    private boolean validateResource(Path resourcePath, String providedMd5Sum) throws IOException {
        resourcePath = resourcePath.normalize();

        if (!resourcePath.normalize().startsWith(WmsaHome.getDataPath()))
            throw new IllegalArgumentException(
                    "Resource path has escaped $WMSA_HOME/data: " + resourcePath);
        if (!Files.exists(resourcePath)) {
            logger.info("Resource path does not exist: " + resourcePath);
            return false;
        }

        String actualMd5 = getFileMD5(resourcePath);
        if (providedMd5Sum.isBlank()) {
            logger.info("No md5sum provided for resource path: {}, but was calculated to {}",
                    resourcePath, actualMd5);
            return true;
        }

        if (Objects.equals(actualMd5, providedMd5Sum)) {
            return true;
        } else {
            logger.error("MD5 checksum mismatch for {} -- {}", resourcePath, providedMd5Sum);
            return false;
        }
    }

    public String getFileMD5(Path filePath) {
        try (InputStream fis = Files.newInputStream(filePath)) {
            MessageDigest md = MessageDigest.getInstance("MD5");
            DigestInputStream dis = new DigestInputStream(fis, md);

            // Read the file; reading through the DigestInputStream updates the digest
            byte[] buffer = new byte[8192];
            while (dis.read(buffer) != -1) {
            }

            byte[] digest = md.digest();

            // Convert to hex
            StringBuilder hexString = new StringBuilder();
            for (byte b : digest) {
                String hex = Integer.toHexString(0xff & b);
                if (hex.length() == 1) {
                    hexString.append('0');
                }
                hexString.append(hex);
            }
            return hexString.toString();
        } catch (IOException | NoSuchAlgorithmException e) {
            throw new RuntimeException(e);
        }
    }
}
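Putting the class to work — a minimal sketch using only the constructors and lookup methods above. The sample text and fallback code are made up, and the constructor's checked exceptions (IOException, ParserConfigurationException, SAXException) must be handled by the caller:

LanguageConfiguration config =
        new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Default());

// fastText yields predictions like "__label__en", which identifyLanguage
// maps back onto the configured language definitions
Optional<LanguageDefinition> lang = config.identifyLanguage("the quick brown fox", "en");
lang.ifPresent(def -> System.out.println(def.isoCode() + ": " + def.displayName()));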
@@ -0,0 +1,227 @@
package nu.marginalia.language.encoding;

public interface UnicodeNormalization {

    String flattenUnicode(String s);

    static final boolean NO_FLATTEN_UNICODE =
            Boolean.getBoolean("system.noFlattenUnicode");

    class JustNormalizeQuotes implements UnicodeNormalization {
        public String flattenUnicode(String s) {
            if (NO_FLATTEN_UNICODE)
                return s;

            if (isPlainAscii(s)) {
                return s;
            }

            StringBuilder sb = new StringBuilder(s.length() + 10);

            for (int i = 0; i < s.length(); ) {
                int c = s.codePointAt(i);
                i += Character.charCount(c);

                if ("\u201C\u201D".indexOf(c) >= 0) {
                    sb.append('"');
                }
                else {
                    sb.appendCodePoint(c);
                }
            }

            return sb.toString();
        }
    }

    class FlattenEAccents implements UnicodeNormalization {
        public String flattenUnicode(String s) {
            if (NO_FLATTEN_UNICODE)
                return s;

            if (isPlainAscii(s)) {
                return s;
            }

            StringBuilder sb = new StringBuilder(s.length() + 10);

            int numCp = s.codePointCount(0, s.length());

            for (int i = 0; i < numCp; ) {
                int c = s.codePointAt(i);
                i += Character.charCount(c);

                if ("\u201C\u201D".indexOf(c) >= 0) {
                    sb.append('"');
                }
                else if ("é".indexOf(c) >= 0) {
                    sb.append('e');
                }
                else {
                    sb.appendCodePoint(c);
                }
            }

            return sb.toString();
        }
    }

    class Flattenß implements UnicodeNormalization {
        public String flattenUnicode(String s) {
            if (NO_FLATTEN_UNICODE)
                return s;

            if (isPlainAscii(s)) {
                return s;
            }

            StringBuilder sb = new StringBuilder(s.length() + 10);

            for (int i = 0; i < s.length(); ) {
                int c = s.codePointAt(i);
                i += Character.charCount(c);

                if ("\u201C\u201D".indexOf(c) >= 0) {
                    sb.append('"');
                } else if ('ß' == c) {
                    sb.append("ss");
                }
                else {
                    sb.appendCodePoint(c);
                }
            }

            return sb.toString();
        }
    }

    class FlattenAllLatin implements UnicodeNormalization {

        public String flattenUnicode(String s) {
            if (NO_FLATTEN_UNICODE)
                return s;

            if (isPlainAscii(s)) {
                return s;
            }

            StringBuilder sb = new StringBuilder(s.length() + 10);

            // Falsehoods programmers believe about the latin alphabet ;-)
            for (int i = 0; i < s.length(); ) {
                int c = s.codePointAt(i);
                i += Character.charCount(c);

                if ("\u201C\u201D".indexOf(c) >= 0) {
                    sb.append('"');
                }
                else if ("áâàȁăåäāǟãąą̊ḁẚⱥ".indexOf(c) >= 0) {
                    sb.append('a');
                }
                else if ("ḃḅḇƀɓ".indexOf(c) >= 0) {
                    sb.append('b');
                }
                else if ("ćĉčçḉċƈȼ".indexOf(c) >= 0) {
                    sb.append('c');
                }
                else if ("ɗḓďḋḍḏḑđðɖḏ".indexOf(c) >= 0) {
                    sb.append('d');
                }
                else if ("éêèȅěëēẽĕęėẹȇḕḗḙḛḝɇ".indexOf(c) >= 0) {
                    sb.append('e');
                }
                else if ("ḟƒ".indexOf(c) >= 0) {
                    sb.append('f');
                }
                else if ("ǵĝǧğġģɠḡǥ".indexOf(c) >= 0) {
                    sb.append('g');
                }
                else if ("ĥȟḧḣḥẖḩḫħⱨ".indexOf(c) >= 0) {
                    sb.append('h');
                }
                else if ("iıíîìȉïḯīĩįịḭ".indexOf(c) >= 0) {
                    sb.append('i');
                }
                else if ("ĵǰɉ".indexOf(c) >= 0) {
                    sb.append('j');
                }
                else if ("ḱǩķḳḵƙⱪ".indexOf(c) >= 0) {
                    sb.append('k');
                }
                else if ("ĺłḽľļḷḹḻƚɫⱡ".indexOf(c) >= 0) {
                    sb.append('l');
                }
                else if ("ḿṁṃ".indexOf(c) >= 0) {
                    sb.append('m');
                }
                else if ("ŋńǹñṋňṅṇṉʼnn̈ņ".indexOf(c) >= 0) {
                    sb.append('n');
                }
                else if ("óőôòȍŏȯȱöȫōṓṑõṍṏȭøǿǫǭọȏơ".indexOf(c) >= 0) {
                    sb.append('o');
                }
                else if ("ṕṗƥᵽ".indexOf(c) >= 0) {
                    sb.append('p');
                }
                else if ("ꝗ".indexOf(c) >= 0) {
                    sb.append('q');
                }
                else if ("ŕȑřŗṙṛṝṟɍɽ".indexOf(c) >= 0) {
                    sb.append('r');
                }
                else if ("śṥŝšṧşșṡṣṩ".indexOf(c) >= 0) {
                    sb.append('s');
                }
                else if ("ťṱẗţțŧṫṭṯⱦ".indexOf(c) >= 0) {
                    sb.append('t');
                }
                else if ("úùûŭưűüūṻųůũṹụṳṵṷʉ".indexOf(c) >= 0) {
                    sb.append('u');
                }
                else if ("ṽṿʋỽ".indexOf(c) >= 0) {
                    sb.append('v');
                }
                else if ("ẃŵẁẅẘẇẉⱳ".indexOf(c) >= 0) {
                    sb.append('w');
                }
                else if ("x̂ẍẋ".indexOf(c) >= 0) {
                    sb.append('x');
                }
                else if ("ƴýŷỳÿȳỹẙẏy̨ɏỿ".indexOf(c) >= 0) {
                    sb.append('y');
                }
                else if ("źẑžżẓẕƶȥ".indexOf(c) >= 0) {
                    sb.append('z');
                }
                else if ("Þþ".indexOf(c) >= 0) {
                    sb.append("th");
                }
                else if ('ß' == c) {
                    sb.append("ss");
                }
                else if (isAscii(c)) {
                    sb.append((char) c);
                }
            }

            return sb.toString();
        }

    }

    private static boolean isPlainAscii(String s) {
        for (int i = 0; i < s.length(); ) {
            int c = s.codePointAt(i);
            if (!isAscii(c))
                return false;
            i += Character.charCount(c);
        }
        return true;
    }

    private static boolean isAscii(int c) {
        return (c & ~0x7f) == 0;
    }

}
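Expected behavior of the most aggressive strategy, inferred from the mappings above (an illustrative sketch, not a test from the repo):

// maximal-latin folds diacritics to ASCII, expands ß and Þ, and normalizes
// curly quotes; code points with no mapping and no ASCII form are dropped.
String flat = new UnicodeNormalization.FlattenAllLatin()
        .flattenUnicode("Schloß Café \u201Cquoted\u201D");
// flat is now: Schloss Cafe "quoted"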
@@ -0,0 +1,27 @@
package nu.marginalia.language.keywords;

import nu.marginalia.hash.MurmurHash3_128;

public sealed interface KeywordHasher {
    MurmurHash3_128 hasher = new MurmurHash3_128();

    long hashKeyword(String keyword);

    /** Hash algorithm that seeds a Murmur128 algorithm with Java's string hashCode(), but
     * then only looks at 7 bit ASCII for the Murmur calculations.  This works well for English
     * and similar languages, but falls apart completely for languages that are not dominated by
     * the 7 bit ASCII subset.
     */
    final class AsciiIsh implements KeywordHasher {
        public long hashKeyword(String keyword) {
            return hasher.hashNearlyASCII(keyword);
        }
    }

    /** Hash algorithm that is based on Murmur128 folded over on itself to make a 64 bit key */
    final class Utf8 implements KeywordHasher {
        public long hashKeyword(String keyword) {
            return hasher.hashUtf8(keyword);
        }
    }
}
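The practical difference between the two hashers, per the javadoc above — a brief hedged sketch:

KeywordHasher asciiIsh = new KeywordHasher.AsciiIsh();
KeywordHasher utf8 = new KeywordHasher.Utf8();

// Both produce stable 64-bit keyword ids for ASCII-dominated text;
// for scripts outside 7-bit ASCII only the utf8 variant discriminates reliably.
long id = utf8.hashKeyword("marginalia");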
@@ -15,11 +15,13 @@ import java.util.stream.Stream;
 *
 * @see SentenceExtractor
 */
-public record DocumentLanguageData(List<DocumentSentence> sentences, String text) implements Iterable<DocumentSentence> {
+public record DocumentLanguageData(LanguageDefinition language,
+                                   List<DocumentSentence> sentences,
+                                   String text) implements Iterable<DocumentSentence> {

-    public DocumentLanguageData(List<DocumentSentence> sentences,
-                                String text)
+    public DocumentLanguageData(LanguageDefinition language, List<DocumentSentence> sentences, String text)
     {
+        this.language = language;
         this.sentences = Collections.unmodifiableList(sentences);
         this.text = text;
     }
@@ -19,13 +19,14 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>
     /** A span of words in a sentence */
     public final String[] wordsLowerCase;
     public final String[] stemmedWords;
-    public final String[] posTags;
+    public final long[] posTags;

     /** A set of HTML tags that surround the sentence */
     public final EnumSet<HtmlTag> htmlTags;

     /** A bitset indicating whether the word is a stop word */
     private final BitSet isStopWord;
+    private final BitSet includeInStemming;

     /** A bitset indicating whether the word is capitalized */
     private final BitSet isCapitalized;
@@ -37,16 +38,16 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>
     // where false = COMMA, true = SPACE
     private final BitSet separators;


     public SoftReference<WordSpan[]> keywords;

     public DocumentSentence(BitSet separators,
                             String[] wordsLowerCase,
-                            String[] posTags,
+                            long[] posTags,
                             String[] stemmedWords,
                             EnumSet<HtmlTag> htmlTags,
                             BitSet isCapitalized,
-                            BitSet isAllCaps
+                            BitSet isAllCaps,
+                            BitSet includeInStemming
     )
     {
         this.separators = separators;
@@ -56,6 +57,7 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>
         this.htmlTags = htmlTags;
         this.isCapitalized = isCapitalized;
         this.isAllCaps = isAllCaps;
+        this.includeInStemming = includeInStemming;

         isStopWord = new BitSet(wordsLowerCase.length);

@@ -87,6 +89,16 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>
         return !separators.get(i);
     }

+    /** Returns the position of the next comma in the sentence,
+     * or sentence.length() if no remaining commas exist.
+     */
+    public int nextCommaPos(int pos) {
+        int ret = separators.nextClearBit(pos);
+        if (ret < 0)
+            return separators.length();
+        return ret;
+    }
+
     public String constructWordFromSpan(WordSpan span) {
         if (span.size() == 1) {
             return trimJunkCharacters(wordsLowerCase[span.start]);
@@ -153,10 +165,7 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>
     }

     private boolean includeInStemming(int i) {
-        if (posTags[i].equals("IN") || posTags[i].equals("TO") || posTags[i].equals("CC") || posTags[i].equals("DT")) {
-            return false;
-        }
-        return true;
+        return includeInStemming.get(i);
     }

     @Override
@@ -199,7 +208,7 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>
     }

     public String wordLowerCase() { return wordsLowerCase[pos]; }
-    public String posTag() { return posTags[pos]; }
+    public long posTag() { return posTags[pos]; }
     public String stemmed() { return stemmedWords[pos]; }
     public boolean isStopWord() { return DocumentSentence.this.isStopWord(pos); }

@@ -0,0 +1,145 @@
package nu.marginalia.language.model;

import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.encoding.UnicodeNormalization;
import nu.marginalia.language.keywords.KeywordHasher;
import nu.marginalia.language.pos.PosPattern;
import nu.marginalia.language.pos.PosPatternCategory;
import nu.marginalia.language.pos.PosTagger;
import nu.marginalia.language.stemming.Stemmer;

import javax.annotation.Nullable;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

public final class LanguageDefinition {
    private final String isoCode;
    private final String name;
    private final Stemmer stemmer;
    private final UnicodeNormalization unicodeNormalization;
    private final KeywordHasher keywordHasher;

    @Nullable
    private final PosTagger posTagger;

    private final Map<PosPatternCategory, List<PosPattern>> posPatterns;

    public LanguageDefinition(String isoCode,
                              String name,
                              Stemmer stemmer,
                              UnicodeNormalization unicodeNormalization,
                              KeywordHasher keywordHasher,
                              @Nullable PosTagger posTagger,
                              Map<PosPatternCategory, List<PosPattern>> posPatterns) {
        this.isoCode = isoCode;
        this.name = name;
        this.stemmer = stemmer;
        this.unicodeNormalization = unicodeNormalization;
        this.keywordHasher = keywordHasher;
        this.posTagger = posTagger;
        this.posPatterns = posPatterns;
    }

    public String isoCode() {
        return isoCode;
    }

    public String displayName() {
        return name;
    }

    public Stemmer stemmer() {
        return stemmer;
    }

    @Nullable
    public PosTagger posTagger() {
        return posTagger;
    }

    public KeywordHasher keywordHasher() {
        return keywordHasher;
    }

    public UnicodeNormalization unicodeNormalization() {
        return unicodeNormalization;
    }

    public long[] posTagSentence(String[] words) {
        if (posTagger == null) return new long[0];
        return posTagger.tagSentence(words);
    }

    public boolean hasPosParsing() {
        return posTagger != null;
    }

    public List<PosPattern> getPosPatterns(PosPatternCategory category) {
        return posPatterns.getOrDefault(category, List.of());
    }

    public String decodePosTagName(long tagName) {
        if (hasPosParsing())
            return posTagger.decodeTagName(tagName);
        return "";
    }

    public List<WordSpan> matchGrammarPattern(DocumentSentence sentence, PosPatternCategory category) {
        List<WordSpan> spans = new ArrayList<>(2 * sentence.length());

        for (PosPattern pattern : getPosPatterns(category)) {
            pattern.matchSentence(sentence, spans);
        }

        return spans;
    }

    public boolean matchGrammarPattern(DocumentSentence sentence, PosPatternCategory category, int pos) {
        for (var pattern : getPosPatterns(category)) {
            if (pattern.isMatch(sentence, pos))
                return true;
        }
        return false;
    }

    public boolean matchGrammarPattern(DocumentSentence sentence, PosPatternCategory category, WordSpan span) {
        for (var pattern : getPosPatterns(category)) {
            if (pattern.size() != span.size())
                continue;

            if (pattern.isMatch(sentence, span.start))
                return true;
        }
        return false;
    }

    public List<WordSpan> getWordsFromSentence(DocumentSentence sentence) {
        List<WordSpan> spans = new ArrayList<>();

        for (int k = 0; k < 4; k++) {
            for (int i = k; i < sentence.length(); i++) {
                var w = new WordSpan(i - k, i + 1);

                if (isViableSpanForWord(sentence, w)) {
                    spans.add(w);
                }
            }
        }

        return spans;
    }

    private boolean isViableSpanForWord(DocumentSentence sentence, WordSpan w) {

        if (sentence.nextCommaPos(w.start) < w.end - 1)
            return false;

        if (!matchGrammarPattern(sentence, PosPatternCategory.TITLE, w))
            return false;

        String word = sentence.constructWordFromSpan(w);
        return !word.isBlank() && WordPatterns.isNotJunkWord(word);
    }

}
@@ -0,0 +1,4 @@
package nu.marginalia.language.model;

public class UnsupportedLanguageException extends Exception {
}
@@ -7,6 +7,8 @@ public class WordSpan implements Comparable<WordSpan> {
     public final int end;

     public WordSpan(int start, int end) {
+        assert end >= start;
+
         this.start = start;
         this.end = end;
     }
@@ -0,0 +1,236 @@
package nu.marginalia.language.pos;

import it.unimi.dsi.fastutil.longs.LongArrayList;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordSpan;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.BitSet;
import java.util.Collections;
import java.util.List;

public class PosPattern {
    public final LongArrayList pattern = new LongArrayList();
    private static final Logger logger = LoggerFactory.getLogger(PosPattern.class);

    public long[] toArray() {
        return pattern.toLongArray();
    }

    public int size() {
        return pattern.size();
    }

    public PosPattern(PosTagger posTagger, String expression) {
        for (List<String> variants : PosTagPatternParser.parse(posTagger, expression)) {
            pattern.add(posTagger.encodeTagNames(variants));
        }

        if (pattern.isEmpty()) {
            throw new IllegalArgumentException("Zero length patterns are not allowed");
        }
    }

    public int matchSentence(DocumentSentence sentence, List<WordSpan> ret) {
        long first = pattern.getLong(0);
        int cnt = 0;

        // Fast case for 1-length patterns
        if (pattern.size() == 1) {
            for (int i = 0; i < sentence.length(); i++) {
                if (0L == (sentence.posTags[i] & first)) continue;
                ret.add(new WordSpan(i, i + 1));
                cnt++;
            }

            return cnt;
        }

        pattern:
        for (int i = 0; i <= sentence.length() - pattern.size(); i++) {

            // Start by matching against the beginning of the pattern
            // as a fast path
            if (0L == (sentence.posTags[i] & first)) continue;

            int j;
            for (j = 1; j < pattern.size(); j++) {
                if (0L == (sentence.posTags[i + j] & pattern.getLong(j)))
                    continue pattern;
            }

            // Ensure no commas occur inside the matched span, except possibly at its last word
            int nextCommaPos = sentence.nextCommaPos(i);
            if (nextCommaPos < i + pattern.size() - 1) {
                // note that the i++ in the for loop also applies here, so the next
                // iteration begins positioned just past the comma
                i = nextCommaPos;
                continue;
            }

            // Finally add the span
            ret.add(new WordSpan(i, i + j));
            cnt++;
        }

        return cnt;
    }

    public boolean isMatch(DocumentSentence sentence, int pos) {
        if (pos + pattern.size() > sentence.length()) {
            return false;
        }

        long first = pattern.getLong(0);
        if (0 == (sentence.posTags[pos] & first)) return false;
        else if (pattern.size() == 1) return true;

        int nextCommaPos = sentence.nextCommaPos(pos);
        if (nextCommaPos < pos + pattern.size() - 1) {
            return false;
        }

        for (int j = 1; j < pattern.size(); j++) {
            if (0L == (sentence.posTags[pos + j] & pattern.getLong(j)))
                return false;
        }
        return true;
    }

    /** Return a bit set for every position where this pattern matches the tag sequence provided */
    public BitSet matchTagPattern(long[] tags) {
        BitSet bs = new BitSet(tags.length);

        // Fast case for length = 1
        if (pattern.size() == 1) {
            long patternVal = pattern.getLong(0);

            for (int i = 0; i < tags.length; i++) {
                bs.set(i, (patternVal & tags[i]) != 0L);
            }

            return bs;
        }

        pattern:
        for (int i = 0; i <= tags.length - pattern.size(); i++) {
            int j;

            for (j = 0; j < pattern.size(); j++) {
                if (0L == (tags[i + j] & pattern.getLong(j)))
                    continue pattern;
            }

            bs.set(i);
        }

        return bs;
    }
}

class PosTagPatternParser {
    private boolean inverted;
    private boolean inParen;

    private final List<List<String>> variants = new ArrayList<>();
    private final List<String> allTags;

    public PosTagPatternParser(PosTagger posTagger) {
        allTags = Collections.unmodifiableList(posTagger.tags());
    }

    public static List<List<String>> parse(PosTagger posTagger, String expression) {

        PosTagPatternParser patternBuilder = new PosTagPatternParser(posTagger);

        for (String token : tokenize(expression)) {
            switch (token) {
                case "!" -> patternBuilder.invert();
                case "(" -> patternBuilder.parenOpen();
                case ")" -> patternBuilder.parenClose();
                default -> patternBuilder.addToken(token);
            }
        }

        return patternBuilder.variants;
    }

    private static List<String> tokenize(String expression) {
        List<String> tokens = new ArrayList<>();
        int pos = 0;

        while (pos < expression.length()) {
            char c = expression.charAt(pos);
            if ("()!".indexOf(c) >= 0) {
                tokens.add(expression.substring(pos, pos + 1));
                pos++;
            }
            else if (Character.isSpaceChar(c)) {
                pos++;
            }
            else {
                int end = pos + 1;
                while (end < expression.length()) {
                    int ce = expression.charAt(end);
                    if ("() ".indexOf(ce) >= 0) {
                        break;
                    }
                    else {
                        end++;
                    }
                }
                tokens.add(expression.substring(pos, end));
                pos = end;
            }
        }

        return tokens;
    }

    public void invert() {
        inverted = true;
    }

    public void parenOpen() {
        inParen = true;
        beginToken();
    }

    public void parenClose() {
        inParen = false;
        inverted = false;
    }

    private void beginToken() {
        variants.add(new ArrayList<>());
        if (inverted)
            variants.getLast().addAll(allTags);
    }

    public void addToken(String token) {
        if (!inParen) beginToken();

        List<String> tokensExpanded;
        if (token.endsWith("*")) {
            String prefix = token.substring(0, token.length() - 1);
            tokensExpanded = allTags.stream().filter(str -> prefix.isEmpty() || str.startsWith(prefix)).toList();
        }
        else {
            tokensExpanded = List.of(token);
        }

        if (inverted) {
            variants.getLast().removeAll(tokensExpanded);
        }
        else {
            variants.getLast().addAll(tokensExpanded);
        }

        if (!inParen) {
            inverted = false;
        }
    }
}
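A worked sketch of the pattern syntax, using the list-based PosTagger test constructor (defined in the PosTagger file below); the toy tag set and inputs are made up for illustration:

// Each distinct tag is encoded as one bit, so a pattern element is a bitmask of allowed tags.
PosTagger tagger = new PosTagger("en", List.of("NN", "NNS", "VB", "VBZ", "JJ"));

// "NN* VB*" = a noun-like tag immediately followed by a verb-like tag;
// "!(VB*)" would instead match any single tag that is not verb-like.
PosPattern pattern = new PosPattern(tagger, "NN* VB*");

long[] tags = {
        tagger.encodeTagName("NN"),
        tagger.encodeTagName("VBZ"),
        tagger.encodeTagName("JJ"),
};
BitSet hits = pattern.matchTagPattern(tags);  // only bit 0 is set: "NN VBZ" starts at index 0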
@@ -0,0 +1,9 @@
package nu.marginalia.language.pos;

public enum PosPatternCategory {
    NAME,
    NOUN,
    KEYWORD,
    TITLE,
    SUBJECT_SUFFIX
}
@@ -0,0 +1,130 @@
|
||||
package nu.marginalia.language.pos;
|
||||
|
||||
|
||||
import com.github.datquocnguyen.RDRPOSTagger;
|
||||
import it.unimi.dsi.fastutil.ints.IntArrayList;
|
||||
import it.unimi.dsi.fastutil.ints.IntList;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.*;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class PosTagger {
|
||||
private final RDRPOSTagger rdrposTagger;
|
||||
public final Map<String, Integer> tagDict;
|
||||
public final List<String> tagNames;
|
||||
private final String isoCode;
|
||||
|
||||
public PosTagger(String isoCode, Path dictFilePath, Path rdrFilePath) throws IOException {
|
||||
this.isoCode = isoCode;
|
||||
rdrposTagger = new RDRPOSTagger(dictFilePath, rdrFilePath);
|
||||
|
||||
List<String> tagNames = new ArrayList<>();
|
||||
HashMap<String, Integer> tags = new HashMap<>();
|
||||
try (var linesStream = Files.lines(dictFilePath)) {
|
||||
linesStream.map(line -> StringUtils.split(line, " ", 2))
|
||||
.filter(line -> line.length==2)
|
||||
.map(line -> line[1])
|
||||
.distinct()
|
||||
.forEach(tag -> {
|
||||
tags.putIfAbsent(tag, tagNames.size());
|
||||
tagNames.add(tag);
|
||||
});
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
this.tagDict = Collections.unmodifiableMap(tags);
|
||||
this.tagNames = Collections.unmodifiableList(tagNames);
|
||||
}
|
||||
|
||||
/** Alternate constructor for tests */
|
||||
public PosTagger(String isoCode, List<String> tags) {
|
||||
this.isoCode = isoCode;
|
||||
this.tagNames = tags.stream().distinct().toList();
|
||||
this.tagDict = tags.stream().distinct().collect(Collectors.toMap(Function.identity(), tagNames::indexOf, (a,b)->a));
|
||||
this.rdrposTagger = null;
|
||||
}
|
||||
|
||||
public long[] tagSentence(String[] words) {
|
||||
String[] tags;
|
||||
|
||||
// Unclear if this is necessary, but the library does have a different function for tagging English
|
||||
if ("en".equalsIgnoreCase(isoCode)) {
|
||||
tags = rdrposTagger.tagsForEnSentence(words);
|
||||
}
|
||||
else {
|
||||
tags = rdrposTagger.tagSentence(words);
|
||||
}
|
||||
|
||||
// Encode the tags as a bit mask. These will just have one (or zero) bits set
|
||||
// but will match against more complex masks
|
||||
|
||||
long[] encodedTags = new long[tags.length];
|
||||
for (int i = 0; i < encodedTags.length; i++) {
|
||||
encodedTags[i] = encodeTagName(tags[i]);
|
||||
}
|
||||
|
||||
return encodedTags;
|
||||
}
|
||||
|
||||
public long encodeTagName(String tagName) {
|
||||
Integer tag = tagDict.get(tagName);
|
||||
if (tag == null) {
|
||||
return 0L;
|
||||
}
|
||||
return 1L << tag;
|
||||
}
|
||||
|
||||
public long encodeTagNames(List<String> tagNames) {
|
||||
long ret = 0;
|
||||
for (String tagName : tagNames) {
|
||||
ret |= encodeTagName(tagName);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
public String decodeTagName(long encodedTag) {
|
||||
if (encodedTag == 0)
|
||||
return "?";
|
||||
return tagName(Long.numberOfTrailingZeros(encodedTag));
|
||||
}
|
||||
|
||||
public String tagName(int tagId) {
|
||||
if (tagId < 0 || tagId >= tagNames.size())
|
||||
return "?";
|
||||
return tagNames.get(tagId);
|
||||
}
|
||||
|
||||
public OptionalInt tagId(String tagName) {
|
||||
Integer id = tagDict.get(tagName);
|
||||
if (id == null)
|
||||
return OptionalInt.empty();
|
||||
return OptionalInt.of(id);
|
||||
}
|
||||
|
||||
public List<String> tags() {
|
||||
var ret = new ArrayList<>(tagDict.keySet());
|
||||
ret.sort(Comparator.naturalOrder());
|
||||
return ret;
|
||||
}
|
||||
|
||||
public IntList tagIdsForPrefix(String tagNamePrefix) {
|
||||
IntArrayList ret = new IntArrayList();
|
||||
tagDict.entrySet().stream()
|
||||
.filter(tag -> tag.getKey().startsWith(tagNamePrefix))
|
||||
.mapToInt(Map.Entry::getValue)
|
||||
.forEach(ret::add);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "PosTaggingData{ tags=" + tagDict + '}';
|
||||
}
|
||||
}
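Because each tag occupies a single bit in a `long`, set-membership tests against composite masks reduce to a bitwise AND. A minimal sketch using the test constructor above; the three-tag inventory is invented for the example:

```java
import java.util.List;

public class PosTaggerDemo {
    public static void main(String[] args) {
        // Tiny made-up tag set: NN -> bit 0, VB -> bit 1, JJ -> bit 2
        var tagger = new PosTagger("en", List.of("NN", "VB", "JJ"));

        long nn   = tagger.encodeTagName("NN");                  // 0b001 = 1
        long mask = tagger.encodeTagNames(List.of("NN", "JJ"));  // 0b101 = 5

        // A single-bit word tag matches a composite mask via AND
        System.out.println((nn & mask) != 0);              // true
        System.out.println(tagger.decodeTagName(1L << 2)); // "JJ"
        System.out.println(tagger.encodeTagName("XYZ"));   // 0 (unknown tag)
    }
}
```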

@@ -1,17 +1,20 @@
package nu.marginalia.language.sentence;

import com.github.datquocnguyen.RDRPOSTagger;
import com.google.inject.Inject;
import nu.marginalia.LanguageModels;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.model.UnsupportedLanguageException;
import nu.marginalia.language.pos.PosPattern;
import nu.marginalia.language.sentence.tag.HtmlStringTagger;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.language.sentence.tag.HtmlTaggedString;
import nu.marginalia.language.stemming.Stemmer;
import nu.marginalia.segmentation.NgramLexicon;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.stemmer.PorterStemmer;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;

@@ -29,12 +32,11 @@ import java.util.*;
 */
public class SentenceExtractor {

    private final LanguageConfiguration languageConfiguration;
    private SentenceDetectorME sentenceDetector;
    private static RDRPOSTagger rdrposTagger;

    private static NgramLexicon ngramLexicon = null;

    private final PorterStemmer porterStemmer = new PorterStemmer();
    private static final Logger logger = LoggerFactory.getLogger(SentenceExtractor.class);

    private static final SentencePreCleaner sentencePrecleaner = new SentencePreCleaner();

@@ -46,8 +48,10 @@ public class SentenceExtractor {
    static final int MAX_SENTENCE_COUNT = 1000;

    @Inject
    public SentenceExtractor(LanguageModels models)
    public SentenceExtractor(LanguageConfiguration languageConfiguration, LanguageModels models)
    {
        this.languageConfiguration = languageConfiguration;

        try (InputStream modelIn = new FileInputStream(models.openNLPSentenceDetectionData.toFile())) {
            var sentenceModel = new SentenceModel(modelIn);
            sentenceDetector = new SentenceDetectorME(sentenceModel);

@@ -61,21 +65,14 @@ public class SentenceExtractor {
        if (ngramLexicon == null) {
            ngramLexicon = new NgramLexicon(models);
        }

        if (rdrposTagger == null) {
            try {
                rdrposTagger = new RDRPOSTagger(models.posDict, models.posRules);
            } catch (Exception ex) {
                throw new IllegalStateException(ex);
            }
        }
    }

    }

    public DocumentLanguageData extractSentences(Document doc) {
    public DocumentLanguageData extractSentences(Document doc) throws UnsupportedLanguageException {
        var language = languageConfiguration.identifyLanguage(doc).orElseThrow(UnsupportedLanguageException::new);

        final List<DocumentSentence> textSentences = new ArrayList<>();

        final List<HtmlTaggedString> taggedStrings = HtmlStringTagger.tagDocumentStrings(doc);

        final int totalTextLength = taggedStrings.stream().mapToInt(HtmlTaggedString::length).sum();
@@ -85,7 +82,7 @@ public class SentenceExtractor {
            String text = taggedString.string();

            textSentences.addAll(
                    extractSentencesFromString(text, taggedString.tags())
                    extractSentencesFromString(language, text, taggedString.tags())
            );

            if (documentText.isEmpty()) {
@@ -96,32 +93,62 @@ public class SentenceExtractor {
            }
        }

        return new DocumentLanguageData(textSentences, documentText.toString());
        return new DocumentLanguageData(language, textSentences, documentText.toString());
    }

    public DocumentLanguageData extractSentences(String text, String title) {
        var textSentences = extractSentencesFromString(text, EnumSet.noneOf(HtmlTag.class));
        var titleSentences = extractSentencesFromString(title.toLowerCase(), EnumSet.of(HtmlTag.TITLE));
        LanguageDefinition language = languageConfiguration.identifyLanguage(text, "en")
                .orElseThrow(() -> new RuntimeException("Language not found for default isoCode 'en'"));

        var textSentences = extractSentencesFromString(language, text, EnumSet.noneOf(HtmlTag.class));
        var titleSentences = extractSentencesFromString(language, title.toLowerCase(), EnumSet.of(HtmlTag.TITLE));

        List<DocumentSentence> combined = new ArrayList<>(textSentences.size() + titleSentences.size());
        combined.addAll(titleSentences);
        combined.addAll(textSentences);

        return new DocumentLanguageData(
                language,
                combined,
                text);
    }

    public DocumentSentence extractSentence(String text, EnumSet<HtmlTag> htmlTags) {
        var wordsAndSeps = SentenceSegmentSplitter.splitSegment(text, MAX_SENTENCE_LENGTH);

    public DocumentLanguageData extractSentences(String text) {
        LanguageDefinition language = languageConfiguration.identifyLanguage(text, "en")
                .orElseThrow(() -> new RuntimeException("Language not found for default isoCode 'en'"));

        var sentences = extractSentencesFromString(language, text, EnumSet.noneOf(HtmlTag.class));

        return new DocumentLanguageData(language, sentences, text);
    }


    public DocumentSentence extractSentence(LanguageDefinition language,
                                            String text,
                                            EnumSet<HtmlTag> htmlTags) {
        final Stemmer stemmer = language.stemmer();

        var wordsAndSeps = new SentenceSegmentSplitter(language).splitSegment(text, MAX_SENTENCE_LENGTH);

        String[] words = wordsAndSeps.words();
        BitSet seps = wordsAndSeps.separators();
        String[] lc = new String[words.length];
        String[] stemmed = new String[words.length];
        long[] posTags = language.posTagSentence(words);

        BitSet isCapitalized = new BitSet(words.length);
        BitSet isAllCaps = new BitSet(words.length);
        BitSet includeInStemming;

        PosPattern inclusionPattern = stemmer.inclusionPatten();
        if (inclusionPattern == null) {
            includeInStemming = new BitSet(lc.length);
            includeInStemming.set(0, lc.length);
        }
        else {
            includeInStemming = inclusionPattern.matchTagPattern(posTags);
        }

        for (int i = 0; i < words.length; i++) {
            lc[i] = stripPossessive(words[i].toLowerCase());
@@ -134,7 +161,7 @@ public class SentenceExtractor {
            }

            try {
                stemmed[i] = porterStemmer.stem(lc[i]);
                stemmed[i] = stemmer.stem(lc[i]);
            }
            catch (Exception ex) {
                stemmed[i] = "NN"; // ???
@@ -144,16 +171,18 @@ public class SentenceExtractor {
        return new DocumentSentence(
                seps,
                lc,
                rdrposTagger.tagsForEnSentence(words),
                posTags,
                stemmed,
                htmlTags,
                isCapitalized,
                isAllCaps
                isAllCaps,
                includeInStemming
        );
    }

    public List<DocumentSentence> extractSentencesFromString(String text, EnumSet<HtmlTag> htmlTags) {
        String[] sentences;
    public List<DocumentSentence> extractSentencesFromString(LanguageDefinition language, String text, EnumSet<HtmlTag> htmlTags) {
        final Stemmer stemmer = language.stemmer();


        // Safety net against malformed data DOS attacks,
        // found 5+ MB <p>-tags in the wild that just break
@@ -167,7 +196,7 @@ public class SentenceExtractor {
        text = normalizeSpaces(text);

        // Split into sentences

        String[] sentences;
        try {
            sentences = sentenceDetector.sentDetect(text);
        }
@@ -189,22 +218,34 @@ public class SentenceExtractor {

        List<DocumentSentence> ret = new ArrayList<>(sentences.length);

        SentenceSegmentSplitter sentenceSegmentSplitter = new SentenceSegmentSplitter(language);

        if (isNaturalLanguage) {
            // Natural language text; do POS tagging and stemming

            for (String sent : sentences) {
                var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
                var wordsAndSeps = sentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
                var tokens = wordsAndSeps.words();
                var separators = wordsAndSeps.separators();
                var posTags = rdrposTagger.tagsForEnSentence(tokens);
                var posTags = language.posTagSentence(tokens);
                var tokensLc = new String[tokens.length];
                var stemmed = new String[tokens.length];

                BitSet isCapitalized = new BitSet(tokens.length);
                BitSet isAllCaps = new BitSet(tokens.length);
                BitSet includeInStemming;

                PosPattern inclusionPattern = stemmer.inclusionPatten();
                if (inclusionPattern == null) {
                    includeInStemming = new BitSet(tokens.length);
                    includeInStemming.set(0, tokens.length);
                }
                else {
                    includeInStemming = inclusionPattern.matchTagPattern(posTags);
                }

                for (int i = 0; i < tokens.length; i++) {
                    if (tokens[i].length() > 0 && Character.isUpperCase(tokens[i].charAt(0))) {
                    if (!tokens[i].isEmpty() && Character.isUpperCase(tokens[i].charAt(0))) {
                        isCapitalized.set(i);
                    }
                    if (StringUtils.isAllUpperCase(tokens[i])) {
@@ -221,13 +262,13 @@ public class SentenceExtractor {
                    }

                    try {
                        stemmed[i] = porterStemmer.stem(tokens[i]);
                        stemmed[i] = stemmer.stem(tokens[i]);
                    }
                    catch (Exception ex) {
                        stemmed[i] = "NN"; // ???
                    }
                }
                ret.add(new DocumentSentence(separators, tokensLc, posTags, stemmed, htmlTags, isCapitalized, isAllCaps));
                ret.add(new DocumentSentence(separators, tokensLc, posTags, stemmed, htmlTags, isCapitalized, isAllCaps, includeInStemming));
            }
        }
        else {
@@ -235,21 +276,22 @@ public class SentenceExtractor {
            // as this is not likely to be useful

            for (String sent : sentences) {
                var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
                var wordsAndSeps = sentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
                var tokens = wordsAndSeps.words();
                var separators = wordsAndSeps.separators();
                var posTags = new String[tokens.length];
                Arrays.fill(posTags, "X"); // Placeholder POS tag
                var posTags = new long[tokens.length];
                var tokensLc = new String[tokens.length];
                var stemmed = new String[tokens.length];

                BitSet isCapitalized = new BitSet(tokens.length);
                BitSet isAllCaps = new BitSet(tokens.length);
                BitSet includeInStemming = new BitSet(tokens.length);
                includeInStemming.set(0, tokens.length);

                for (int i = 0; i < tokensLc.length; i++) {
                    var originalVal = tokens[i];

                    if (tokens[i].length() > 0 && Character.isUpperCase(tokens[i].charAt(0))) {
                    if (!tokens[i].isEmpty() && Character.isUpperCase(tokens[i].charAt(0))) {
                        isCapitalized.set(i);
                    }
                    if (StringUtils.isAllUpperCase(tokens[i])) {
@@ -264,7 +306,7 @@ public class SentenceExtractor {
                    stemmed[i] = tokensLc[i]; // we don't stem non-language words
                }

                ret.add(new DocumentSentence(separators, tokensLc, posTags, stemmed, htmlTags, isAllCaps, isCapitalized));
                ret.add(new DocumentSentence(separators, tokensLc, posTags, stemmed, htmlTags, isAllCaps, isCapitalized, includeInStemming));
            }

        }

@@ -2,7 +2,8 @@ package nu.marginalia.language.sentence;

import com.google.common.base.CharMatcher;
import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.language.encoding.UnicodeNormalization;
import nu.marginalia.language.model.LanguageDefinition;

import java.util.ArrayList;
import java.util.BitSet;
@@ -13,10 +14,11 @@ import static nu.marginalia.language.WordPatterns.MAX_WORD_LENGTH;

public class SentenceSegmentSplitter {

    private final UnicodeNormalization unicodeNormalization;

    public record SeparatedSentence(String[] words, BitSet separators) { }

    private static final CharMatcher noiseCharacterMatcher = CharMatcher.anyOf("/*-");

    private static final Pattern wordBreakPattern;

    static {
@@ -31,13 +33,17 @@ public class SentenceSegmentSplitter {
        }
    }

    SentenceSegmentSplitter(LanguageDefinition languageDefinition) {
        this.unicodeNormalization = languageDefinition.unicodeNormalization();
    }

    /** Split a sentence into words and separators.
     *
     * @param segment The sentence to split
     * @return A list of words and separators
     */
    public static SeparatedSentence splitSegment(String segment, int maxLength) {
        String flatSegment = AsciiFlattener.flattenUnicode(segment);
    public SeparatedSentence splitSegment(String segment, int maxLength) {
        String flatSegment = unicodeNormalization.flattenUnicode(segment);

        var matcher = wordBreakPattern.matcher(flatSegment);

@@ -3,14 +3,15 @@ package nu.marginalia.language.sentence;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.LanguageModels;
import nu.marginalia.language.config.LanguageConfiguration;

@Singleton
public class ThreadLocalSentenceExtractorProvider {
    private final ThreadLocal<SentenceExtractor> sentenceExtractorThreadLocal;

    @Inject
    public ThreadLocalSentenceExtractorProvider(LanguageModels languageModels) {
        sentenceExtractorThreadLocal = ThreadLocal.withInitial(() -> new SentenceExtractor(languageModels));
    public ThreadLocalSentenceExtractorProvider(LanguageConfiguration languageConfiguration, LanguageModels languageModels) {
        sentenceExtractorThreadLocal = ThreadLocal.withInitial(() -> new SentenceExtractor(languageConfiguration, languageModels));
    }

    public SentenceExtractor get() {

@@ -0,0 +1,68 @@
package nu.marginalia.language.stemming;

import nu.marginalia.language.pos.PosPattern;
import opennlp.tools.stemmer.snowball.SnowballStemmer;

import javax.annotation.Nullable;

public sealed interface Stemmer {
    String stem(String input);
    @Nullable PosPattern inclusionPatten();

    final class Porter implements Stemmer {
        private static final ca.rmen.porterstemmer.PorterStemmer porterStemmerImpl = new ca.rmen.porterstemmer.PorterStemmer();
        @Nullable
        private final PosPattern inclusionPattern;

        public Porter(@Nullable PosPattern inclusionPattern) {
            this.inclusionPattern = inclusionPattern;
        }

        @Nullable
        public PosPattern inclusionPatten() {
            return inclusionPattern;
        }

        @Override
        public String stem(String input) {
            return porterStemmerImpl.stemWord(input);
        }
    }

    final class Snowball implements Stemmer {
        private final SnowballStemmer snowballStemmer;
        @Nullable
        private final PosPattern inclusionPattern;

        public Snowball(String algorithmName, @Nullable PosPattern inclusionPattern) {
            this.inclusionPattern = inclusionPattern;

            SnowballStemmer.ALGORITHM algorithm = SnowballStemmer.ALGORITHM.valueOf(algorithmName.toUpperCase());
            snowballStemmer = new SnowballStemmer(algorithm);
        }

        @Nullable
        public PosPattern inclusionPatten() {
            return inclusionPattern;
        }

        @Override
        public String stem(String input) {
            // Snowball impl declares return value as CharSequence,
            // but in practice always returns a String
            return (String) snowballStemmer.stem(input);
        }
    }

    final class NoOpStemmer implements Stemmer {

        @Nullable
        public PosPattern inclusionPatten() {
            return null;
        }

        @Override
        public String stem(String input) {
            return input;
        }
    }
}
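A quick sketch of how the three variants behave, assuming the same package as the interface above (inclusion patterns omitted; the Snowball variant name is what the `variant` attribute in the language XML supplies, e.g. `SWEDISH`):

```java
public class StemmerDemo {
    public static void main(String[] args) {
        Stemmer porter  = new Stemmer.Porter(null);      // null pattern: stem every token
        Stemmer swedish = new Stemmer.Snowball("SWEDISH", null);
        Stemmer noop    = new Stemmer.NoOpStemmer();

        System.out.println(porter.stem("running"));  // "run"
        System.out.println(swedish.stem("bilarna")); // e.g. "bil" under the Swedish Snowball rules
        System.out.println(noop.stem("running"));    // "running", unchanged
    }
}
```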

@@ -1,7 +1,6 @@
package nu.marginalia.segmentation;

import it.unimi.dsi.fastutil.longs.*;
import nu.marginalia.util.SimpleBlockingThreadPool;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.openzim.ZIMTypes.ZIMFile;
@@ -11,7 +10,7 @@ import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.ForkJoinPool;

public class NgramExtractorMain {
    public static void main(String... args) throws IOException, InterruptedException {
@@ -112,50 +111,45 @@ public class NgramExtractorMain {

        var orderedHasher = HasherGroup.ordered();

        var pool = new SimpleBlockingThreadPool("ngram-extractor",
                Math.clamp(2, Runtime.getRuntime().availableProcessors(), 32),
                32
        );
        try (var pool = new ForkJoinPool(Math.clamp(2, Runtime.getRuntime().availableProcessors(), 32))) {

        reader.forEachTitles((title) -> {
            pool.submitQuietly(() -> {
                LongArrayList orderedHashesTitle = new LongArrayList();
            reader.forEachTitles((title) -> {
                pool.submit(() -> {
                    LongArrayList orderedHashesTitle = new LongArrayList();

                String normalizedTitle = title.replace('_', ' ');
                    String normalizedTitle = title.replace('_', ' ');

                for (var sent : getNgramTitleTerms(normalizedTitle)) {
                    String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
                    orderedHashesTitle.add(orderedHasher.rollingHash(terms));
                }
                synchronized (lexicon) {
                    for (var hash : orderedHashesTitle) {
                        lexicon.incOrderedTitle(hash);
                    for (var sent : getNgramTitleTerms(normalizedTitle)) {
                        String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
                        orderedHashesTitle.add(orderedHasher.rollingHash(terms));
                    }
                }
                    synchronized (lexicon) {
                        for (var hash : orderedHashesTitle) {
                            lexicon.incOrderedTitle(hash);
                        }
                    }
                });

            });

        });
            reader.forEachArticles((title, body) -> {
                pool.submit(() -> {
                    LongArrayList orderedHashesBody = new LongArrayList();

        reader.forEachArticles((title, body) -> {
            pool.submitQuietly(() -> {
                LongArrayList orderedHashesBody = new LongArrayList();

                for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
                    String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
                    orderedHashesBody.add(orderedHasher.rollingHash(terms));
                }

                synchronized (lexicon) {
                    for (var hash : orderedHashesBody) {
                        lexicon.incOrderedBody(hash);
                    for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
                        String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
                        orderedHashesBody.add(orderedHasher.rollingHash(terms));
                    }
                }
            });

        }, p -> true);
                    synchronized (lexicon) {
                        for (var hash : orderedHashesBody) {
                            lexicon.incOrderedBody(hash);
                        }
                    }
                });

        pool.shutDown();
        pool.awaitTermination(10, TimeUnit.DAYS);
            }, p -> true);
        }

        lexicon.saveCounts(countsOutputFile);
    }
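The diff above swaps the custom SimpleBlockingThreadPool for a ForkJoinPool held in a try-with-resources block. Since Java 19, ExecutorService implements AutoCloseable and its close() waits for submitted tasks to finish, which replaces the explicit shutDown/awaitTermination pair. A minimal sketch of the pattern (the tasks are placeholders):

```java
import java.util.concurrent.ForkJoinPool;

public class PoolDemo {
    public static void main(String[] args) {
        // close() at the end of the try block implicitly shuts the pool down
        // and awaits completion of all submitted tasks (ExecutorService.close(), Java 19+)
        try (var pool = new ForkJoinPool(Math.clamp(2, Runtime.getRuntime().availableProcessors(), 32))) {
            for (int i = 0; i < 8; i++) {
                final int n = i;
                pool.submit(() -> System.out.println("task " + n));
            }
        } // no explicit shutdown/awaitTermination needed
    }
}
```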

@@ -5,16 +5,19 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import nu.marginalia.LanguageModels;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

/** Dictionary with term frequency information for (stemmed) words.
 *
@@ -38,15 +41,23 @@ public class TermFrequencyDict {
    }

    private static Long2IntOpenHashMap load(Path file) throws IOException {
        try (LongArray array = LongArrayFactory.mmapForReadingConfined(file)) {
        try (Arena arena = Arena.ofConfined();
             FileChannel fileChannel = (FileChannel) Files.newByteChannel(file, StandardOpenOption.READ)) {

            int size = (int) Files.size(file) / 16;
            long fileSizeBytes = Files.size(file);
            MemorySegment mappedFile = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileSizeBytes, arena);

            int size = (int) fileSizeBytes / 16;
            var ret = new Long2IntOpenHashMap(size, 0.5f);

            ret.defaultReturnValue(0);

            for (int i = 0; i < size; i++) {
                ret.put(array.get(2 * i), (int) array.get(2 * i + 1));

                long key = mappedFile.getAtIndex(ValueLayout.JAVA_LONG, 2 * i);
                long val = mappedFile.getAtIndex(ValueLayout.JAVA_LONG, 2 * i + 1);

                ret.put(key, (int) val);
            }

            return ret;
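The on-disk layout implied by this loader is flat pairs of native-order longs: (term hash, count), 16 bytes per entry, since `ValueLayout.JAVA_LONG` reads in the platform's byte order. A hedged sketch of a writer for that format; the writer isn't part of this diff, and the method is illustrative only:

```java
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

public class TermFreqWriterSketch {
    static void write(Long2IntOpenHashMap counts, Path file) throws IOException {
        try (FileChannel ch = FileChannel.open(file,
                StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING)) {
            // Native order to match the loader's ValueLayout.JAVA_LONG reads
            ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.nativeOrder());
            for (var e : counts.long2IntEntrySet()) {
                buf.clear();
                buf.putLong(e.getLongKey());   // term hash
                buf.putLong(e.getIntValue());  // count, widened into a long slot
                buf.flip();
                while (buf.hasRemaining()) ch.write(buf);
            }
        }
    }
}
```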

code/functions/language-processing/readme.md

@@ -0,0 +1,31 @@
# Language Processing

This function gathers various tools used in language processing,
keyword extraction, and so on.

## Language Configuration

The files [resources/languages-default.xml](resources/languages-default.xml) and [resources/languages-experimental.xml](resources/languages-experimental.xml) hold the language definitions used by the search engine;
the former is used in production and the latter in most tests that require language processing.

The search engine excludes any languages not configured in these files, though it is relatively easy to define a stub
configuration that gets a simpler behavior out of the search engine.

## Language Processing Tool

The module also houses a tool for inspecting the output of keyword extraction,
which can be accessed by running the command below from the root of the project.
The tool becomes accessible on port 8080.

```bash
$ ./gradlew :code:functions:language-processing:run
```

## Central Classes

* [SentenceExtractor](java/nu/marginalia/language/sentence/SentenceExtractor.java) -
  Creates a [DocumentLanguageData](java/nu/marginalia/language/model/DocumentLanguageData.java) from a text, containing
  its words, how they stem, POS tags, and so on.
* [LanguageConfiguration](java/nu/marginalia/language/config/LanguageConfiguration.java) - parses language configuration xml files into LanguageDefinition objects
* [LanguageDefinition](java/nu/marginalia/language/model/LanguageDefinition.java) - holds all per-language customizations that are fed into the language processing pipeline
* [DocumentKeywordExtractor](java/nu/marginalia/keyword/DocumentKeywordExtractor.java) - extracts keywords from documents

@@ -0,0 +1,109 @@
<?xml version="1.0"?>
<!DOCTYPE languages [
        <!ELEMENT languages (language*,resource*)>
        <!ELEMENT language (keywordHash,unicodeNormalization,stemmer,sentenceDetector,rdrTagger?,ngrams*)>

        <!ELEMENT resource EMPTY>
        <!ATTLIST resource
                id ID #REQUIRED
                md5 CDATA #REQUIRED
                path CDATA #REQUIRED
                href CDATA #REQUIRED
                >

        <!ATTLIST language
                isoCode ID #REQUIRED
                name CDATA #REQUIRED
                display (rtl|ltr) #REQUIRED
                disabled (true|false) "false"
                >

        <!ELEMENT unicodeNormalization EMPTY>
        <!ATTLIST unicodeNormalization
                algorithm (minimal|e-accents|german|maximal-latin) #REQUIRED
                >

        <!ELEMENT stemmer (pospattern?)>
        <!ATTLIST stemmer
                algorithm (porter|snowball|none) #REQUIRED
                variant CDATA #IMPLIED
                >

        <!ELEMENT keywordHash (#PCDATA)>
        <!ATTLIST keywordHash
                algorithm (asciish|utf8) #REQUIRED
                >

        <!ELEMENT rdrTagger EMPTY>
        <!ATTLIST rdrTagger
                dictId IDREF #REQUIRED
                rdrId IDREF #REQUIRED
                >

        <!ELEMENT ngrams (pospattern*)>
        <!ATTLIST ngrams type (noun|name|subject-suffix|title|keyword) #REQUIRED>

        <!ELEMENT pospattern (#PCDATA)>

        <!ELEMENT sentenceDetector EMPTY>
        <!ATTLIST sentenceDetector algorithm (none|opennlp) #REQUIRED>
        ]>

<languages>
    <language isoCode="en" name="English" display="ltr">
        <keywordHash algorithm="asciish" />
        <stemmer algorithm="porter">
            <pospattern>!(IN TO CC DT)</pospattern>
        </stemmer>
        <sentenceDetector algorithm="opennlp"/>
        <unicodeNormalization algorithm="maximal-latin" />
        <rdrTagger dictId="pos-dict-en" rdrId="pos-rdr-en" />
        <ngrams type="name">
            <pospattern>NNP*</pospattern>
            <pospattern>NNP* NNP*</pospattern>
            <pospattern>NNP* (NNP* IN DT CC) NNP*</pospattern>
            <pospattern>NNP* (NNP* IN DT CC) (NNP* IN DT CC) NNP*</pospattern>
        </ngrams>
        <ngrams type="noun">
            <pospattern>VBG</pospattern>
            <pospattern>RB VBG</pospattern>
            <pospattern>(NNP* JJ)</pospattern>
            <pospattern>(NN* JJ) NN*</pospattern>
            <pospattern>(NN* JJ) (NN* JJ) NN*</pospattern>
            <pospattern>(NN* JJ) (NN* JJ) (NN* JJ) NN*</pospattern>
            <pospattern>(NNP* JJ) (NNP* IN TO CC) NNP*</pospattern>
            <pospattern>(NNP* JJ) (NNP* IN TO CC) DT NNP*</pospattern>
            <pospattern>(NNP* JJ) (NNP* IN TO CC) (NNP* IN TO CC) NNP*</pospattern>
        </ngrams>
        <ngrams type="subject-suffix">
            <pospattern>(VBD VBZ)</pospattern>
            <pospattern>MD VB</pospattern>
            <pospattern>VBZ DT</pospattern>
            <pospattern>(DT RB VBD VBP VBN JJ*) (VBD VBG VBP VBN VBZ)</pospattern>
        </ngrams>
        <ngrams type="title">
            <pospattern>!(CC IN DT TO)</pospattern>
            <pospattern>!CC !(IN DT TO)</pospattern>
            <pospattern>!CC * !(IN DT TO)</pospattern>
            <pospattern>!CC * * !(IN DT TO)</pospattern>
        </ngrams>
        <ngrams type="keyword">
            <!-- length = 1 -->
            <pospattern>(N* VBG VBN JJ* R* VBG)</pospattern>
            <!-- length = 2 -->
            <pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
            <pospattern>(N* VBG VBN) CD</pospattern>
            <!-- length = 3 -->
            <pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
            <pospattern>NNP* (IN TO CC NNP*) (N* VBG VBN)</pospattern>
            <pospattern>(N* VBG VBN) (N* VBG VBN) CD</pospattern>
            <!-- length = 4 -->
            <pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
            <pospattern>NNP* (DT IN TO CC) (IN TO CC) NNP*</pospattern>
        </ngrams>
    </language>

    <resource id="pos-dict-en" md5="" path="rdr/English.DICT" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.DICT" />
    <resource id="pos-rdr-en" md5="" path="rdr/English.RDR" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.RDR" />

</languages>

@@ -0,0 +1,135 @@
<?xml version="1.0"?>
<!DOCTYPE languages [
        <!ELEMENT languages (language*,resource*)>
        <!ELEMENT language (keywordHash,unicodeNormalization,stemmer,sentenceDetector,rdrTagger?,ngrams*)>

        <!ELEMENT resource EMPTY>
        <!ATTLIST resource
                id ID #REQUIRED
                md5 CDATA #REQUIRED
                path CDATA #REQUIRED
                href CDATA #REQUIRED
                >

        <!ATTLIST language
                isoCode ID #REQUIRED
                name CDATA #REQUIRED
                display (rtl|ltr) #REQUIRED
                disabled (true|false) "false"
                >

        <!ELEMENT unicodeNormalization EMPTY>
        <!ATTLIST unicodeNormalization
                algorithm (minimal|e-accents|german|maximal-latin) #REQUIRED
                >

        <!ELEMENT stemmer (pospattern?)>
        <!ATTLIST stemmer
                algorithm (porter|snowball|none) #REQUIRED
                variant CDATA #IMPLIED
                >

        <!ELEMENT keywordHash (#PCDATA)>
        <!ATTLIST keywordHash
                algorithm (asciish|utf8) #REQUIRED
                >

        <!ELEMENT rdrTagger EMPTY>
        <!ATTLIST rdrTagger
                dictId IDREF #REQUIRED
                rdrId IDREF #REQUIRED
                >

        <!ELEMENT ngrams (pospattern*)>
        <!ATTLIST ngrams type (noun|name|subject-suffix|title|keyword) #REQUIRED>

        <!ELEMENT pospattern (#PCDATA)>

        <!ELEMENT sentenceDetector EMPTY>
        <!ATTLIST sentenceDetector algorithm (none|opennlp) #REQUIRED>
        ]>

<languages>
    <language isoCode="en" name="English" display="ltr">
        <keywordHash algorithm="asciish" />
        <stemmer algorithm="porter">
            <pospattern>!(IN TO CC DT)</pospattern>
        </stemmer>
        <sentenceDetector algorithm="opennlp"/>
        <unicodeNormalization algorithm="maximal-latin" />
        <rdrTagger dictId="pos-dict-en" rdrId="pos-rdr-en" />
        <ngrams type="name">
            <pospattern>NNP*</pospattern>
            <pospattern>NNP* NNP*</pospattern>
            <pospattern>NNP* (NNP* IN DT CC) NNP*</pospattern>
            <pospattern>NNP* (NNP* IN DT CC) (NNP* IN DT CC) NNP*</pospattern>
        </ngrams>
        <ngrams type="noun">
            <pospattern>VBG</pospattern>
            <pospattern>RB VBG</pospattern>
            <pospattern>(NNP* JJ)</pospattern>
            <pospattern>(NN* JJ) NN*</pospattern>
            <pospattern>(NN* JJ) (NN* JJ) NN*</pospattern>
            <pospattern>(NN* JJ) (NN* JJ) (NN* JJ) NN*</pospattern>
            <pospattern>(NNP* JJ) (NNP* IN TO CC) NNP*</pospattern>
            <pospattern>(NNP* JJ) (NNP* IN TO CC) DT NNP*</pospattern>
            <pospattern>(NNP* JJ) (NNP* IN TO CC) (NNP* IN TO CC) NNP*</pospattern>
        </ngrams>
        <ngrams type="subject-suffix">
            <pospattern>(VBD VBZ)</pospattern>
            <pospattern>MD VB</pospattern>
            <pospattern>VBZ DT</pospattern>
            <pospattern>(DT RB VBD VBP VBN JJ*) (VBD VBG VBP VBN VBZ)</pospattern>
        </ngrams>
        <ngrams type="title">
            <pospattern>!(CC IN DT TO)</pospattern>
            <pospattern>!CC !(IN DT TO)</pospattern>
            <pospattern>!CC * !(IN DT TO)</pospattern>
            <pospattern>!CC * * !(IN DT TO)</pospattern>
        </ngrams>
        <ngrams type="keyword">
            <!-- length = 1 -->
            <pospattern>(N* VBG VBN JJ* R* VBG)</pospattern>
            <!-- length = 2 -->
            <pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
            <pospattern>(N* VBG VBN) CD</pospattern>
            <!-- length = 3 -->
            <pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
            <pospattern>NNP* (IN TO CC NNP*) (N* VBG VBN)</pospattern>
            <pospattern>(N* VBG VBN) (N* VBG VBN) CD</pospattern>
            <!-- length = 4 -->
            <pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
            <pospattern>NNP* (DT IN TO CC) (IN TO CC) NNP*</pospattern>
        </ngrams>

    </language>
    <language isoCode="sv" name="Swedish" display="ltr">
        <keywordHash algorithm="asciish" />
        <stemmer algorithm="snowball" variant="SWEDISH" />
        <sentenceDetector algorithm="opennlp"/>
        <unicodeNormalization algorithm="e-accents" />
        <rdrTagger dictId="pos-dict-sv" rdrId="pos-rdr-sv" />
        <ngrams type="name">
            <pospattern>PROPN</pospattern>
            <pospattern>PROPN PROPN</pospattern>
            <pospattern>PROPN PROPN PROPN</pospattern>
            <pospattern>PROPN PROPN PROPN PROPN</pospattern>
        </ngrams>
    </language>
    <language isoCode="fr" name="French" display="ltr">
        <keywordHash algorithm="asciish" />
        <stemmer algorithm="snowball" variant="FRENCH" />
        <sentenceDetector algorithm="opennlp"/>
    </language>
    <language isoCode="de" name="German" display="ltr">
        <keywordHash algorithm="asciish" />
        <stemmer algorithm="snowball" variant="GERMAN" />
        <sentenceDetector algorithm="opennlp"/>
        <unicodeNormalization algorithm="german" />
    </language>

    <resource id="pos-dict-en" md5="" path="rdr/English.DICT" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.DICT" />
    <resource id="pos-rdr-en" md5="" path="rdr/English.RDR" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.RDR" />
    <resource id="pos-dict-sv" md5="" path="rdr/Swedish.DICT" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/ud-treebanks-v2.4/UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu.UPOS.DICT" />
    <resource id="pos-rdr-sv" md5="" path="rdr/Swedish.RDR" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/ud-treebanks-v2.4/UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu.UPOS.RDR" />

</languages>

@@ -0,0 +1,277 @@
@import it.unimi.dsi.fastutil.ints.IntList
@import nu.marginalia.language.model.WordRep
@import nu.marginalia.language.model.DocumentSentence
@import nu.marginalia.language.model.LanguageDefinition
@import java.util.*
@import java.util.stream.Collectors
@import java.util.stream.IntStream

@param String textSample
@param LanguageDefinition language
@param List<DocumentSentence> sentences
@param Map<Long, String> tagColors
@param Collection<WordRep> tfIdfReps
@param Collection<WordRep> titleReps
@param Collection<WordRep> nameLikeReps
@param Collection<WordRep> subjectLikeReps
@param Collection<String> artifacts
@param Collection<String> importantWords
@param Map<String, IntList> positionedWords


<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>NLP Debug Tool</title>
    <script src="https://cdn.tailwindcss.com"></script>
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
    <style>
        .sentence-boundary {
            border-left: 3px solid #3b82f6;
        }
        ruby rt {
            font-size: 0.65em;
            color: #6b7280;
        }
    </style>
</head>
<body class="bg-gray-50 min-h-screen">
<div class="container mx-auto px-4 py-8 max-w-6xl">
    <!-- Header -->
    <div class="mb-8">
        <h1 class="text-3xl font-bold text-gray-900 mb-2">
            <i class="fas fa-microscope text-blue-600 mr-3"></i>
            Language Processing Debug Tool
        </h1>
        <p class="text-gray-600">Inspect and debug text processing pipeline components</p>
    </div>

    <!-- Input Section -->
    <div class="bg-white rounded-lg shadow-sm border border-gray-200 mb-6">
        <form method="post">
            <div class="p-4 border-b border-gray-200">
                <h2 class="text-lg font-semibold text-gray-900 mb-3">
                    <i class="fas fa-edit text-green-600 mr-2"></i>
                    Input Text
                </h2>

                <textarea name="textSample"
                          class="w-full p-4 border border-gray-300 rounded-md focus:ring-2 focus:ring-blue-500 focus:border-blue-500 resize-none"
                          rows="4"
                          placeholder="Enter your text here to analyze...">${textSample}</textarea>
                <div class="flex justify-between items-center mt-3">
                    <button class="px-4 py-2 bg-blue-600 text-white rounded-md hover:bg-blue-700 transition-colors">
                        <i class="fas fa-cog mr-2"></i>Analyze
                    </button>
                </div>
            </div>
        </form>
    </div>

    <!-- Results Grid -->
    <div class="space-y-6">

        <!-- Sentence Breakdown with POS Tags -->
        <div class="bg-white rounded-lg shadow-sm border border-gray-200">
            <div class="p-4 border-b border-gray-200">
                <h2 class="text-lg font-semibold text-gray-900">
                    <i class="fas fa-list-ol text-purple-600 mr-2"></i>
                    Sentence Breakdown & POS Tags
                </h2>
                @if (language != null)
                    <div class="text-sm text-gray-500 mt-1">Auto-detected: ${language.displayName()} (${language.isoCode()})</div>
                @endif
            </div>
            @if (sentences != null)
                @for (DocumentSentence sentence : sentences)
                    <div class="p-4 space-y-4">
                        <div class="sentence-boundary pl-4 py-4 rounded">
                            @for (int pos : IntStream.range(0, sentence.length()).toArray())
                                <ruby class="p-4">
                                    @if (language.hasPosParsing())
                                        <span class="text-xl font-serif ${tagColors.get(sentence.posTags[pos])}">
                                            ${sentence.wordsLowerCase[pos]}
                                        </span>
                                        <rt>
                                            ${language.decodePosTagName(sentence.posTags[pos])}

                                            @if (sentence.isAllCaps(pos))
                                                <i class="fa-solid fa-angles-up"></i>
                                            @elseif (sentence.isCapitalized(pos))
                                                <i class="fa-solid fa-arrow-up"></i>
                                            @endif
                                        </rt>
                                    @else <!-- pos tags disabled -->
                                        <span class="text-xl font-serif">
                                            ${sentence.wordsLowerCase[pos]}
                                        </span>
                                        <rt>
                                            @if (sentence.isAllCaps(pos))
                                                <i class="fa-solid fa-angles-up"></i>
                                            @elseif (sentence.isCapitalized(pos))
                                                <i class="fa-solid fa-arrow-up"></i>
                                            @endif
                                        </rt>
                                    @endif
                                </ruby>
                                @if (sentence.isSeparatorComma(pos))
                                    <i class="fa-regular fa-circle"></i>
                                @endif
                            @endfor
                        </div>
                    </div>
                @endfor
            @endif
        </div>

        <!-- Keywords & N-grams -->
        <div class="bg-white rounded-lg shadow-sm border border-gray-200">
            <div class="p-4 border-b border-gray-200">
                <h2 class="text-lg font-semibold text-gray-900">
                    <i class="fas fa-key text-indigo-600 mr-2"></i>
                    Keywords & N-grams
                </h2>
            </div>
            <div class="p-4">
                <div class="grid grid-cols-1 md:grid-cols-3 gap-6">
                    <!-- Keywords -->
                    @if (tfIdfReps != null && !tfIdfReps.isEmpty())
                        <div>
                            <h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
                                <i class="fas fa-star text-yellow-500 mr-2"></i>
                                Keywords (TF-IDF)
                            </h3>
                            <div class="space-y-2">
                                @for (WordRep rep : tfIdfReps)
                                    <div class="flex justify-between items-center p-2 bg-gray-50 rounded">
                                        <span class="text-sm font-medium">${rep.word}</span>
                                        @if (rep.length > 1)
                                            <span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">${rep.length}</span>
                                        @endif
                                    </div>
                                @endfor
                            </div>
                        </div>
                    @endif
                    @if (nameLikeReps != null && !nameLikeReps.isEmpty())
                        <div>
                            <h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
                                <i class="fas fa-star text-yellow-500 mr-2"></i>
                                Name-Like
                            </h3>
                            <div class="space-y-2">
                                @for (WordRep rep : nameLikeReps)
                                    <div class="flex justify-between items-center p-2 bg-gray-50 rounded">
                                        <span class="text-sm font-medium">${rep.word}</span>
                                        @if (rep.length > 1)
                                            <span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">${rep.length}</span>
                                        @endif
                                    </div>
                                @endfor
                            </div>
                        </div>
                    @endif
                    @if (subjectLikeReps != null && !subjectLikeReps.isEmpty())
                        <div>
                            <h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
                                <i class="fas fa-star text-yellow-500 mr-2"></i>
                                Subject-Like
                            </h3>
                            <div class="space-y-2">
                                @for (WordRep rep : subjectLikeReps)
                                    <div class="flex justify-between items-center p-2 bg-gray-50 rounded">
                                        <span class="text-sm font-medium">${rep.word}</span>
                                        @if (rep.length > 1)
                                            <span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">${rep.length}</span>
                                        @endif
                                    </div>
                                @endfor
                            </div>
                        </div>
                    @endif
                    @if (titleReps != null && !titleReps.isEmpty())
                        <div>
                            <h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
                                <i class="fas fa-star text-yellow-500 mr-2"></i>
                                Title
                            </h3>
                            <div class="space-y-2">
                                @for (WordRep rep : titleReps)
                                    <div class="flex justify-between items-center p-2 bg-gray-50 rounded">
                                        <span class="text-sm font-medium">${rep.word}</span>
                                        @if (rep.length > 1)
                                            <span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">${rep.length}</span>
                                        @endif
                                    </div>
                                @endfor
                            </div>
                        </div>
                    @endif
                    @if (artifacts != null && !artifacts.isEmpty())
                        <div>
                            <h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
                                <i class="fas fa-star text-yellow-500 mr-2"></i>
                                Artifacts
                            </h3>
                            <div class="space-y-2">
                                @for (String word : artifacts)
                                    <div class="flex justify-between items-center p-2 bg-gray-50 rounded">
                                        <span class="text-sm font-medium">${word}</span>
                                    </div>
                                @endfor
                            </div>
                        </div>
                    @endif

                    @if (importantWords != null && !importantWords.isEmpty())
                        <div>
                            <h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
                                <i class="fas fa-star text-yellow-500 mr-2"></i>
                                Important Words
                            </h3>
                            <div class="space-y-2">
                                @for (String word : importantWords)
                                    <div class="flex justify-between items-center p-2 bg-gray-50 rounded">
                                        <span class="text-sm font-medium">${word}</span>
                                    </div>
                                @endfor
                            </div>
                        </div>
                    @endif
                </div>
            </div>
        </div>
        <!-- Full simulation outcome from keyword extraction -->
        <div class="bg-white rounded-lg shadow-sm border border-gray-200">
            <div class="p-4 border-b border-gray-200">
                <h2 class="text-lg font-semibold text-gray-900">
                    <i class="fas fa-list-ol text-purple-600 mr-2"></i>
                    Outcome
                </h2>
            </div>
            <div class="p-4">
                @if (positionedWords != null && !positionedWords.isEmpty())
                    <div>
                        <h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
                            <i class="fas fa-star text-yellow-500 mr-2"></i>
                            Positioned Words
                        </h3>
                        <div class="space-y-2">
                            @for (String word : positionedWords.keySet())
                                <div class="flex justify-between items-center p-2 bg-gray-50 rounded">
                                    <span class="text-sm font-medium">${word}</span>
                                    <span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">${positionedWords.get(word).stream().map(Object::toString).collect(Collectors.joining(", "))}</span>
                                </div>
                            @endfor
                        </div>
                    </div>
                @endif
            </div>
        </div>

    </div>
</div>
</body>
</html>

@@ -1,19 +1,23 @@
package nu.marginalia.keyword;

import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.dom.DomPruningFilter;
import nu.marginalia.language.config.LanguageConfigLocation;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.UnsupportedLanguageException;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;

import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
@@ -23,10 +27,16 @@ import java.util.Set;
class DocumentKeywordExtractorTest {

    static DocumentKeywordExtractor extractor = new DocumentKeywordExtractor();
    static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
    static SentenceExtractor se;


    @BeforeAll
    public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
        se = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental()), WmsaHome.getLanguageModels());
    }

    @Test
    public void testKeyboards2() throws IOException, URISyntaxException {
    public void testKeyboards2() throws IOException, URISyntaxException, UnsupportedLanguageException {
        var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
                "Could not load word frequency table");
        String html = new String(resource.readAllBytes(), Charset.defaultCharset());
@@ -44,7 +54,7 @@ class DocumentKeywordExtractorTest {


    @Test
    public void testMadonna() throws IOException, URISyntaxException {
    public void testMadonna() throws IOException, URISyntaxException, UnsupportedLanguageException {
        var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/madonna.html"),
                "Could not load word frequency table");
        String html = new String(resource.readAllBytes(), Charset.defaultCharset());
@@ -56,19 +66,19 @@ class DocumentKeywordExtractorTest {
                new LinkTexts(), new EdgeUrl("https://encyclopedia.marginalia.nu/article/Don't_Tell_Me_(Madonna_song)")
        );

        var keywordsBuilt = keywords.build(ByteBuffer.allocate(1024));
        var keywordsBuilt = keywords.build();

        Map<String, Byte> flags = new HashMap<>();
        Map<String, CodedSequence> positions = new HashMap<>();

        for (int i = 0; i < keywordsBuilt.size(); i++) {
            String keyword = keywordsBuilt.keywords.get(i);
            byte metadata = keywordsBuilt.metadata[i]
            String keyword = keywordsBuilt.keywords().get(i);
            byte metadata = keywordsBuilt.metadata()[i]
            ;

            if (Set.of("dirty", "blues").contains(keyword)) {
                flags.put(keyword, metadata);
                positions.put(keyword, keywordsBuilt.positions.get(i));
                positions.put(keyword, keywordsBuilt.positions().get(i));

            }
        }
@@ -81,17 +91,4 @@ class DocumentKeywordExtractorTest {
        );
    }

    @Test
    public void testSpam() throws IOException, URISyntaxException {
        var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/spam.html"),
                "Could not load word frequency table");
        String html = new String(resource.readAllBytes(), Charset.defaultCharset());
        var doc = Jsoup.parse(html);
        doc.filter(new DomPruningFilter(0.5));

        DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(
                new TermFrequencyDict(WmsaHome.getLanguageModels()));
        SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());

    }
}

@@ -5,14 +5,23 @@ import gnu.trove.list.array.TIntArrayList;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.WmsaHome;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.keyword.model.DocumentWordSpan;
import nu.marginalia.language.config.LanguageConfigLocation;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.sentence.tag.HtmlTag;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
import org.xml.sax.SAXException;

import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.List;
@@ -20,8 +29,21 @@ import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;

class DocumentPositionMapperTest {
    private final DocumentPositionMapper positionMapper = new DocumentPositionMapper();
    static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
    private static LanguageDefinition english;
    private DocumentPositionMapper positionMapper;
    static SentenceExtractor se;

    @BeforeAll
    public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
        var config = new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental());
        se = new SentenceExtractor(config, WmsaHome.getLanguageModels());
        english = config.getLanguage("en");
    }

    @BeforeEach
    public void setUp() {
        positionMapper = new DocumentPositionMapper();
    }

    @Test
    public void testWordPattern() {
@@ -43,8 +65,8 @@ class DocumentPositionMapperTest {
    @Test
    public void testBasic() {
        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();
        DocumentLanguageData dld = new DocumentLanguageData(
                se.extractSentencesFromString("I am a teapot, short and stout", EnumSet.of(HtmlTag.CODE)),
        DocumentLanguageData dld = new DocumentLanguageData(english,
                se.extractSentencesFromString(english, "I am a teapot, short and stout", EnumSet.of(HtmlTag.CODE)),
                "I am a teapot"
        );

@@ -72,7 +94,7 @@ class DocumentPositionMapperTest {
    public void testLinksSingleWord1Rep() {
        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();

        var sentences = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
        var sentences = se.extractSentencesFromString(english, "Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
        assertEquals(1, sentences.size());
        TIntList counts = new TIntArrayList(new int[] { 1 });

@@ -93,7 +115,7 @@ class DocumentPositionMapperTest {
    public void testLinksSingleWord2Reps() {
        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();

        var sentences = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
        var sentences = se.extractSentencesFromString(english, "Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
        assertEquals(1, sentences.size());
        TIntList counts = new TIntArrayList(new int[] { 4 }); // This will become 2 repetitions, formula is ~ sqrt(counts)

@@ -105,7 +127,7 @@ class DocumentPositionMapperTest {
        var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
        assertEquals(2, linkTextSpans.size());

        DocumentKeywordsBuilder.DocumentWordSpan span;
        DocumentWordSpan span;
        span = linkTextSpans.get(0);

        assertEquals(6, span.start());
@@ -121,7 +143,7 @@ class DocumentPositionMapperTest {
    public void testLinksTwoWords2Reps() {
        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();

        var sentences = se.extractSentencesFromString("Zelda II", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
        var sentences = se.extractSentencesFromString(english, "Zelda II", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
        assertEquals(1, sentences.size());
        TIntList counts = new TIntArrayList(new int[] { 4 });

@@ -134,7 +156,7 @@ class DocumentPositionMapperTest {
        var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
        assertEquals(2, linkTextSpans.size());

        DocumentKeywordsBuilder.DocumentWordSpan span;
        DocumentWordSpan span;
        span = linkTextSpans.get(0);

        assertEquals(6, span.start());
@@ -151,8 +173,8 @@ class DocumentPositionMapperTest {
    public void testLinksTwoSent1Word1Rep() {
        DocumentKeywordsBuilder keywordsBuilder = new DocumentKeywordsBuilder();

        var sentences1 = se.extractSentencesFromString("Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
        var sentences2 = se.extractSentencesFromString("Link", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
        var sentences1 = se.extractSentencesFromString(english, "Zelda", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
        var sentences2 = se.extractSentencesFromString(english, "Link", EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
        assertEquals(1, sentences1.size());
        assertEquals(1, sentences2.size());
        TIntList counts = new TIntArrayList(new int[] { 1, 1 });
@@ -170,7 +192,7 @@ class DocumentPositionMapperTest {
        var linkTextSpans = keywordsBuilder.wordSpans.get(HtmlTag.EXTERNAL_LINKTEXT);
        assertEquals(2, linkTextSpans.size());

        DocumentKeywordsBuilder.DocumentWordSpan span;
        DocumentWordSpan span;
        span = linkTextSpans.get(0);

        assertEquals(6, span.start());
@@ -2,15 +2,22 @@ package nu.marginalia.keyword;

import nu.marginalia.LanguageModels;
import nu.marginalia.WmsaHome;
+import nu.marginalia.language.config.LanguageConfigLocation;
+import nu.marginalia.language.config.LanguageConfiguration;
+import nu.marginalia.language.model.LanguageDefinition;
+import nu.marginalia.language.model.UnsupportedLanguageException;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
-import nu.marginalia.test.util.TestLanguageModels;
+import nu.marginalia.util.TestLanguageModels;
import org.jsoup.Jsoup;
+import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
+import org.xml.sax.SAXException;

+import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
@@ -23,9 +30,19 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
class SentenceExtractorTest {
static final LanguageModels lm = TestLanguageModels.getLanguageModels();

-static SentenceExtractor se = new SentenceExtractor(lm);
+static SentenceExtractor se;
+private static LanguageDefinition english;

-public static void main(String... args) throws IOException, URISyntaxException {
+@BeforeAll
+public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
+var config = new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental());
+se = new SentenceExtractor(config, WmsaHome.getLanguageModels());
+english = config.getLanguage("en");
+}
+
+public static void main(String... args) throws IOException, URISyntaxException, UnsupportedLanguageException {
+final LanguageModels lm = TestLanguageModels.getLanguageModels();

var data = WmsaHome.getHomePath().resolve("test-data/");
@@ -58,7 +75,7 @@ class SentenceExtractorTest {

@Test
public void testACDC() {
-var ret = se.extractSentence("AC/DC is a rock band.", EnumSet.noneOf(HtmlTag.class));
+var ret = se.extractSentence(english, "AC/DC is a rock band.", EnumSet.noneOf(HtmlTag.class));
assertEquals("ac/dc", ret.wordsLowerCase[0]);
}
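Taken together, these hunks replace the one-argument `SentenceExtractor(lm)` with a configuration-driven, per-language API. Condensing the new wiring from the diff into one place (all names are as they appear above; the surrounding test scaffolding is omitted):

    // Per-language setup, as introduced in the hunks above.
    var config = new LanguageConfiguration(WmsaHome.getLanguageModels(),
                                           new LanguageConfigLocation.Experimental());
    var se = new SentenceExtractor(config, WmsaHome.getLanguageModels());
    LanguageDefinition english = config.getLanguage("en");

    // Extraction calls now name the language explicitly:
    var ret = se.extractSentence(english, "AC/DC is a rock band.", EnumSet.noneOf(HtmlTag.class));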
@@ -0,0 +1,28 @@
package nu.marginalia.keyword.extractors;

import nu.marginalia.language.config.LanguageConfigLocation;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.util.TestLanguageModels;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;

import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;

import static org.junit.jupiter.api.Assertions.assertTrue;

class ArtifactKeywordsTest {

    @Test
    public void testExtractArtifacts() throws IOException, ParserConfigurationException, SAXException {
        SentenceExtractor se = new SentenceExtractor(new LanguageConfiguration(TestLanguageModels.getLanguageModels(), new LanguageConfigLocation.Experimental()), TestLanguageModels.getLanguageModels());

        var artifacts = new ArtifactKeywords(se.extractSentences("Hello I'm <vlofgren@marginalia.nu>, what's up?", "hello!"));
        System.out.println(artifacts.getWords());
        assertTrue(artifacts.getWords().contains("vlofgren"));
        assertTrue(artifacts.getWords().contains("marginalia.nu"));
        assertTrue(artifacts.getWords().contains("@marginalia.nu"));
        assertTrue(artifacts.getWords().contains("vlofgren@marginalia.nu"));
    }
}
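The four assertions above pin down how an embedded e-mail address is expected to be indexed: under the local part, the host, the host with a leading `@`, and the full address. A hypothetical decomposition that produces exactly those variants (an illustration of the test's expectations, not ArtifactKeywords' actual logic):

    import java.util.List;

    // Hypothetical split of "vlofgren@marginalia.nu" into the four
    // variants the test asserts; not the library's real implementation.
    static List<String> emailVariants(String email) {
        int at = email.indexOf('@');
        String user = email.substring(0, at);   // "vlofgren"
        String host = email.substring(at + 1);  // "marginalia.nu"
        return List.of(user, host, "@" + host, email);
    }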
@@ -2,13 +2,18 @@ package nu.marginalia.keyword.extractors;

import com.google.common.collect.Sets;
import nu.marginalia.WmsaHome;
-import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
-import nu.marginalia.keyword.KeywordExtractor;
+import nu.marginalia.dom.DomPruningFilter;
+import nu.marginalia.language.config.LanguageConfigLocation;
+import nu.marginalia.language.config.LanguageConfiguration;
+import nu.marginalia.language.model.LanguageDefinition;
+import nu.marginalia.language.model.UnsupportedLanguageException;
import nu.marginalia.language.sentence.SentenceExtractor;
-import nu.marginalia.test.util.TestLanguageModels;
import org.jsoup.Jsoup;
+import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
+import org.xml.sax.SAXException;

+import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Collections;
@@ -44,37 +49,43 @@ class NameLikeKeywordsTest {
later known as Augustus, rose to sole power after defeating his opponents in the last civil war of
the Roman Republic. Octavian set about solidifying his power, and the era of the Roman Empire began.
""";
+static SentenceExtractor se;
+static LanguageConfiguration lc;
+static LanguageDefinition en;

+@BeforeAll
+public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
+se = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental()), WmsaHome.getLanguageModels());
+lc = new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental());
+en = lc.getLanguage("en");
+}

@Test
public void test() {
-SentenceExtractor se = new SentenceExtractor(TestLanguageModels.getLanguageModels());
-NameLikeKeywords keywords = new NameLikeKeywords(new KeywordExtractor(), se.extractSentences(text, "Julius Caesar"), 2);
+NameLikeKeywords keywords = new NameLikeKeywords(se.extractSentences(text, "Julius Caesar"), 2);
Set<String> actual = keywords.getReps().stream().map(rep -> rep.word).collect(Collectors.toSet());
Set<String> expected = Set.of("caesar", "senate", "roman", "republic", "roman_republic");

// rome isn't counted because PorterStemmer is derp

System.out.println(actual);
System.out.println(expected);
assertEquals(Collections.emptySet(), Sets.symmetricDifference(actual, expected));
}

@Test
-public void testWikiArticle() throws IOException {
+public void testWikiArticle() throws IOException, UnsupportedLanguageException {
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/java.html"),
"Could not load word frequency table");
String html = new String(resource.readAllBytes(), Charset.defaultCharset());
var doc = Jsoup.parse(html);
doc.filter(new DomPruningFilter(0));

-SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
-
-var ke = new KeywordExtractor();
-
-var nameWords = new NameLikeKeywords(ke, se.extractSentences(doc), 2);
+var nameWords = new NameLikeKeywords(se.extractSentences(doc), 2);
System.out.println("Names: " + nameWords.words());
}

@Test
-public void testWikiArticleP1() {
+public void testWikiArticleP1() throws UnsupportedLanguageException {
String html = """
<p><b>Java</b> is a high-level, class-based, object-oriented programming language that is designed to have as few implementation dependencies as possible. It is a general-purpose programming language intended to let programmers <i>write once, run anywhere</i> (WORA), meaning that compiled Java code can run on all platforms that support Java without the need to recompile. Java applications are typically compiled to bytecode that can run on any Java virtual machine (JVM) regardless of the underlying computer architecture. The syntax of Java is similar to C and C++, but has fewer low-level facilities than either of them. The Java runtime provides dynamic capabilities (such as reflection and runtime code modification) that are typically not available in traditional compiled languages. As of 2019 , Java was one of the most popular programming languages in use according to GitHub, particularly for client–server web applications, with a reported 9 million developers.</p>
<p>Java was originally developed by James Gosling at Sun Microsystems. It was released in May 1995 as a core component of Sun Microsystems' Java platform. The original and reference implementation Java compilers, virtual machines, and class libraries were originally released by Sun under proprietary licenses. As of May 2007, in compliance with the specifications of the Java Community Process, Sun had relicensed most of its Java technologies under the GPL-2.0-only license. Oracle offers its own HotSpot Java Virtual Machine, however the official reference implementation is the OpenJDK JVM which is free open-source software and used by most developers and is the default JVM for almost all Linux distributions.</p>
@@ -82,11 +93,7 @@ class NameLikeKeywordsTest {
var doc = Jsoup.parse(html);
doc.filter(new DomPruningFilter(0));

-SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
-
-var ke = new KeywordExtractor();
-
-var nameWords = new NameLikeKeywords(ke, se.extractSentences(doc), 2);
+var nameWords = new NameLikeKeywords(se.extractSentences(doc), 2);
System.out.println("Names: " + nameWords.words());
}
}
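Both wiki-article tests in this file follow the same preprocessing pattern: parse with Jsoup, prune the DOM in place, then extract sentences. As it appears in the hunks above (the meaning of DomPruningFilter's constructor argument is not shown in this diff; 0 is simply what the tests pass):

    var doc = Jsoup.parse(html);          // parse the raw HTML
    doc.filter(new DomPruningFilter(0));  // prune the DOM in place
    var nameWords = new NameLikeKeywords(se.extractSentences(doc), 2);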
@@ -1,12 +1,17 @@
package nu.marginalia.keyword.extractors;

import com.google.common.collect.Sets;
-import nu.marginalia.keyword.KeywordExtractor;
+import nu.marginalia.WmsaHome;
+import nu.marginalia.language.config.LanguageConfigLocation;
+import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
-import nu.marginalia.test.util.TestLanguageModels;
+import nu.marginalia.util.TestLanguageModels;
+import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
+import org.xml.sax.SAXException;

+import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.util.Collections;
import java.util.Set;
@@ -41,21 +46,28 @@ class SubjectLikeKeywordsTest {
the Roman Republic. Octavian set about solidifying his power, and the era of the Roman Empire began.
""";

+static SentenceExtractor se;
+
+@BeforeAll
+public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
+se = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental()), WmsaHome.getLanguageModels());
+}
+
@Test
public void test() throws IOException {
var lm = TestLanguageModels.getLanguageModels();
var dict = new TermFrequencyDict(lm);

-SentenceExtractor se = new SentenceExtractor(lm);
var dld = se.extractSentences(text, "Julius Caesar");

-WordsTfIdfCounts tfIdfCounts = new WordsTfIdfCounts(dict, new KeywordExtractor(), dld);
-SubjectLikeKeywords keywords = new SubjectLikeKeywords(new KeywordExtractor(),
-tfIdfCounts,
-dld);
+WordsTfIdfCounts tfIdfCounts = new WordsTfIdfCounts(dict, dld);
+SubjectLikeKeywords keywords = new SubjectLikeKeywords(tfIdfCounts, dld);

Set<String> actual = keywords.getReps().stream().map(rep -> rep.word).collect(Collectors.toSet());
-Set<String> expected = Set.of("republic", "authoritarian_reforms", "political_alliance_that", "power_as_populares", "caesar", "reforms", "populares", "senate", "sole_power_after", "pompey", "civil_wars", "wars", "governmental_reforms", "government_of_the_republic");
+Set<String> expected = Set.of("populares", "republic", "authoritarian_reforms", "senate", "pompey", "civil_wars", "octavian", "caesar");

System.out.println(actual);
System.out.println(expected);

assertEquals(Collections.emptySet(), Sets.symmetricDifference(actual, expected));
}
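The hunk above drops KeywordExtractor from the call chain; both classes now take the extracted document data directly. The new wiring, condensed from the diff (same fixture names as above):

    var dict = new TermFrequencyDict(lm);                  // term-frequency dictionary
    var dld = se.extractSentences(text, "Julius Caesar");  // sentence-extracted document
    var tfIdfCounts = new WordsTfIdfCounts(dict, dld);
    var keywords = new SubjectLikeKeywords(tfIdfCounts, dld);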
@@ -1,13 +1,18 @@
package nu.marginalia.keyword.extractors;

import com.google.common.collect.Sets;
-import nu.marginalia.keyword.KeywordExtractor;
+import nu.marginalia.language.config.LanguageConfigLocation;
+import nu.marginalia.language.config.LanguageConfiguration;
+import nu.marginalia.language.model.UnsupportedLanguageException;
import nu.marginalia.language.sentence.SentenceExtractor;
-import nu.marginalia.test.util.TestLanguageModels;
+import nu.marginalia.util.TestLanguageModels;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
+import org.xml.sax.SAXException;

+import javax.xml.parsers.ParserConfigurationException;
+import java.io.IOException;
import java.util.Collections;
import java.util.Set;
import java.util.stream.Collectors;
@@ -187,12 +192,13 @@ class TitleKeywordsTest {
""";

@Test
-public void extractTitleWords() {
-var se = new SentenceExtractor(TestLanguageModels.getLanguageModels());
+public void extractTitleWords() throws IOException, ParserConfigurationException, SAXException, UnsupportedLanguageException {
+var languageConfiguration = new LanguageConfiguration(TestLanguageModels.getLanguageModels(), new LanguageConfigLocation.Experimental());
+var se = new SentenceExtractor(languageConfiguration, TestLanguageModels.getLanguageModels());

var dld = se.extractSentences(Jsoup.parse(document));

-var reps = new TitleKeywords(new KeywordExtractor(), dld).getReps();
+var reps = new TitleKeywords(dld).getReps();
var words = reps.stream().map(rep -> rep.word).collect(Collectors.toSet());

Set<String> expected = Set.of(
@@ -0,0 +1,77 @@
package nu.marginalia.language.config;

import it.unimi.dsi.fastutil.longs.LongList;
import nu.marginalia.language.filter.TestLanguageModels;
import nu.marginalia.language.pos.PosPattern;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;

import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;

import static org.junit.jupiter.api.Assertions.assertEquals;

public class LanguageConfigurationTestFile {
    private static LanguageConfiguration languageConfiguration;

    @BeforeAll
    public static void setUpAll() throws IOException, SAXException, ParserConfigurationException {
        languageConfiguration = new LanguageConfiguration(TestLanguageModels.getLanguageModels(), new LanguageConfigLocation.Experimental());
    }

    @Test
    void testBasic() {
        Assertions.assertNotNull(languageConfiguration.getLanguage("en"));
        Assertions.assertNotNull(languageConfiguration.getLanguage("sv"));
        Assertions.assertNull(languageConfiguration.getLanguage("!!"));
    }

    @Test
    public void testStemming() {
        var svStemmer = languageConfiguration.getLanguage("sv").stemmer();
        var enStemmer = languageConfiguration.getLanguage("en").stemmer();

        Assertions.assertNotNull(svStemmer);
        Assertions.assertNotNull(enStemmer);

        assertEquals("bil", svStemmer.stem("bilar"));
        assertEquals("dogged", svStemmer.stem("dogged"));
        assertEquals("bilar", enStemmer.stem("bilar"));
        assertEquals("dog", enStemmer.stem("dogged"));
    }

    @Test
    public void testPosData() {
        var svPos = languageConfiguration.getLanguage("sv").posTagger();
        var enPos = languageConfiguration.getLanguage("en").posTagger();

        Assertions.assertNotNull(svPos);
        Assertions.assertNotNull(enPos);

        System.out.println(enPos);
        System.out.println(svPos);

        Assertions.assertNotEquals(svPos.tagDict, enPos.tagDict);
    }

    @Test
    public void testPosPattern() {
        var enPos = languageConfiguration.getLanguage("en").posTagger();

        System.out.println(new PosPattern(enPos, "NNP").pattern);
        System.out.println(new PosPattern(enPos, "NNP").pattern);
        System.out.println(new PosPattern(enPos, "NNP NNPS").pattern);
        System.out.println(new PosPattern(enPos, "NNPS (NNPS DT) DT").pattern);
        System.out.println(new PosPattern(enPos,
                "(NNP NNPS) (NNP NNPS IN DT CC) (NNP NNPS IN DT CC) (NNP NNPS)").pattern);

        assertEquals(new PosPattern(enPos, "NNP*").pattern,
                new PosPattern(enPos, "(NNP NNPS)").pattern);
        assertEquals(LongList.of(0L), new PosPattern(enPos, "Hello").pattern);
        assertEquals(0, (new PosPattern(enPos, "(NNP NNPS)").pattern.getFirst() & new PosPattern(enPos, "!(NNP NNPS)").pattern.getFirst()));
        assertEquals(new PosPattern(enPos, "(NNP NNPS)").pattern.getFirst().longValue(), new PosPattern(enPos, "*").pattern.getFirst() ^ new PosPattern(enPos, "!(NNP NNPS)").pattern.getFirst());
    }
}
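The last two assertions in testPosPattern treat a pattern term as a bitmask over the tag dictionary: `*` is every tag, `!(NNP NNPS)` is the complement of `(NNP NNPS)`, so a mask ANDed with its complement is 0 and `*` XORed with the complement recovers the mask. A standalone illustration with plain longs (the bit positions and dictionary size are made up for the example):

    // Bitmask identities behind the assertions above; the bit positions
    // for NNP/NNPS and the 40-tag universe are invented for illustration.
    class PosMaskIdentityDemo {
        public static void main(String[] args) {
            long nnp = 1L << 3, nnps = 1L << 4;
            long universe = (1L << 40) - 1;        // "*": every tag bit set
            long mask = nnp | nnps;                // "(NNP NNPS)"
            long complement = universe & ~mask;    // "!(NNP NNPS)"

            System.out.println((mask & complement) == 0);        // true: disjoint
            System.out.println((universe ^ complement) == mask); // true: XOR recovers the mask
        }
    }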
@@ -0,0 +1,41 @@
package nu.marginalia.language.encoding;

import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertSame;

class UnicodeNormalizationTest {

    UnicodeNormalization unicodeNormalization = new UnicodeNormalization.FlattenAllLatin();

    @Test
    void flattenUnicodePlainAscii() {
        String s = "abc";

        // If the string is ascii, we don't want to allocate a copy
        assertSame(s, unicodeNormalization.flattenUnicode(s));
    }

    @Test
    void flattenUnicode() {
        String s = "Stülpnagelstraße";

        assertEquals("Stulpnagelstrasse", unicodeNormalization.flattenUnicode(s));
    }

    @Test
    void flattenUnicode2() {
        String s = "Koncevičius";

        assertEquals("Koncevicius", unicodeNormalization.flattenUnicode(s));
    }

    @Test
    void omitNonFlattenable() {
        String s = "[アグレッシブ烈子]";

        assertEquals("[]", unicodeNormalization.flattenUnicode(s));
    }
}
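The expected outputs above (u for ü, ss for ß, c for č, CJK dropped, and the very same instance returned for pure ASCII) can be approximated with java.text.Normalizer. A sketch of one way to get that behavior, assuming NFKD decomposition plus an ASCII filter; this is not the actual FlattenAllLatin implementation:

    import java.text.Normalizer;

    class FlattenSketch {
        static String flattenUnicode(String s) {
            // Pure ASCII input: return the same instance (the first test
            // above asserts identity, i.e. no copy is allocated).
            if (s.chars().allMatch(c -> c < 128))
                return s;

            // ß does not decompose to "ss" under NFKD, so substitute it first.
            String decomposed = Normalizer.normalize(s.replace("ß", "ss"), Normalizer.Form.NFKD);

            StringBuilder sb = new StringBuilder(s.length());
            for (int i = 0; i < decomposed.length(); i++) {
                char c = decomposed.charAt(i);
                // Keep ASCII; this drops combining marks and anything that
                // did not decompose to Latin, e.g. katakana and kanji.
                if (c < 128)
                    sb.append(c);
            }
            return sb.toString();
        }
    }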
@@ -0,0 +1,280 @@
package nu.marginalia.language.pos;

import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordSpan;
import nu.marginalia.language.sentence.tag.HtmlTag;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

import java.util.ArrayList;
import java.util.BitSet;
import java.util.EnumSet;
import java.util.List;

class PosPatternTest {
    final List<String> allTags = List.of("A1", "B1", "C1");
    final PosTagger posTagger = new PosTagger("en", allTags);

    @Test
    void matchSentence__singleTermPattern() {
        PosPattern pattern = new PosPattern(posTagger, "A1");
        List<WordSpan> ret = new ArrayList<>();

        DocumentSentence sentence = createSentenceForPattern("A1", "A1", "A1");

        int returnCount = pattern.matchSentence(sentence, ret);

        List<WordSpan> expected = List.of(
                new WordSpan(0, 1),
                new WordSpan(1, 2),
                new WordSpan(2, 3)
        );

        System.out.println(ret);
        System.out.println(expected);

        Assertions.assertEquals(expected, ret);
        Assertions.assertEquals(ret.size(), returnCount);
    }

    @Test
    void matchSentence__singleTermPattern_comma() {
        PosPattern pattern = new PosPattern(posTagger, "A1");
        List<WordSpan> ret = new ArrayList<>();

        DocumentSentence sentence = createSentenceForPattern(
                new String[] {"A1", "A1", "A1"},
                new boolean[] { true, false, true}
        );

        int returnCount = pattern.matchSentence(sentence, ret);

        List<WordSpan> expected = List.of(
                new WordSpan(0, 1),
                new WordSpan(1, 2),
                new WordSpan(2, 3)
        );

        System.out.println(ret);
        System.out.println(expected);

        Assertions.assertEquals(expected, ret);
        Assertions.assertEquals(ret.size(), returnCount);
    }

    @Test
    void matchSentence__threeTermPattern() {
        PosPattern pattern = new PosPattern(posTagger, "A1 B1 C1");
        List<WordSpan> ret = new ArrayList<>();

        DocumentSentence sentence = createSentenceForPattern(
                new String[] {"A1", "B1", "C1", "A1", "B1", "C1"},
                new boolean[] { false, false, true, false, false, true }
        );

        int returnCount = pattern.matchSentence(sentence, ret);

        List<WordSpan> expected = List.of(
                new WordSpan(0, 3),
                new WordSpan(3, 6)
        );

        System.out.println(ret);
        System.out.println(expected);

        Assertions.assertEquals(expected, ret);
        Assertions.assertEquals(ret.size(), returnCount);
    }

    @Test
    void matchSentence__threeTermPattern_mismatch() {
        PosPattern pattern = new PosPattern(posTagger, "A1 B1 C1");
        List<WordSpan> ret = new ArrayList<>();

        DocumentSentence sentence = createSentenceForPattern(
                new String[] {"A1", "B1", "A1", "C1", "A1", "C1"},
                new boolean[] { false, false, true, false, false, true }
        );

        int returnCount = pattern.matchSentence(sentence, ret);

        List<WordSpan> expected = List.of();

        System.out.println(ret);
        System.out.println(expected);

        Assertions.assertEquals(expected, ret);
        Assertions.assertEquals(ret.size(), returnCount);
    }

    @Test
    void matchSentence__threeTermPattern_overlap() {
        PosPattern pattern = new PosPattern(posTagger, "A1 A1 A1");
        List<WordSpan> ret = new ArrayList<>();

        DocumentSentence sentence = createSentenceForPattern(
                new String[] {"A1", "A1", "A1", "A1"},
                new boolean[] { false, false, false, true }
        );

        int returnCount = pattern.matchSentence(sentence, ret);

        List<WordSpan> expected = List.of(
                new WordSpan(0, 3),
                new WordSpan(1, 4)
        );

        System.out.println(ret);
        System.out.println(expected);

        Assertions.assertEquals(expected, ret);
        Assertions.assertEquals(ret.size(), returnCount);
    }

    @Test
    void matchSentence__threeTermPattern_comma() {
        PosPattern pattern = new PosPattern(posTagger, "A1 B1 C1");
        List<WordSpan> ret = new ArrayList<>();

        DocumentSentence sentence = createSentenceForPattern(
                new String[] {"A1", "B1", "C1", "A1", "B1", "C1", "A1", "B1", "C1"},
                new boolean[] { true, false, false, false, true, false, false, false, true }
        );

        int returnCount = pattern.matchSentence(sentence, ret);

        List<WordSpan> expected = List.of(
                new WordSpan(6, 9)
        );

        System.out.println(ret);
        System.out.println(expected);

        Assertions.assertEquals(expected, ret);
        Assertions.assertEquals(ret.size(), returnCount);
    }

    @Test
    void isMatch__singleTermPattern() {
        PosPattern pattern = new PosPattern(posTagger, "A1");

        DocumentSentence sentence = createSentenceForPattern("A1", "B1", "A1");

        Assertions.assertTrue(pattern.isMatch(sentence, 0));
        Assertions.assertFalse(pattern.isMatch(sentence, 1));
        Assertions.assertTrue(pattern.isMatch(sentence, 2));
    }

    @Test
    void isMatch__threeTermPattern() {
        PosPattern pattern = new PosPattern(posTagger, "A1 B1 C1");

        DocumentSentence sentence = createSentenceForPattern("A1", "B1", "A1", "B1", "C1");

        Assertions.assertFalse(pattern.isMatch(sentence, 0));
        Assertions.assertFalse(pattern.isMatch(sentence, 1));
        Assertions.assertTrue(pattern.isMatch(sentence, 2));
        Assertions.assertFalse(pattern.isMatch(sentence, 3));
        Assertions.assertFalse(pattern.isMatch(sentence, 4));
        Assertions.assertFalse(pattern.isMatch(sentence, 5));
    }

    @Test
    void isMatch__threeTermPattern_comma() {
        PosPattern pattern = new PosPattern(posTagger, "A1 B1 C1");

        DocumentSentence sentence = createSentenceForPattern(
                new String[] { "A1", "B1", "C1", "A1", "B1", "C1", "A1", "B1", "C1" },
                new boolean[] { true, false, false, false, true, false, false, false, true }
        );

        Assertions.assertFalse(pattern.isMatch(sentence, 0));
        Assertions.assertFalse(pattern.isMatch(sentence, 1));
        Assertions.assertFalse(pattern.isMatch(sentence, 2));
        Assertions.assertFalse(pattern.isMatch(sentence, 3));
        Assertions.assertFalse(pattern.isMatch(sentence, 4));
        Assertions.assertFalse(pattern.isMatch(sentence, 5));
        Assertions.assertTrue(pattern.isMatch(sentence, 6));
        Assertions.assertFalse(pattern.isMatch(sentence, 7));
        Assertions.assertFalse(pattern.isMatch(sentence, 8));
        Assertions.assertFalse(pattern.isMatch(sentence, 9));
    }

    @Test
    void matchTagPattern__singleTerm() {
        PosPattern pattern = new PosPattern(posTagger, "A1");
        PosPattern matchPattern = new PosPattern(posTagger, "A1 B1 A1");

        Assertions.assertEquals(bitSet(true, false, true), pattern.matchTagPattern(matchPattern.toArray()));
    }

    @Test
    void matchTagPattern__threeTerms() {
        PosPattern pattern = new PosPattern(posTagger, "A1 B1 C1");
        PosPattern matchPattern = new PosPattern(posTagger, "A1 B1 A1 B1 C1 A1 B1 C1");

        Assertions.assertEquals(bitSet(false, false, true, false, false, true, false, false), pattern.matchTagPattern(matchPattern.toArray()));
    }

    DocumentSentence createSentenceForPattern(String... tags) {
        BitSet allSet = new BitSet(tags.length);
        allSet.set(0, tags.length);

        long[] encodedTags = new long[tags.length];
        for (int i = 0; i < tags.length; i++) {
            encodedTags[i] = posTagger.encodeTagName(tags[i]);
        }

        return new DocumentSentence(
                allSet,
                tags,
                encodedTags,
                tags,
                EnumSet.noneOf(HtmlTag.class),
                new BitSet(tags.length),
                new BitSet(tags.length),
                allSet
        );
    }

    DocumentSentence createSentenceForPattern(String[] tags, boolean[] commas) {
        BitSet allSet = new BitSet(tags.length);
        allSet.set(0, tags.length);

        BitSet commaSet = new BitSet(tags.length);
        for (int i = 0; i < commas.length; i++) {
            if (!commas[i]) commaSet.set(i);
        }

        long[] encodedTags = new long[tags.length];
        for (int i = 0; i < tags.length; i++) {
            encodedTags[i] = posTagger.encodeTagName(tags[i]);
        }

        return new DocumentSentence(
                commaSet,
                tags,
                encodedTags,
                tags,
                EnumSet.noneOf(HtmlTag.class),
                new BitSet(tags.length),
                new BitSet(tags.length),
                allSet
        );
    }

    BitSet bitSet(boolean... bits) {
        BitSet ret = new BitSet(bits.length);
        for (int i = 0; i < bits.length; i++) {
            if (bits[i])
                ret.set(i);
        }
        return ret;
    }
}
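The tests above fix matchSentence's contract: every window whose tags match the pattern yields a WordSpan, overlapping windows are all reported, and a match is rejected when a clause boundary falls inside the window anywhere but at its last token (note that the two-argument helper sets the BitSet at non-comma positions). A simplified reference matcher consistent with those expectations; the real PosPattern presumably differs in representation:

    // Simplified sliding-window matcher matching the tests' expectations.
    // WordSpan is the (start, end) type from the diff; 'separators' uses
    // the opposite convention from the helper above: set bit = comma here.
    static int matchSentenceSketch(long[] patternMasks, long[] sentenceTags,
                                   BitSet separators, List<WordSpan> out) {
        int n = sentenceTags.length, m = patternMasks.length;
        for (int start = 0; start + m <= n; start++) {
            boolean ok = true;
            for (int i = 0; i < m && ok; i++) {
                // the tag at this offset must be accepted by the pattern mask
                if ((patternMasks[i] & sentenceTags[start + i]) == 0)
                    ok = false;
                // a boundary inside the window (not at its last token) breaks it
                else if (i < m - 1 && separators.get(start + i))
                    ok = false;
            }
            if (ok)
                out.add(new WordSpan(start, start + m));
        }
        return out.size();
    }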
Some files were not shown because too many files have changed in this diff.