mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
Compare commits
112 Commits
d6cfbceeea
...
master
Author | SHA1 | Date | |
---|---|---|---|
|
f1a71e9033 | ||
|
7b525918c9 | ||
|
0f3aede66f | ||
|
88236f3836 | ||
|
ad31a22fbb | ||
|
2785ae8241 | ||
|
1ed1f2f299 | ||
|
b7d3b67a1d | ||
|
d28010b7e6 | ||
|
2689bd9eaa | ||
|
f6d5d7f196 | ||
|
abf1186fa7 | ||
|
94a77ebddf | ||
|
4e2f76a477 | ||
|
4cd1834938 | ||
|
5cbbea67ed | ||
|
b688f15550 | ||
|
f55af8ef48 | ||
|
adc815e282 | ||
|
ca8455e049 | ||
|
4ea724d2cb | ||
|
40600e7297 | ||
|
7795742538 | ||
|
82d33ce69b | ||
|
e49cc5c244 | ||
|
0af389ad93 | ||
|
48791f56bd | ||
|
be83726427 | ||
|
708caa8791 | ||
|
32394f42b9 | ||
|
b8e3445ce0 | ||
|
17a78a7b7e | ||
|
5a75dd8093 | ||
|
a9713347a0 | ||
|
4694d36ed2 | ||
|
70bdd1f51e | ||
|
187b4828e6 | ||
|
93fc14dc94 | ||
|
fbfea8539b | ||
|
0929d77247 | ||
|
db8f8c1f55 | ||
|
dcb2723386 | ||
|
00c1f495f6 | ||
|
73a923983a | ||
|
e9ed0c5669 | ||
|
5b2bec6144 | ||
|
f26bb8e2b1 | ||
|
4455495dc6 | ||
|
b84d17aa51 | ||
|
9d008390ae | ||
|
a40c2a8146 | ||
|
a3416bf48e | ||
|
ee2461d9fc | ||
|
54c91a84e3 | ||
|
a6371fc54c | ||
|
8faa9a572d | ||
|
fdce940263 | ||
|
af8a13a7fb | ||
|
9e332de6b4 | ||
|
d457bb5d44 | ||
|
c661ebb619 | ||
|
53e744398a | ||
|
1d71baf3e5 | ||
|
bb5fc0f348 | ||
|
c8f112d040 | ||
|
ae31bc8498 | ||
|
da5046c3bf | ||
|
f67257baf2 | ||
|
924fb05661 | ||
|
c231a82062 | ||
|
2c1082d7f0 | ||
|
06947bd026 | ||
|
519aebd7c6 | ||
|
42cc27586e | ||
|
360881fafd | ||
|
4c6fdf6ebe | ||
|
554de21f68 | ||
|
00194acbfe | ||
|
97dabcefaa | ||
|
cc790644d4 | ||
|
8f893ee6c0 | ||
|
938721b793 | ||
|
f68bcefc75 | ||
|
164a646af6 | ||
|
0cfd759f85 | ||
|
b53002200c | ||
|
78246b9a63 | ||
|
b552e79927 | ||
|
bffc159486 | ||
|
b8000721bd | ||
|
2ee0b0e420 | ||
|
1432fc87d7 | ||
|
ec5f32b1d8 | ||
|
edd453531e | ||
|
096496ada1 | ||
|
8ca6209260 | ||
|
673c65d3c9 | ||
|
acb9ec7b15 | ||
|
47079e05db | ||
|
c93056e77f | ||
|
6f7530e807 | ||
|
87ce4a1b52 | ||
|
52194cbe7a | ||
|
fd1ac03c78 | ||
|
5e5b86efb4 | ||
|
f332ec6191 | ||
|
c25c1af437 | ||
|
eb0c911b45 | ||
|
1979870ce4 | ||
|
0ba2ea38e1 | ||
|
b46f2e1407 | ||
|
18aa1b9764 |
9
.gitignore
vendored
9
.gitignore
vendored
@@ -7,4 +7,11 @@ build/
|
||||
lombok.config
|
||||
Dockerfile
|
||||
run
|
||||
jte-classes
|
||||
jte-classes
|
||||
.classpath
|
||||
.project
|
||||
.settings
|
||||
.factorypath
|
||||
bin/
|
||||
*.log
|
||||
*.hprof
|
||||
|
@@ -1,7 +1,7 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
id("org.jetbrains.gradle.plugin.idea-ext") version "1.0"
|
||||
id "me.champeau.jmh" version "0.6.6"
|
||||
id "me.champeau.jmh" version "0.7.3"
|
||||
|
||||
// This is a workaround for a bug in the Jib plugin that causes it to stall randomly
|
||||
// https://github.com/GoogleContainerTools/jib/issues/3347
|
||||
@@ -47,8 +47,8 @@ subprojects.forEach {it ->
|
||||
}
|
||||
|
||||
ext {
|
||||
jvmVersion = 24
|
||||
dockerImageBase='container-registry.oracle.com/graalvm/jdk:24'
|
||||
jvmVersion = 25
|
||||
dockerImageBase='container-registry.oracle.com/graalvm/jdk:25'
|
||||
dockerImageTag='latest'
|
||||
dockerImageRegistry='marginalia'
|
||||
jibVersion = '3.4.5'
|
||||
|
@@ -19,6 +19,7 @@ dependencies {
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.bundles.mariadb
|
||||
implementation libs.bundles.httpcomponents
|
||||
implementation libs.mockito
|
||||
implementation libs.guava
|
||||
implementation dependencies.create(libs.guice.get()) {
|
||||
|
@@ -114,4 +114,7 @@ public class WmsaHome {
|
||||
}
|
||||
|
||||
|
||||
public static Path getLangugeConfig() {
|
||||
return getHomePath().resolve("conf/languages.xml");
|
||||
}
|
||||
}
|
||||
|
@@ -0,0 +1,141 @@
|
||||
package nu.marginalia.proxy;

import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;

/**
 * Configuration for SOCKS proxy settings used by crawlers to distribute IP footprint.
 * <p>
 * Settings are read once from system properties at construction time:
 * <ul>
 *   <li>{@code crawler.socksProxy.enabled} — master switch, defaults to {@code false}</li>
 *   <li>{@code crawler.socksProxy.list} — comma-separated proxy specs, each
 *       {@code host:port} or {@code host:port:username:password}</li>
 *   <li>{@code crawler.socksProxy.strategy} — {@code ROUND_ROBIN} (default) or {@code RANDOM}</li>
 * </ul>
 */
public class SocksProxyConfiguration {

    private final boolean enabled;
    private final List<SocksProxy> proxies;
    private final ProxySelectionStrategy strategy;

    public SocksProxyConfiguration() {
        this.enabled = Boolean.parseBoolean(System.getProperty("crawler.socksProxy.enabled", "false"));
        this.strategy = parseStrategy(System.getProperty("crawler.socksProxy.strategy", "ROUND_ROBIN"));
        this.proxies = parseProxies();
    }

    /**
     * Parses the selection-strategy property, falling back to ROUND_ROBIN on an
     * unrecognized value instead of letting IllegalArgumentException abort construction.
     */
    private static ProxySelectionStrategy parseStrategy(String value) {
        try {
            return ProxySelectionStrategy.valueOf(value.trim());
        } catch (IllegalArgumentException e) {
            return ProxySelectionStrategy.ROUND_ROBIN;
        }
    }

    private List<SocksProxy> parseProxies() {
        String proxyList = System.getProperty("crawler.socksProxy.list", "");
        if (proxyList.isEmpty()) {
            return List.of();
        }

        // Best-effort parsing: malformed entries are skipped (parseProxy returns null)
        // rather than failing the whole configuration.
        return Arrays.stream(proxyList.split(","))
                .map(String::trim)
                .filter(s -> !s.isEmpty())
                .map(this::parseProxy)
                .filter(Objects::nonNull)
                .collect(Collectors.toList());
    }

    /**
     * Parses a single proxy spec.
     * <p>
     * Expected format: {@code "host:port"} or {@code "host:port:username:password"}.
     *
     * @return the parsed proxy, or {@code null} if the spec is malformed
     */
    private SocksProxy parseProxy(String proxyString) {
        try {
            // Limit the split to 4 fields so a password containing ':' survives intact.
            String[] parts = proxyString.split(":", 4);
            if (parts.length < 2) {
                return null;
            }

            String host = parts[0].trim();
            int port = Integer.parseInt(parts[1].trim());

            // Reject empty hosts and out-of-range ports up front.
            if (host.isEmpty() || port < 1 || port > 65535) {
                return null;
            }

            if (parts.length >= 4) {
                String username = parts[2];
                String password = parts[3];
                return new SocksProxy(host, port, username, password);
            } else {
                return new SocksProxy(host, port);
            }
        } catch (Exception e) {
            // Best-effort: an unparseable entry is dropped silently.
            return null;
        }
    }

    /** True only when the feature flag is set AND at least one proxy parsed successfully. */
    public boolean isEnabled() {
        return enabled && !proxies.isEmpty();
    }

    public List<SocksProxy> getProxies() {
        return proxies;
    }

    public ProxySelectionStrategy getStrategy() {
        return strategy;
    }

    /** How a proxy is chosen from the configured list for each request. */
    public enum ProxySelectionStrategy {
        ROUND_ROBIN,
        RANDOM
    }

    /** Immutable value object describing one SOCKS proxy endpoint with optional credentials. */
    public static class SocksProxy {
        private final String host;
        private final int port;
        private final String username;
        private final String password;

        public SocksProxy(String host, int port) {
            this(host, port, null, null);
        }

        public SocksProxy(String host, int port, String username, String password) {
            this.host = host;
            this.port = port;
            this.username = username;
            this.password = password;
        }

        public String getHost() {
            return host;
        }

        public int getPort() {
            return port;
        }

        public String getUsername() {
            return username;
        }

        public String getPassword() {
            return password;
        }

        /** True when both a username and password were supplied. */
        public boolean hasAuthentication() {
            return username != null && password != null;
        }

        @Override
        public String toString() {
            // Deliberately omits the password from the rendered form.
            if (hasAuthentication()) {
                return String.format("%s:%d (auth: %s)", host, port, username);
            } else {
                return String.format("%s:%d", host, port);
            }
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if (o == null || getClass() != o.getClass()) return false;
            SocksProxy that = (SocksProxy) o;
            return port == that.port &&
                    Objects.equals(host, that.host) &&
                    Objects.equals(username, that.username) &&
                    Objects.equals(password, that.password);
        }

        @Override
        public int hashCode() {
            return Objects.hash(host, port, username, password);
        }
    }
}
|
@@ -0,0 +1,79 @@
|
||||
package nu.marginalia.proxy;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nonnull;
import java.util.List;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Manages SOCKS proxy selection and rotation for crawler requests.
 */
public class SocksProxyManager {
    private static final Logger logger = LoggerFactory.getLogger(SocksProxyManager.class);

    private final SocksProxyConfiguration config;

    // Monotonically increasing counter for ROUND_ROBIN selection.  It may wrap past
    // Integer.MAX_VALUE and go negative, which is why selection uses Math.floorMod.
    private final AtomicInteger roundRobinIndex = new AtomicInteger(0);

    public SocksProxyManager(SocksProxyConfiguration config) {
        this.config = config;

        if (config.isEnabled()) {
            logger.info("SOCKS proxy support enabled with {} proxies using {} strategy",
                    config.getProxies().size(), config.getStrategy());
            for (SocksProxyConfiguration.SocksProxy proxy : config.getProxies()) {
                logger.info(" - {}", proxy);
            }
        } else {
            logger.info("SOCKS proxy support disabled");
        }
    }

    /**
     * Selects the next proxy to use based on the configured strategy.
     *
     * @throws IllegalStateException if proxy support is disabled or no proxies are configured
     */
    @Nonnull
    public SocksProxyConfiguration.SocksProxy selectProxy() {
        if (!config.isEnabled()) {
            throw new IllegalStateException("Proxies not configured");
        }

        List<SocksProxyConfiguration.SocksProxy> proxies = config.getProxies();
        if (proxies.isEmpty()) {
            throw new IllegalStateException("Proxies not configured");
        }

        // Exhaustive over the two-constant enum; adding a constant later becomes
        // a compile error here instead of silently falling back to proxies.get(0).
        return switch (config.getStrategy()) {
            // floorMod keeps the index non-negative even after the counter
            // overflows Integer.MAX_VALUE (plain % would yield a negative index
            // and an IndexOutOfBoundsException).
            case ROUND_ROBIN -> proxies.get(Math.floorMod(roundRobinIndex.getAndIncrement(), proxies.size()));
            case RANDOM -> proxies.get(ThreadLocalRandom.current().nextInt(proxies.size()));
        };
    }

    /**
     * Gets the current proxy configuration.
     */
    public SocksProxyConfiguration getConfiguration() {
        return config;
    }

    /**
     * Checks if proxy support is enabled and proxies are available.
     */
    public boolean isProxyEnabled() {
        return config.isEnabled() && !config.getProxies().isEmpty();
    }
}
|
@@ -16,7 +16,7 @@
|
||||
<MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
<RollingFile name="LogToFileService" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<JSONLayout compact="true" eventEol="true" properties="true" stacktraceAsString="true" includeTimeMillis="true"/>
|
||||
<Filters>
|
||||
@@ -28,7 +28,7 @@
|
||||
</Filters>
|
||||
<SizeBasedTriggeringPolicy size="10MB" />
|
||||
</RollingFile>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
<RollingFile name="LogToFileCrawler" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<PatternLayout>
|
||||
<Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
|
||||
@@ -38,7 +38,7 @@
|
||||
<MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</RollingFile>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/converter-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/converter-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
<RollingFile name="LogToFileConverter" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/converter-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/converter-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<PatternLayout>
|
||||
<Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
|
||||
@@ -56,7 +56,9 @@
|
||||
<Root level="info">
|
||||
<AppenderRef ref="Console"/>
|
||||
<AppenderRef ref="ProcessConsole"/>
|
||||
<AppenderRef ref="LogToFile"/>
|
||||
<AppenderRef ref="LogToFileService"/>
|
||||
<AppenderRef ref="LogToFileCrawler"/>
|
||||
<AppenderRef ref="LogToFileConverter"/>
|
||||
</Root>
|
||||
</Loggers>
|
||||
</Configuration>
|
@@ -50,7 +50,7 @@
|
||||
<MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
<RollingFile name="LogToFileService" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<PatternLayout>
|
||||
<Pattern>%-5level %d{yyyy-MM-dd HH:mm:ss,SSS} %-20t %-20c{1}: %msg{nolookups}%n</Pattern>
|
||||
@@ -64,7 +64,7 @@
|
||||
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</RollingFile>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
<RollingFile name="LogToFileCrawler" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<PatternLayout>
|
||||
<Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
|
||||
@@ -74,7 +74,7 @@
|
||||
<MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</RollingFile>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/converter-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/converter-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
<RollingFile name="LogToFileConverter" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/converter-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/converter-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<PatternLayout>
|
||||
<Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
|
||||
@@ -95,7 +95,9 @@
|
||||
<AppenderRef ref="ConsoleError"/>
|
||||
<AppenderRef ref="ConsoleFatal"/>
|
||||
<AppenderRef ref="ProcessConsole"/>
|
||||
<AppenderRef ref="LogToFile"/>
|
||||
<AppenderRef ref="LogToFileService"/>
|
||||
<AppenderRef ref="LogToFileConverter"/>
|
||||
<AppenderRef ref="LogToFileCrawler"/>
|
||||
</Root>
|
||||
</Loggers>
|
||||
</Configuration>
|
@@ -1,13 +1,12 @@
|
||||
package nu.marginalia.model;
|
||||
|
||||
import javax.annotation.Nonnull;
|
||||
import java.io.Serializable;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class EdgeDomain implements Serializable {
|
||||
public class EdgeDomain {
|
||||
|
||||
@Nonnull
|
||||
public final String subDomain;
|
||||
|
@@ -4,13 +4,12 @@ import nu.marginalia.util.QueryParams;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.io.Serializable;
|
||||
import java.net.*;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
public class EdgeUrl implements Serializable {
|
||||
public class EdgeUrl {
|
||||
public final String proto;
|
||||
public final EdgeDomain domain;
|
||||
public final Integer port;
|
||||
|
@@ -95,16 +95,24 @@ public enum HtmlFeature {
|
||||
public static int encode(Collection<HtmlFeature> featuresAll) {
|
||||
int ret = 0;
|
||||
for (var feature : featuresAll) {
|
||||
if (feature.ordinal() >= 32) continue;
|
||||
|
||||
ret |= (1 << (feature.ordinal()));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
public static boolean hasFeature(int value, HtmlFeature feature) {
|
||||
return (value & (1<< feature.ordinal())) != 0;
|
||||
int ord = feature.ordinal();
|
||||
if (ord >= 32) return false;
|
||||
|
||||
return (value & (1<<ord)) != 0;
|
||||
}
|
||||
|
||||
public int getFeatureBit() {
|
||||
return (1<< ordinal());
|
||||
int ord = ordinal();
|
||||
if (ord >= 32) return 0;
|
||||
|
||||
return (1<<ord);
|
||||
}
|
||||
}
|
||||
|
@@ -2,7 +2,6 @@ package nu.marginalia.model.idx;
|
||||
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.EnumSet;
|
||||
import java.util.Set;
|
||||
|
||||
@@ -28,7 +27,6 @@ public record DocumentMetadata(int avgSentLength,
|
||||
int sets,
|
||||
int quality,
|
||||
byte flags)
|
||||
implements Serializable
|
||||
{
|
||||
|
||||
public String toString() {
|
||||
|
@@ -66,7 +66,7 @@ public class NodeStatusWatcher {
|
||||
fileStorageService.createStorageBase("Crawl Data", Path.of("/storage"), nodeId, FileStorageBaseType.STORAGE);
|
||||
fileStorageService.createStorageBase("Work Area", Path.of("/work"), nodeId, FileStorageBaseType.WORK);
|
||||
|
||||
persistence.sendNewMessage("executor-service:"+nodeId,
|
||||
persistence.sendNewMessage("index-service:"+nodeId,
|
||||
null,
|
||||
null,
|
||||
"FIRST-BOOT",
|
||||
|
@@ -22,7 +22,6 @@ dependencies {
|
||||
implementation project(':code:processes:ping-process')
|
||||
implementation project(':code:processes:new-domain-process')
|
||||
implementation project(':code:processes:converting-process')
|
||||
implementation project(':code:processes:index-constructor-process')
|
||||
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:model')
|
||||
@@ -34,7 +33,7 @@ dependencies {
|
||||
implementation project(':third-party:commons-codec')
|
||||
|
||||
implementation project(':code:libraries:message-queue')
|
||||
implementation project(':code:libraries:term-frequency-dict')
|
||||
implementation project(':code:functions:language-processing')
|
||||
|
||||
implementation project(':code:functions:link-graph:api')
|
||||
implementation project(':code:functions:live-capture:api')
|
||||
|
@@ -2,9 +2,8 @@ package nu.marginalia.execution;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
|
||||
import nu.marginalia.actor.ExecutorActorControlService;
|
||||
import nu.marginalia.actor.ExecutorActor;
|
||||
import nu.marginalia.actor.ExecutorActorControlService;
|
||||
|
||||
@Singleton
|
||||
public class ExecutionInit {
|
||||
@@ -22,5 +21,8 @@ public class ExecutionInit {
|
||||
actorControlService.start(ExecutorActor.PROC_CRAWLER_SPAWNER);
|
||||
actorControlService.start(ExecutorActor.PROC_INDEX_CONSTRUCTOR_SPAWNER);
|
||||
actorControlService.start(ExecutorActor.PROC_LOADER_SPAWNER);
|
||||
actorControlService.start(ExecutorActor.PROC_EXPORT_TASKS_SPAWNER);
|
||||
actorControlService.stop(ExecutorActor.PROC_NDP_SPAWNER);
|
||||
actorControlService.stop(ExecutorActor.PROC_PING_SPAWNER);
|
||||
}
|
||||
}
|
||||
|
@@ -5,7 +5,6 @@ import com.google.inject.Singleton;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.converting.ConverterMain;
|
||||
import nu.marginalia.crawl.CrawlerMain;
|
||||
import nu.marginalia.index.IndexConstructorMain;
|
||||
import nu.marginalia.livecrawler.LiveCrawlerMain;
|
||||
import nu.marginalia.loading.LoaderMain;
|
||||
import nu.marginalia.ndp.NdpMain;
|
||||
@@ -57,7 +56,7 @@ public class ProcessSpawnerService {
|
||||
LIVE_CRAWLER(LiveCrawlerMain.class),
|
||||
CONVERTER(ConverterMain.class),
|
||||
LOADER(LoaderMain.class),
|
||||
INDEX_CONSTRUCTOR(IndexConstructorMain.class),
|
||||
INDEX_CONSTRUCTOR("nu.marginalia.index.IndexConstructorMain"),
|
||||
NDP(NdpMain.class),
|
||||
EXPORT_TASKS(ExportTasksMain.class),
|
||||
;
|
||||
@@ -66,6 +65,9 @@ public class ProcessSpawnerService {
|
||||
ProcessId(Class<? extends ProcessMainClass> mainClass) {
|
||||
this.mainClass = mainClass.getName();
|
||||
}
|
||||
ProcessId(String mainClassFullName) {
|
||||
this.mainClass = mainClassFullName;
|
||||
}
|
||||
|
||||
List<String> envOpts() {
|
||||
String variable = switch (this) {
|
||||
@@ -118,6 +120,17 @@ public class ProcessSpawnerService {
|
||||
args.add("-Dsystem.serviceNode=" + System.getProperty("system.serviceNode"));
|
||||
}
|
||||
|
||||
// Add SOCKS proxy properties for crawler processes
|
||||
if (System.getProperty("crawler.socksProxy.enabled") != null) {
|
||||
args.add("-Dcrawler.socksProxy.enabled=" + System.getProperty("crawler.socksProxy.enabled"));
|
||||
}
|
||||
if (System.getProperty("crawler.socksProxy.list") != null) {
|
||||
args.add("-Dcrawler.socksProxy.list=" + System.getProperty("crawler.socksProxy.list"));
|
||||
}
|
||||
if (System.getProperty("crawler.socksProxy.strategy") != null) {
|
||||
args.add("-Dcrawler.socksProxy.strategy=" + System.getProperty("crawler.socksProxy.strategy"));
|
||||
}
|
||||
|
||||
if (Boolean.getBoolean("system.profile")) {
|
||||
// add jfr options
|
||||
args.add("-XX:+FlightRecorder");
|
||||
|
@@ -5,6 +5,7 @@ import com.github.luben.zstd.ZstdOutputStream;
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.IndexLocations;
|
||||
import nu.marginalia.index.journal.IndexJournal;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.linkdb.LinkdbFileNames;
|
||||
import nu.marginalia.service.control.ServiceHeartbeat;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
@@ -13,18 +14,18 @@ import nu.marginalia.storage.model.FileStorageType;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.SQLException;
|
||||
import java.time.LocalDateTime;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.Map;
|
||||
|
||||
public class BackupService {
|
||||
|
||||
private final FileStorageService storageService;
|
||||
private final LanguageConfiguration languageConfiguration;
|
||||
private final ServiceHeartbeat serviceHeartbeat;
|
||||
|
||||
public enum BackupHeartbeatSteps {
|
||||
@@ -36,8 +37,10 @@ public class BackupService {
|
||||
|
||||
@Inject
|
||||
public BackupService(FileStorageService storageService,
|
||||
LanguageConfiguration languageConfiguration,
|
||||
ServiceHeartbeat serviceHeartbeat) {
|
||||
this.storageService = storageService;
|
||||
this.languageConfiguration = languageConfiguration;
|
||||
this.serviceHeartbeat = serviceHeartbeat;
|
||||
}
|
||||
|
||||
@@ -98,22 +101,25 @@ public class BackupService {
|
||||
}
|
||||
|
||||
|
||||
private void backupJournal(Path inputStorage, Path backupStorage) throws IOException
|
||||
{
|
||||
Optional<IndexJournal> journal = IndexJournal.findJournal(inputStorage);
|
||||
if (journal.isEmpty()) {
|
||||
throw new FileNotFoundException("No journal found in input storage");
|
||||
private void backupJournal(Path inputStorage, Path backupStorage) throws IOException {
|
||||
Map<String, IndexJournal> journals = IndexJournal.findJournals(inputStorage, languageConfiguration.languages());
|
||||
for (IndexJournal journal : journals.values()) {
|
||||
FileUtils.copyDirectory(journal.journalDir().toFile(), backupStorage.resolve(journal.journalDir().getFileName()).toFile());
|
||||
}
|
||||
|
||||
FileUtils.copyDirectory(journal.get().journalDir().toFile(), backupStorage.resolve(journal.get().journalDir().getFileName()).toFile());
|
||||
}
|
||||
|
||||
private void restoreJournal(Path destStorage, Path backupStorage) throws IOException {
|
||||
Optional<IndexJournal> journal = IndexJournal.findJournal(backupStorage);
|
||||
if (journal.isEmpty()) {
|
||||
throw new FileNotFoundException("No journal found in backup");
|
||||
Map<String, IndexJournal> journals = IndexJournal.findJournals(backupStorage, languageConfiguration.languages());
|
||||
for (IndexJournal journal : journals.values()) {
|
||||
var journalFileName = journal.journalDir().getFileName();
|
||||
|
||||
// Ensure we delete any previous journal junk
|
||||
if (Files.exists(destStorage.resolve(journalFileName))) {
|
||||
FileUtils.deleteDirectory(destStorage.resolve(journalFileName).toFile());
|
||||
}
|
||||
|
||||
FileUtils.copyDirectory(backupStorage.resolve(journalFileName).toFile(), destStorage.toFile());
|
||||
}
|
||||
FileUtils.copyDirectory(backupStorage.resolve(journal.get().journalDir().getFileName()).toFile(), destStorage.toFile());
|
||||
}
|
||||
|
||||
private void backupFileCompressed(String fileName, Path inputStorage, Path backupStorage) throws IOException
|
||||
|
@@ -2,6 +2,7 @@ plugins {
|
||||
id 'java'
|
||||
id 'jvm-test-suite'
|
||||
id 'gg.jte.gradle' version '3.1.15'
|
||||
id 'application'
|
||||
}
|
||||
|
||||
java {
|
||||
@@ -9,6 +10,10 @@ java {
|
||||
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||
}
|
||||
}
|
||||
application {
|
||||
mainClass = 'nu.marginalia.language.LanguageProcessingTool'
|
||||
applicationName = 'language-processing-tool'
|
||||
}
|
||||
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
@@ -18,10 +23,10 @@ dependencies {
|
||||
implementation project(':third-party:rdrpostagger')
|
||||
implementation project(':third-party:porterstemmer')
|
||||
implementation project(':third-party:commons-codec')
|
||||
implementation project(':third-party:openzim')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:libraries:easy-lsh')
|
||||
implementation project(':code:libraries:term-frequency-dict')
|
||||
implementation project(':code:libraries:coded-sequence')
|
||||
implementation libs.notnull
|
||||
implementation libs.bundles.jooby
|
@@ -163,11 +163,15 @@ public class DocumentPositionMapper {
|
||||
|
||||
int i = 0;
|
||||
|
||||
for (int run = 0; run < 15 && i < s.length(); run++, i++) {
|
||||
char c = s.charAt(i);
|
||||
if (c >= 'a' && c <= 'z') continue;
|
||||
if (c >= 'A' && c <= 'Z') continue;
|
||||
if (c >= '0' && c <= '9') continue;
|
||||
for (int run = 0; run < 15 && i < s.length(); run++) {
|
||||
int cp = s.codePointAt(i);
|
||||
|
||||
|
||||
if (Character.isAlphabetic(cp) || Character.isDigit(cp)) {
|
||||
i += Character.charCount(cp);
|
||||
continue;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -177,17 +181,20 @@ public class DocumentPositionMapper {
|
||||
for (int j = 0; j < 8; j++) {
|
||||
if (i == s.length()) return true;
|
||||
|
||||
if (wordPartSeparator.indexOf(s.charAt(i)) < 0) {
|
||||
if (wordPartSeparator.indexOf(s.codePointAt(i)) < 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
i++;
|
||||
|
||||
for (int run = 0; run < 10 && i < s.length(); run++, i++) {
|
||||
char c = s.charAt(i);
|
||||
if (c >= 'a' && c <= 'z') continue;
|
||||
if (c >= 'A' && c <= 'Z') continue;
|
||||
if (c >= '0' && c <= '9') continue;
|
||||
for (int run = 0; run < 10 && i < s.length(); run++) {
|
||||
int cp = s.codePointAt(i);
|
||||
|
||||
if (Character.isAlphabetic(cp) || Character.isDigit(cp)) {
|
||||
i += Character.charCount(cp);
|
||||
continue;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
@@ -6,15 +6,20 @@ import io.jooby.MapModelAndView;
|
||||
import io.jooby.ModelAndView;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
||||
import nu.marginalia.keyword.LinkTexts;
|
||||
import nu.marginalia.keyword.extractors.*;
|
||||
import nu.marginalia.language.config.LanguageConfigLocation;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.*;
|
||||
@@ -23,6 +28,7 @@ public class LanguageProcessingTool extends Jooby {
|
||||
private static final Logger logger = LoggerFactory.getLogger(LanguageProcessingTool.class);
|
||||
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
|
||||
private final TermFrequencyDict termFrequencyDict;
|
||||
|
||||
static void main(String[] args) {
|
||||
Jooby.runApp(args, LanguageProcessingTool::new);
|
||||
}
|
||||
@@ -33,10 +39,17 @@ public class LanguageProcessingTool extends Jooby {
|
||||
termFrequencyDict = new TermFrequencyDict(languageModels);
|
||||
|
||||
sentenceExtractorProvider = new ThreadLocalSentenceExtractorProvider(
|
||||
new LanguageConfiguration(languageModels),
|
||||
new LanguageConfiguration(languageModels, new LanguageConfigLocation.Experimental()),
|
||||
languageModels
|
||||
);
|
||||
Path basePath = Path.of("code/libraries/language-processing/").toAbsolutePath();
|
||||
|
||||
// Depending on how the tool is started, we may be in the project root, or the module root;
|
||||
// so here's some guesswork to try to suss out which one it is...
|
||||
Path basePath = Path.of("code/functions/language-processing/").toAbsolutePath();
|
||||
if (!Files.exists(basePath)) {
|
||||
basePath = Path.of(".").toAbsolutePath();
|
||||
}
|
||||
|
||||
System.out.println("Base path: " + basePath);
|
||||
|
||||
if (Files.exists(basePath.resolve("resources/ltt/jte")))
|
||||
@@ -55,7 +68,7 @@ public class LanguageProcessingTool extends Jooby {
|
||||
// Assign colors to the POS tags
|
||||
|
||||
@NotNull
|
||||
private ModelAndView<?> handleKeywords(Context context) {
|
||||
private ModelAndView<?> handleKeywords(Context context) throws URISyntaxException {
|
||||
if ("GET".equals(context.getMethod())) {
|
||||
return new MapModelAndView("keywords.jte")
|
||||
.put("textSample", "");
|
||||
@@ -65,28 +78,38 @@ public class LanguageProcessingTool extends Jooby {
|
||||
}
|
||||
|
||||
String textSample = context.form("textSample").value();
|
||||
DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(textSample);
|
||||
Map<Long, String> posStyles = posTagStyles(dld);
|
||||
|
||||
// Run sentende extration on the text as-is
|
||||
DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(textSample);
|
||||
|
||||
// Run individual extraction logic
|
||||
var tfIdfCounts = new WordsTfIdfCounts(termFrequencyDict, dld);
|
||||
var titleKeywords = new TitleKeywords(dld);
|
||||
var nameLikeKeywords = new NameLikeKeywords(dld, 2);
|
||||
var subjectLikeKeywords = new SubjectLikeKeywords(tfIdfCounts, dld);
|
||||
var artifactKeywords = new ArtifactKeywords(dld);
|
||||
// var urlKeywords = new UrlKeywords(url);
|
||||
|
||||
// Run full extraction logic to capture positioning etc
|
||||
var extractedKeywords = new DocumentKeywordExtractor(termFrequencyDict)
|
||||
.extractKeywords(dld, new LinkTexts(), new EdgeUrl("https://www.example.com/"));
|
||||
|
||||
return new MapModelAndView("keywords.jte")
|
||||
.put("textSample", textSample)
|
||||
.put("language", dld.language())
|
||||
.put("tagColors", posStyles)
|
||||
.put("tagColors", posTagStyles(dld))
|
||||
.put("sentences", dld.sentences())
|
||||
.put("tfIdfReps", tfIdfCounts.getReps())
|
||||
.put("titleReps", titleKeywords.getReps())
|
||||
.put("nameLikeReps", nameLikeKeywords.getReps())
|
||||
.put("subjectLikeReps", subjectLikeKeywords.getReps())
|
||||
.put("artifacts", artifactKeywords.getWords());
|
||||
.put("artifacts", artifactKeywords.getWords())
|
||||
.put("importantWords", extractedKeywords.importantWords)
|
||||
.put("positionedWords", extractedKeywords.wordToPos);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate unique colors for each POS tag, to help the UI rendering
|
||||
*/
|
||||
public static Map<Long, String> posTagStyles(DocumentLanguageData dld) {
|
||||
Map<Long, String> styles = new HashMap<>();
|
||||
|
@@ -0,0 +1,43 @@
|
||||
package nu.marginalia.language.config;
|
||||
|
||||
import nu.marginalia.WmsaHome;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
|
||||
sealed public interface LanguageConfigLocation {
|
||||
InputStream findLanguageConfiguration() throws IOException;
|
||||
|
||||
final class Auto implements LanguageConfigLocation {
|
||||
@Override
|
||||
public InputStream findLanguageConfiguration() throws IOException {
|
||||
Path filesystemPath = WmsaHome.getLangugeConfig();
|
||||
if (Files.exists(filesystemPath)) {
|
||||
return Files.newInputStream(filesystemPath, StandardOpenOption.READ);
|
||||
}
|
||||
if (Boolean.getBoolean("language.experimental")) {
|
||||
return ClassLoader.getSystemResourceAsStream("languages-experimental.xml");
|
||||
} else {
|
||||
return ClassLoader.getSystemResourceAsStream("languages-default.xml");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final class Experimental implements LanguageConfigLocation {
|
||||
@Override
|
||||
public InputStream findLanguageConfiguration() throws IOException {
|
||||
return ClassLoader.getSystemResourceAsStream("languages-experimental.xml");
|
||||
}
|
||||
}
|
||||
|
||||
final class Default implements LanguageConfigLocation {
|
||||
|
||||
@Override
|
||||
public InputStream findLanguageConfiguration() throws IOException {
|
||||
return ClassLoader.getSystemResourceAsStream("languages-default.xml");
|
||||
}
|
||||
}
|
||||
}
|
@@ -2,8 +2,10 @@ package nu.marginalia.language.config;
|
||||
|
||||
import com.github.jfasttext.JFastText;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.language.encoding.UnicodeNormalization;
|
||||
import nu.marginalia.language.keywords.KeywordHasher;
|
||||
import nu.marginalia.language.model.LanguageDefinition;
|
||||
import nu.marginalia.language.pos.PosPattern;
|
||||
@@ -38,11 +40,12 @@ import java.security.MessageDigest;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.util.*;
|
||||
|
||||
@Singleton
|
||||
public class LanguageConfiguration {
|
||||
private static final Logger logger = LoggerFactory.getLogger(LanguageConfiguration.class);
|
||||
|
||||
private final Map<String, Path> resources = new HashMap<>();
|
||||
private final Map<String, LanguageDefinition> languages = new HashMap<>();
|
||||
private final Map<String, LanguageDefinition> languages = new LinkedHashMap<>();
|
||||
private final JFastText fastTextLanguageModel = new JFastText();
|
||||
|
||||
public Optional<LanguageDefinition> identifyLanguage(org.jsoup.nodes.Document jsoupDoc) {
|
||||
@@ -78,45 +81,42 @@ public class LanguageConfiguration {
|
||||
public List<LanguageDefinition> languages() {
|
||||
return new ArrayList<>(this.languages.values());
|
||||
}
|
||||
|
||||
public Map<String, LanguageDefinition> languagesMap() {
|
||||
return Collections.unmodifiableMap(languages);
|
||||
}
|
||||
@Nullable
|
||||
public LanguageDefinition getLanguage(String language) {
|
||||
return languages.get(language);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Inject
|
||||
public LanguageConfiguration() throws IOException, ParserConfigurationException, SAXException {
|
||||
this(WmsaHome.getLanguageModels());
|
||||
this(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Auto());
|
||||
}
|
||||
|
||||
public LanguageConfiguration(LanguageModels lm)
|
||||
public LanguageConfiguration(LanguageConfigLocation languageFile) throws IOException, ParserConfigurationException, SAXException {
|
||||
this(WmsaHome.getLanguageModels(), languageFile);
|
||||
}
|
||||
|
||||
public LanguageConfiguration(LanguageModels lm, LanguageConfigLocation languageFile)
|
||||
throws IOException, ParserConfigurationException, SAXException {
|
||||
fastTextLanguageModel.loadModel(lm.fasttextLanguageModel.toString());
|
||||
|
||||
// TODO: Read from data directory
|
||||
|
||||
try (var languagesXmlStream = ClassLoader.getSystemResourceAsStream("languages.xml")) {
|
||||
try (var languagesXmlStream = languageFile.findLanguageConfiguration()) {
|
||||
if (languagesXmlStream == null)
|
||||
throw new IllegalStateException("languages.xml resource not found in classpath");
|
||||
loadConfiguration(languagesXmlStream);
|
||||
throw new IllegalStateException("languages-default.xml resource not found in classpath");
|
||||
|
||||
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
|
||||
DocumentBuilder builder = factory.newDocumentBuilder();
|
||||
Document doc = builder.parse(languagesXmlStream);
|
||||
|
||||
parseResources(doc);
|
||||
parseLanguages(doc);
|
||||
}
|
||||
|
||||
logger.info("Loaded language configuration: {}", languages);
|
||||
}
|
||||
|
||||
private void loadConfiguration(InputStream xmlData)
|
||||
throws ParserConfigurationException, IOException, SAXException {
|
||||
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
|
||||
DocumentBuilder builder = factory.newDocumentBuilder();
|
||||
Document doc = builder.parse(xmlData);
|
||||
|
||||
parseResources(doc);
|
||||
parseLanguages(doc);
|
||||
|
||||
}
|
||||
|
||||
private void parseLanguages(Document doc) {
|
||||
NodeList languageNodes = doc.getElementsByTagName("language");
|
||||
|
||||
@@ -136,9 +136,10 @@ public class LanguageConfiguration {
|
||||
KeywordHasher keywordHasher = parseHasherTag(languageTag, isoCode);
|
||||
Map<PosPatternCategory, List<PosPattern>> posPatterns =
|
||||
parsePosPatterns(posTagger, languageTag, isoCode);
|
||||
UnicodeNormalization unicodeNormalization = parseUnicodeNormalization(languageTag, isoCode);
|
||||
|
||||
languages.put(isoCode,
|
||||
new LanguageDefinition(isoCode, name, stemmer, keywordHasher, posTagger, posPatterns));
|
||||
new LanguageDefinition(isoCode, name, stemmer, unicodeNormalization, keywordHasher, posTagger, posPatterns));
|
||||
}
|
||||
catch (IOException ex) {
|
||||
logger.error("Failed to set up language " + isoCode, ex);
|
||||
@@ -146,6 +147,22 @@ public class LanguageConfiguration {
|
||||
}
|
||||
}
|
||||
|
||||
private UnicodeNormalization parseUnicodeNormalization(Element languageTag, String isoCode) {
|
||||
NodeList normalizationTags = languageTag.getElementsByTagName("unicodeNormalization");
|
||||
if (normalizationTags.getLength() == 0)
|
||||
return new UnicodeNormalization.JustNormalizeQuotes();
|
||||
Element normalizationTag = (Element) normalizationTags.item(0);
|
||||
String algorithm = normalizationTag.getAttribute("algorithm");
|
||||
|
||||
return switch(algorithm) {
|
||||
case "minimal" -> new UnicodeNormalization.JustNormalizeQuotes();
|
||||
case "e-accents" -> new UnicodeNormalization.FlattenEAccents();
|
||||
case "german" -> new UnicodeNormalization.Flattenß();
|
||||
case "maximal-latin" -> new UnicodeNormalization.FlattenAllLatin();
|
||||
default -> throw new IllegalArgumentException("Invalida algorithm " + algorithm + " on language configuration for " + isoCode);
|
||||
};
|
||||
}
|
||||
|
||||
private Map<PosPatternCategory, List<PosPattern>> parsePosPatterns(@Nullable PosTagger posTagger,
|
||||
Element languageTag, String isoCode) {
|
||||
if (null == posTagger)
|
@@ -0,0 +1,227 @@
|
||||
package nu.marginalia.language.encoding;
|
||||
|
||||
public interface UnicodeNormalization {
|
||||
|
||||
String flattenUnicode(String s);
|
||||
|
||||
static final boolean NO_FLATTEN_UNICODE =
|
||||
Boolean.getBoolean("system.noFlattenUnicode");
|
||||
|
||||
class JustNormalizeQuotes implements UnicodeNormalization {
|
||||
public String flattenUnicode(String s) {
|
||||
if (NO_FLATTEN_UNICODE)
|
||||
return s;
|
||||
|
||||
if (isPlainAscii(s)) {
|
||||
return s;
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder(s.length() + 10);
|
||||
|
||||
for (int i = 0; i < s.length(); ) {
|
||||
int c = s.codePointAt(i);
|
||||
i += Character.charCount(c);
|
||||
|
||||
if ("\u201C\u201D".indexOf(c) >= 0) {
|
||||
sb.append('"');
|
||||
}
|
||||
else {
|
||||
sb.appendCodePoint(c);
|
||||
}
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
class FlattenEAccents implements UnicodeNormalization {
|
||||
public String flattenUnicode(String s) {
|
||||
if (NO_FLATTEN_UNICODE)
|
||||
return s;
|
||||
|
||||
if (isPlainAscii(s)) {
|
||||
return s;
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder(s.length() + 10);
|
||||
|
||||
int numCp = s.codePointCount(0, s.length());
|
||||
|
||||
for (int i = 0; i < numCp;) {
|
||||
int c = s.codePointAt(i);
|
||||
i+=Character.charCount(c);
|
||||
|
||||
if ("\u201C\u201D".indexOf(c) >= 0) {
|
||||
sb.append('"');
|
||||
}
|
||||
else if ("é".indexOf(c) >= 0) {
|
||||
sb.append('e');
|
||||
}
|
||||
else {
|
||||
sb.appendCodePoint(c);
|
||||
}
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
class Flattenß implements UnicodeNormalization {
|
||||
public String flattenUnicode(String s) {
|
||||
if (NO_FLATTEN_UNICODE)
|
||||
return s;
|
||||
|
||||
if (isPlainAscii(s)) {
|
||||
return s;
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder(s.length() + 10);
|
||||
|
||||
for (int i = 0; i < s.length(); ) {
|
||||
int c = s.codePointAt(i);
|
||||
i += Character.charCount(c);
|
||||
|
||||
if ("\u201C\u201D".indexOf(c) >= 0) {
|
||||
sb.append('"');
|
||||
} else if ('ß' == c) {
|
||||
sb.append("ss");
|
||||
}
|
||||
else {
|
||||
sb.appendCodePoint(c);
|
||||
}
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
class FlattenAllLatin implements UnicodeNormalization {
|
||||
|
||||
public String flattenUnicode(String s) {
|
||||
if (NO_FLATTEN_UNICODE)
|
||||
return s;
|
||||
|
||||
if (isPlainAscii(s)) {
|
||||
return s;
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder(s.length() + 10);
|
||||
|
||||
// Falsehoods programmers believe about the latin alphabet ;-)
|
||||
for (int i = 0; i < s.length(); ) {
|
||||
int c = s.codePointAt(i);
|
||||
i += Character.charCount(c);
|
||||
|
||||
if ("\u201C\u201D".indexOf(c) >= 0) {
|
||||
sb.append('"');
|
||||
}
|
||||
else if ("áâàȁăåäāǟãąą̊ḁẚⱥ".indexOf(c) >= 0) {
|
||||
sb.append('a');
|
||||
}
|
||||
else if ("ḃḅḇƀɓ".indexOf(c) >= 0) {
|
||||
sb.append('b');
|
||||
}
|
||||
else if ("ćĉčçḉċƈȼ".indexOf(c) >= 0) {
|
||||
sb.append('c');
|
||||
}
|
||||
else if ("ɗḓďḋḍḏḑđðɖḏ".indexOf(c) >= 0) {
|
||||
sb.append('d');
|
||||
}
|
||||
else if ("éêèȅěëēẽĕęėẹȇḕḗḙḛḝɇ".indexOf(c) >= 0) {
|
||||
sb.append('e');
|
||||
}
|
||||
else if ("ḟƒ".indexOf(c) >= 0) {
|
||||
sb.append('f');
|
||||
}
|
||||
else if ("ǵĝǧğġģɠḡǥ".indexOf(c) >= 0) {
|
||||
sb.append('g');
|
||||
}
|
||||
else if ("ĥȟḧḣḥẖḩḫħⱨ".indexOf(c) >= 0) {
|
||||
sb.append('g');
|
||||
}
|
||||
else if ("iıíîìȉïḯīĩįịḭ".indexOf(c) >= 0) {
|
||||
sb.append('i');
|
||||
}
|
||||
else if ("ĵǰɉ".indexOf(c) >= 0) {
|
||||
sb.append('j');
|
||||
}
|
||||
else if ("ḱǩķḳḵƙⱪ".indexOf(c) >= 0) {
|
||||
sb.append('k');
|
||||
}
|
||||
else if ("ĺłḽľļḷḹḻƚɫⱡ".indexOf(c) >= 0) {
|
||||
sb.append('l');
|
||||
}
|
||||
else if ("ḿṁṃ".indexOf(c) >= 0) {
|
||||
sb.append('m');
|
||||
}
|
||||
else if ("ŋńǹñṋňṅṇṉʼnn̈ņ".indexOf(c) >= 0) {
|
||||
sb.append('n');
|
||||
}
|
||||
else if ("óőôòȍŏȯȱöȫōṓṑõṍṏȭøǿǫǭọȏơ".indexOf(c) >= 0) {
|
||||
sb.append('o');
|
||||
}
|
||||
else if ("ṕṗƥᵽ".indexOf(c) >= 0) {
|
||||
sb.append('p');
|
||||
}
|
||||
else if ("ꝗ".indexOf(c) >= 0) {
|
||||
sb.append('q');
|
||||
}
|
||||
else if ("ŕȑřŗṙṛṝṟɍɽ".indexOf(c) >= 0) {
|
||||
sb.append('r');
|
||||
}
|
||||
else if ("śṥŝšṧşșṡṣṩ".indexOf(c) >= 0) {
|
||||
sb.append('s');
|
||||
}
|
||||
else if ("ťṱẗţțŧṫṭṯⱦ".indexOf(c) >= 0) {
|
||||
sb.append('t');
|
||||
}
|
||||
else if ("úùûŭưűüūṻųůũṹụṳṵṷʉ".indexOf(c) >= 0) {
|
||||
sb.append('u');
|
||||
}
|
||||
else if ("ṽṿʋỽ".indexOf(c) >= 0) {
|
||||
sb.append('v');
|
||||
}
|
||||
else if ("ẃŵẁẅẘẇẉⱳ".indexOf(c) >= 0) {
|
||||
sb.append('w');
|
||||
}
|
||||
else if ("x̂ẍẋ".indexOf(c) >= 0) {
|
||||
sb.append('x');
|
||||
}
|
||||
else if ("ƴýŷỳÿȳỹẙẏy̨ɏỿ".indexOf(c) >= 0) {
|
||||
sb.append('y');
|
||||
}
|
||||
else if ("źẑžżẓẕƶȥ".indexOf(c) >= 0) {
|
||||
sb.append('z');
|
||||
}
|
||||
else if ("Þþ".indexOf(c) >= 0) {
|
||||
sb.append("th");
|
||||
}
|
||||
else if ('ß' == c) {
|
||||
sb.append("ss");
|
||||
}
|
||||
else if (isAscii(c)) {
|
||||
sb.append((char) c);
|
||||
}
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static boolean isPlainAscii(String s) {
|
||||
for (int i = 0; i < s.length(); ) {
|
||||
int c = s.codePointAt(i);
|
||||
if (!isAscii(c))
|
||||
return false;
|
||||
i += Character.charCount(c);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private static boolean isAscii(int c) {
|
||||
return (c & ~0x7f) == 0;
|
||||
}
|
||||
|
||||
|
||||
}
|
@@ -1,6 +1,7 @@
|
||||
package nu.marginalia.language.model;
|
||||
|
||||
import nu.marginalia.language.WordPatterns;
|
||||
import nu.marginalia.language.encoding.UnicodeNormalization;
|
||||
import nu.marginalia.language.keywords.KeywordHasher;
|
||||
import nu.marginalia.language.pos.PosPattern;
|
||||
import nu.marginalia.language.pos.PosPatternCategory;
|
||||
@@ -16,6 +17,7 @@ public final class LanguageDefinition {
|
||||
private final String isoCode;
|
||||
private final String name;
|
||||
private final Stemmer stemmer;
|
||||
private final UnicodeNormalization unicodeNormalization;
|
||||
private final KeywordHasher keywordHasher;
|
||||
|
||||
@Nullable
|
||||
@@ -25,12 +27,14 @@ public final class LanguageDefinition {
|
||||
public LanguageDefinition(String isoCode,
|
||||
String name,
|
||||
Stemmer stemmer,
|
||||
UnicodeNormalization unicodeNormalization,
|
||||
KeywordHasher keywordHasher,
|
||||
@Nullable PosTagger posTagger,
|
||||
Map<PosPatternCategory, List<PosPattern>> posPatterns) {
|
||||
this.isoCode = isoCode;
|
||||
this.name = name;
|
||||
this.stemmer = stemmer;
|
||||
this.unicodeNormalization = unicodeNormalization;
|
||||
this.keywordHasher = keywordHasher;
|
||||
this.posTagger = posTagger;
|
||||
this.posPatterns = posPatterns;
|
||||
@@ -57,6 +61,10 @@ public final class LanguageDefinition {
|
||||
return keywordHasher;
|
||||
}
|
||||
|
||||
public UnicodeNormalization unicodeNormalization() {
|
||||
return unicodeNormalization;
|
||||
}
|
||||
|
||||
public long[] posTagSentence(String[] words) {
|
||||
if (posTagger == null) return new long[0];
|
||||
return posTagger.tagSentence(words);
|
@@ -129,7 +129,7 @@ public class SentenceExtractor {
|
||||
EnumSet<HtmlTag> htmlTags) {
|
||||
final Stemmer stemmer = language.stemmer();
|
||||
|
||||
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(text, MAX_SENTENCE_LENGTH);
|
||||
var wordsAndSeps = new SentenceSegmentSplitter(language).splitSegment(text, MAX_SENTENCE_LENGTH);
|
||||
|
||||
String[] words = wordsAndSeps.words();
|
||||
BitSet seps = wordsAndSeps.separators();
|
||||
@@ -218,11 +218,13 @@ public class SentenceExtractor {
|
||||
|
||||
List<DocumentSentence> ret = new ArrayList<>(sentences.length);
|
||||
|
||||
SentenceSegmentSplitter sentenceSegmentSplitter = new SentenceSegmentSplitter(language);
|
||||
|
||||
if (isNaturalLanguage) {
|
||||
// Natural language text; do POS tagging and stemming
|
||||
|
||||
for (String sent : sentences) {
|
||||
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
|
||||
var wordsAndSeps = sentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
|
||||
var tokens = wordsAndSeps.words();
|
||||
var separators = wordsAndSeps.separators();
|
||||
var posTags = language.posTagSentence(tokens);
|
||||
@@ -274,7 +276,7 @@ public class SentenceExtractor {
|
||||
// as this is not likely to be useful
|
||||
|
||||
for (String sent : sentences) {
|
||||
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
|
||||
var wordsAndSeps = sentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
|
||||
var tokens = wordsAndSeps.words();
|
||||
var separators = wordsAndSeps.separators();
|
||||
var posTags = new long[tokens.length];
|
@@ -2,7 +2,8 @@ package nu.marginalia.language.sentence;
|
||||
|
||||
import com.google.common.base.CharMatcher;
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
import nu.marginalia.language.encoding.AsciiFlattener;
|
||||
import nu.marginalia.language.encoding.UnicodeNormalization;
|
||||
import nu.marginalia.language.model.LanguageDefinition;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.BitSet;
|
||||
@@ -13,10 +14,11 @@ import static nu.marginalia.language.WordPatterns.MAX_WORD_LENGTH;
|
||||
|
||||
public class SentenceSegmentSplitter {
|
||||
|
||||
private final UnicodeNormalization unicodeNormalization;
|
||||
|
||||
public record SeparatedSentence(String[] words, BitSet separators) { }
|
||||
|
||||
private static final CharMatcher noiseCharacterMatcher = CharMatcher.anyOf("/*-");
|
||||
|
||||
private static final Pattern wordBreakPattern;
|
||||
|
||||
static {
|
||||
@@ -31,13 +33,17 @@ public class SentenceSegmentSplitter {
|
||||
}
|
||||
}
|
||||
|
||||
SentenceSegmentSplitter(LanguageDefinition languageDefinition) {
|
||||
this.unicodeNormalization = languageDefinition.unicodeNormalization();
|
||||
}
|
||||
|
||||
/** Split a sentence into words and separators.
|
||||
*
|
||||
* @param segment The sentence to split
|
||||
* @return A list of words and separators
|
||||
*/
|
||||
public static SeparatedSentence splitSegment(String segment, int maxLength) {
|
||||
String flatSegment = AsciiFlattener.flattenUnicode(segment);
|
||||
public SeparatedSentence splitSegment(String segment, int maxLength) {
|
||||
String flatSegment = unicodeNormalization.flattenUnicode(segment);
|
||||
|
||||
var matcher = wordBreakPattern.matcher(flatSegment);
|
||||
|
@@ -1,7 +1,6 @@
|
||||
package nu.marginalia.segmentation;
|
||||
|
||||
import it.unimi.dsi.fastutil.longs.*;
|
||||
import nu.marginalia.util.SimpleBlockingThreadPool;
|
||||
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.openzim.ZIMTypes.ZIMFile;
|
||||
@@ -11,7 +10,7 @@ import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.ForkJoinPool;
|
||||
|
||||
public class NgramExtractorMain {
|
||||
public static void main(String... args) throws IOException, InterruptedException {
|
||||
@@ -112,50 +111,45 @@ public class NgramExtractorMain {
|
||||
|
||||
var orderedHasher = HasherGroup.ordered();
|
||||
|
||||
var pool = new SimpleBlockingThreadPool("ngram-extractor",
|
||||
Math.clamp(2, Runtime.getRuntime().availableProcessors(), 32),
|
||||
32
|
||||
);
|
||||
try (var pool = new ForkJoinPool(Math.clamp(2, Runtime.getRuntime().availableProcessors(), 32))) {
|
||||
|
||||
reader.forEachTitles((title) -> {
|
||||
pool.submitQuietly(() -> {
|
||||
LongArrayList orderedHashesTitle = new LongArrayList();
|
||||
reader.forEachTitles((title) -> {
|
||||
pool.submit(() -> {
|
||||
LongArrayList orderedHashesTitle = new LongArrayList();
|
||||
|
||||
String normalizedTitle = title.replace('_', ' ');
|
||||
String normalizedTitle = title.replace('_', ' ');
|
||||
|
||||
for (var sent : getNgramTitleTerms(normalizedTitle)) {
|
||||
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
|
||||
orderedHashesTitle.add(orderedHasher.rollingHash(terms));
|
||||
}
|
||||
synchronized (lexicon) {
|
||||
for (var hash : orderedHashesTitle) {
|
||||
lexicon.incOrderedTitle(hash);
|
||||
for (var sent : getNgramTitleTerms(normalizedTitle)) {
|
||||
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
|
||||
orderedHashesTitle.add(orderedHasher.rollingHash(terms));
|
||||
}
|
||||
}
|
||||
synchronized (lexicon) {
|
||||
for (var hash : orderedHashesTitle) {
|
||||
lexicon.incOrderedTitle(hash);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
});
|
||||
|
||||
});
|
||||
reader.forEachArticles((title, body) -> {
|
||||
pool.submit(() -> {
|
||||
LongArrayList orderedHashesBody = new LongArrayList();
|
||||
|
||||
reader.forEachArticles((title, body) -> {
|
||||
pool.submitQuietly(() -> {
|
||||
LongArrayList orderedHashesBody = new LongArrayList();
|
||||
|
||||
for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
|
||||
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
|
||||
orderedHashesBody.add(orderedHasher.rollingHash(terms));
|
||||
}
|
||||
|
||||
synchronized (lexicon) {
|
||||
for (var hash : orderedHashesBody) {
|
||||
lexicon.incOrderedBody(hash);
|
||||
for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
|
||||
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
|
||||
orderedHashesBody.add(orderedHasher.rollingHash(terms));
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
}, p -> true);
|
||||
synchronized (lexicon) {
|
||||
for (var hash : orderedHashesBody) {
|
||||
lexicon.incOrderedBody(hash);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
pool.shutDown();
|
||||
pool.awaitTermination(10, TimeUnit.DAYS);
|
||||
}, p -> true);
|
||||
}
|
||||
|
||||
lexicon.saveCounts(countsOutputFile);
|
||||
}
|
@@ -5,16 +5,19 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.lang.foreign.Arena;
|
||||
import java.lang.foreign.MemorySegment;
|
||||
import java.lang.foreign.ValueLayout;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
|
||||
/** Dictionary with term frequency information for (stemmed) words.
|
||||
*
|
||||
@@ -38,15 +41,23 @@ public class TermFrequencyDict {
|
||||
}
|
||||
|
||||
private static Long2IntOpenHashMap load(Path file) throws IOException {
|
||||
try (LongArray array = LongArrayFactory.mmapForReadingConfined(file)) {
|
||||
try (Arena arena = Arena.ofConfined();
|
||||
FileChannel fileChannel = (FileChannel) Files.newByteChannel(file, StandardOpenOption.READ)) {
|
||||
|
||||
int size = (int) Files.size(file) / 16;
|
||||
long fileSizeBytes = Files.size(file);
|
||||
MemorySegment mappedFile = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileSizeBytes, arena);
|
||||
|
||||
int size = (int) fileSizeBytes / 16;
|
||||
var ret = new Long2IntOpenHashMap(size, 0.5f);
|
||||
|
||||
ret.defaultReturnValue(0);
|
||||
|
||||
for (int i = 0; i < size; i++) {
|
||||
ret.put(array.get(2 * i), (int) array.get(2 * i + 1));
|
||||
|
||||
long key = mappedFile.getAtIndex(ValueLayout.JAVA_LONG, 2 * i);
|
||||
long val = mappedFile.getAtIndex(ValueLayout.JAVA_LONG, 2 * i + 1);
|
||||
|
||||
ret.put(key, (int) val);
|
||||
}
|
||||
|
||||
return ret;
|
31
code/functions/language-processing/readme.md
Normal file
31
code/functions/language-processing/readme.md
Normal file
@@ -0,0 +1,31 @@
|
||||
# Language Processing
|
||||
|
||||
This function gathers various tools used in language processing,
|
||||
keyword extraction, and so on.
|
||||
|
||||
## Language Configuration
|
||||
|
||||
The files [resources/languages-default.xml](resources/languages-default.xml) and [resources/languages-experimental.xml](resources/languages-experimental.xml) hold the laguage definitions used by the search engine,
|
||||
the former is used in production and the latter in most tests that require language processing.
|
||||
|
||||
The search engine excludes any languages not configured in these files, though it is relatively easy to define a stub
|
||||
configuration that gets a simpler behavior out of the search engine.
|
||||
|
||||
## Language Processing Tool
|
||||
|
||||
It also houses a tool for inspecting the output of keyword extraction,
|
||||
which can be accessed by running the command below from the root of the project.
|
||||
The tool becomes accessible on port 8080.
|
||||
|
||||
```bash
|
||||
$ ./gradlew :code:functions:language-processing:run
|
||||
```
|
||||
|
||||
## Central Classes
|
||||
|
||||
* [SentenceExtractor](java/nu/marginalia/language/sentence/SentenceExtractor.java) -
|
||||
Creates a [DocumentLanguageData](java/nu/marginalia/language/model/DocumentLanguageData.java) from a text, containing
|
||||
its words, how they stem, POS tags, and so on.
|
||||
* [LanguageConfiguration](java/nu/marginalia/language/config/LanguageConfiguration.java) - parses langauge configuration xml files into LanguageDefinition objects
|
||||
* [LanguageDefinition](java/nu/marginalia/language/model/LanguageDefinition.java) - holds all per-language cusotmizations that are fed into the language processing pipeline
|
||||
* [DocumentKeywordExtractor](java/nu/marginalia/keyword/DocumentKeywordExtractor.java) - extracts keywords from documents
|
@@ -0,0 +1,109 @@
|
||||
<?xml version="1.0"?>
|
||||
<!DOCTYPE languages [
|
||||
<!ELEMENT languages (language*,resource*)>
|
||||
<!ELEMENT language (keywordHash,unicodeNormalization,stemmer,sentenceDetector,rdrTagger?,ngrams*)>
|
||||
|
||||
<!ELEMENT resource EMPTY>
|
||||
<!ATTLIST resource
|
||||
id ID #REQUIRED
|
||||
md5 CDATA #REQUIRED
|
||||
path CDATA #REQUIRED
|
||||
href CDATA #REQUIRED
|
||||
>
|
||||
|
||||
<!ATTLIST language
|
||||
isoCode ID #REQUIRED
|
||||
name CDATA #REQUIRED
|
||||
display (rtl|ltr) #REQUIRED
|
||||
disabled (true|false) "false"
|
||||
>
|
||||
|
||||
<!ELEMENT unicodeNormalization EMPTY>
|
||||
<!ATTLIST unicodeNormalization
|
||||
algorithm (minimal|e-accents|german|maximal-latin) #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT stemmer (pospattern?)>
|
||||
<!ATTLIST stemmer
|
||||
algorithm (porter|snowball|none) #REQUIRED
|
||||
variant CDATA #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT keywordHash (#PCDATA)>
|
||||
<!ATTLIST keywordHash
|
||||
algorithm (asciish|utf8) #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT rdrTagger EMPTY>
|
||||
<!ATTLIST rdrTagger
|
||||
dictId IDREF #REQUIRED
|
||||
rdrId IDREF #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT ngrams (pospattern*)>
|
||||
<!ATTLIST ngrams type (noun|name|subject-suffix|title|keyword) #REQUIRED>
|
||||
|
||||
<!ELEMENT pospattern (#PCDATA)>
|
||||
|
||||
<!ELEMENT sentenceDetector EMPTY>
|
||||
<!ATTLIST sentenceDetector algorithm (none|opennlp) #REQUIRED>
|
||||
]>
|
||||
|
||||
<languages>
|
||||
<language isoCode="en" name="English" display="ltr">
|
||||
<keywordHash algorithm="asciish" />
|
||||
<stemmer algorithm="porter">
|
||||
<pospattern>!(IN TO CC DT)</pospattern>
|
||||
</stemmer>
|
||||
<sentenceDetector algorithm="opennlp"/>
|
||||
<unicodeNormalization algorithm="maximal-latin" />
|
||||
<rdrTagger dictId="pos-dict-en" rdrId="pos-rdr-en" />
|
||||
<ngrams type="name">
|
||||
<pospattern>NNP*</pospattern>
|
||||
<pospattern>NNP* NNP*</pospattern>
|
||||
<pospattern>NNP* (NNP* IN DT CC) NNP*</pospattern>
|
||||
<pospattern>NNP* (NNP* IN DT CC) (NNP* IN DT CC) NNP*</pospattern>
|
||||
</ngrams>
|
||||
<ngrams type="noun">
|
||||
<pospattern>VBG</pospattern>
|
||||
<pospattern>RB VBG</pospattern>
|
||||
<pospattern>(NNP* JJ)</pospattern>
|
||||
<pospattern>(NN* JJ) NN*</pospattern>
|
||||
<pospattern>(NN* JJ) (NN* JJ) NN*</pospattern>
|
||||
<pospattern>(NN* JJ) (NN* JJ) (NN* JJ) NN*</pospattern>
|
||||
<pospattern>(NNP* JJ) (NNP* IN TO CC) NNP*</pospattern>
|
||||
<pospattern>(NNP* JJ) (NNP* IN TO CC) DT NNP*</pospattern>
|
||||
<pospattern>(NNP* JJ) (NNP* IN TO CC) (NNP* IN TO CC) NNP*</pospattern>
|
||||
</ngrams>
|
||||
<ngrams type="subject-suffix">
|
||||
<pospattern>(VBD VBZ)</pospattern>
|
||||
<pospattern>MD VB</pospattern>
|
||||
<pospattern>VBZ DT</pospattern>
|
||||
<pospattern>(DT RB VBD VBP VBN JJ*) (VBD VBG VBP VBN VBZ)</pospattern>
|
||||
</ngrams>
|
||||
<ngrams type="title">
|
||||
<pospattern>!(CC IN DT TO)</pospattern>
|
||||
<pospattern>!CC !(IN DT TO)</pospattern>
|
||||
<pospattern>!CC * !(IN DT TO)</pospattern>
|
||||
<pospattern>!CC * * !(IN DT TO)</pospattern>
|
||||
</ngrams>
|
||||
<ngrams type="keyword">
|
||||
<!-- length = 1 -->
|
||||
<pospattern>(N* VBG VBN JJ* R* VBG)</pospattern>
|
||||
<!-- length = 2 -->
|
||||
<pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
|
||||
<pospattern>(N* VBG VBN) CD</pospattern>
|
||||
<!-- length = 3 -->
|
||||
<pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
|
||||
<pospattern>NNP* (IN TO CC NNP*) (N* VBG VBN)</pospattern>
|
||||
<pospattern>(N* VBG VBN) (N* VBG VBN) CD</pospattern>
|
||||
<!-- length = 4 -->
|
||||
<pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
|
||||
<pospattern>NNP* (DT IN TO CC) (IN TO CC) NNP*</pospattern>
|
||||
</ngrams>
|
||||
</language>
|
||||
|
||||
<resource id="pos-dict-en" md5="" path="rdr/English.DICT" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.DICT" />
|
||||
<resource id="pos-rdr-en" md5="" path="rdr/English.RDR" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.RDR" />
|
||||
|
||||
</languages>
|
@@ -1,7 +1,7 @@
|
||||
<?xml version="1.0"?>
|
||||
<!DOCTYPE languages [
|
||||
<!ELEMENT languages (language*,resource*)>
|
||||
<!ELEMENT language (keywordHash,stemmer,sentenceDetector,rdrTagger?,ngrams*)>
|
||||
<!ELEMENT language (keywordHash,unicodeNormalization,stemmer,sentenceDetector,rdrTagger?,ngrams*)>
|
||||
|
||||
<!ELEMENT resource EMPTY>
|
||||
<!ATTLIST resource
|
||||
@@ -18,6 +18,11 @@
|
||||
disabled (true|false) "false"
|
||||
>
|
||||
|
||||
<!ELEMENT unicodeNormalization EMPTY>
|
||||
<!ATTLIST unicodeNormalization
|
||||
algorithm (minimal|e-accents|german|maximal-latin) #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT stemmer (pospattern?)>
|
||||
<!ATTLIST stemmer
|
||||
algorithm (porter|snowball|none) #REQUIRED
|
||||
@@ -37,6 +42,7 @@
|
||||
|
||||
<!ELEMENT ngrams (pospattern*)>
|
||||
<!ATTLIST ngrams type (noun|name|subject-suffix|title|keyword) #REQUIRED>
|
||||
|
||||
<!ELEMENT pospattern (#PCDATA)>
|
||||
|
||||
<!ELEMENT sentenceDetector EMPTY>
|
||||
@@ -44,18 +50,13 @@
|
||||
]>
|
||||
|
||||
<languages>
|
||||
<language isoCode="xx" name="Undefined" display="ltr" >
|
||||
<keywordHash algorithm="asciish" />
|
||||
<stemmer algorithm="none" />
|
||||
<sentenceDetector algorithm="none"/>
|
||||
<rdrTagger dictId="pos-dict-en" rdrId="pos-rdr-en" />
|
||||
</language>
|
||||
<language isoCode="en" name="English" display="ltr">
|
||||
<keywordHash algorithm="asciish" />
|
||||
<stemmer algorithm="porter">
|
||||
<pospattern>!(IN TO CC DT)</pospattern>
|
||||
</stemmer>
|
||||
<sentenceDetector algorithm="opennlp"/>
|
||||
<unicodeNormalization algorithm="maximal-latin" />
|
||||
<rdrTagger dictId="pos-dict-en" rdrId="pos-rdr-en" />
|
||||
<ngrams type="name">
|
||||
<pospattern>NNP*</pospattern>
|
||||
@@ -106,6 +107,7 @@
|
||||
<keywordHash algorithm="asciish" />
|
||||
<stemmer algorithm="snowball" variant="SWEDISH" />
|
||||
<sentenceDetector algorithm="opennlp"/>
|
||||
<unicodeNormalization algorithm="e-accents" />
|
||||
<rdrTagger dictId="pos-dict-sv" rdrId="pos-rdr-sv" />
|
||||
<ngrams type="name">
|
||||
<pospattern>PROPN</pospattern>
|
||||
@@ -119,6 +121,12 @@
|
||||
<stemmer algorithm="snowball" variant="FRENCH" />
|
||||
<sentenceDetector algorithm="opennlp"/>
|
||||
</language>
|
||||
<language isoCode="de" name="German" display="ltr">
|
||||
<keywordHash algorithm="asciish" />
|
||||
<stemmer algorithm="snowball" variant="GERMAN" />
|
||||
<sentenceDetector algorithm="opennlp"/>
|
||||
<unicodeNormalization algorithm="german" />
|
||||
</language>
|
||||
<resource id="pos-dict-en" md5="" path="rdr/English.DICT" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.DICT" />
|
||||
<resource id="pos-rdr-en" md5="" path="rdr/English.RDR" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.RDR" />
|
||||
<resource id="pos-dict-sv" md5="" path="rdr/Swedish.DICT" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/ud-treebanks-v2.4/UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu.UPOS.DICT" />
|
@@ -1,7 +1,9 @@
|
||||
@import it.unimi.dsi.fastutil.ints.IntList
|
||||
@import nu.marginalia.language.model.WordRep
|
||||
@import nu.marginalia.language.model.DocumentSentence
|
||||
@import nu.marginalia.language.model.LanguageDefinition
|
||||
@import java.util.*
|
||||
@import java.util.stream.Collectors
|
||||
@import java.util.stream.IntStream
|
||||
|
||||
@param String textSample
|
||||
@@ -13,6 +15,8 @@
|
||||
@param Collection<WordRep> nameLikeReps
|
||||
@param Collection<WordRep> subjectLikeReps
|
||||
@param Collection<String> artifacts
|
||||
@param Collection<String> importantWords
|
||||
@param Map<String, IntList> positionedWords
|
||||
|
||||
|
||||
<!DOCTYPE html>
|
||||
@@ -209,7 +213,7 @@
|
||||
<div>
|
||||
<h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
|
||||
<i class="fas fa-star text-yellow-500 mr-2"></i>
|
||||
Title
|
||||
Artifacts
|
||||
</h3>
|
||||
<div class="space-y-2">
|
||||
@for (String word : artifacts)
|
||||
@@ -220,9 +224,53 @@
|
||||
</div>
|
||||
</div>
|
||||
@endif
|
||||
|
||||
@if (importantWords != null && !importantWords.isEmpty())
|
||||
<div>
|
||||
<h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
|
||||
<i class="fas fa-star text-yellow-500 mr-2"></i>
|
||||
Important Words
|
||||
</h3>
|
||||
<div class="space-y-2">
|
||||
@for (String word : importantWords)
|
||||
<div class="flex justify-between items-center p-2 bg-gray-50 rounded">
|
||||
<span class="text-sm font-medium">${word}</span>
|
||||
</div>
|
||||
@endfor
|
||||
</div>
|
||||
</div>
|
||||
@endif
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<!-- Full simulation outcome from keyword extraction -->
|
||||
<div class="bg-white rounded-lg shadow-sm border border-gray-200">
|
||||
<div class="p-4 border-b border-gray-200">
|
||||
<h2 class="text-lg font-semibold text-gray-900">
|
||||
<i class="fas fa-list-ol text-purple-600 mr-2"></i>
|
||||
Outcome
|
||||
</h2>
|
||||
</div>
|
||||
<div class="p-4">
|
||||
@if (positionedWords != null && !positionedWords.isEmpty())
|
||||
<div>
|
||||
<h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
|
||||
<i class="fas fa-star text-yellow-500 mr-2"></i>
|
||||
Positioned Words
|
||||
</h3>
|
||||
<div class="space-y-2">
|
||||
@for (String word : positionedWords.keySet())
|
||||
<div class="flex justify-between items-center p-2 bg-gray-50 rounded">
|
||||
<span class="text-sm font-medium">${word}</span>
|
||||
<span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">${positionedWords.get(word).stream().map(Object::toString).collect(Collectors.joining(", "))}</span>
|
||||
</div>
|
||||
@endfor
|
||||
</div>
|
||||
</div>
|
||||
@endif
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
@@ -2,6 +2,7 @@ package nu.marginalia.keyword;
|
||||
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.dom.DomPruningFilter;
|
||||
import nu.marginalia.language.config.LanguageConfigLocation;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.UnsupportedLanguageException;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
@@ -31,7 +32,7 @@ class DocumentKeywordExtractorTest {
|
||||
|
||||
@BeforeAll
|
||||
public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
|
||||
se = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels()), WmsaHome.getLanguageModels());
|
||||
se = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental()), WmsaHome.getLanguageModels());
|
||||
}
|
||||
|
||||
@Test
|
@@ -6,6 +6,7 @@ import it.unimi.dsi.fastutil.ints.IntList;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
import nu.marginalia.keyword.model.DocumentWordSpan;
|
||||
import nu.marginalia.language.config.LanguageConfigLocation;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.model.DocumentSentence;
|
||||
@@ -34,7 +35,7 @@ class DocumentPositionMapperTest {
|
||||
|
||||
@BeforeAll
|
||||
public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
|
||||
var config = new LanguageConfiguration(WmsaHome.getLanguageModels());
|
||||
var config = new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental());
|
||||
se = new SentenceExtractor(config, WmsaHome.getLanguageModels());
|
||||
english = config.getLanguage("en");
|
||||
}
|
@@ -2,6 +2,7 @@ package nu.marginalia.keyword;
|
||||
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.language.config.LanguageConfigLocation;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.LanguageDefinition;
|
||||
import nu.marginalia.language.model.UnsupportedLanguageException;
|
||||
@@ -35,7 +36,7 @@ class SentenceExtractorTest {
|
||||
|
||||
@BeforeAll
|
||||
public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
|
||||
var config = new LanguageConfiguration(WmsaHome.getLanguageModels());
|
||||
var config = new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental());
|
||||
se = new SentenceExtractor(config, WmsaHome.getLanguageModels());
|
||||
english = config.getLanguage("en");
|
||||
|
@@ -1,5 +1,6 @@
|
||||
package nu.marginalia.keyword.extractors;
|
||||
|
||||
import nu.marginalia.language.config.LanguageConfigLocation;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.util.TestLanguageModels;
|
||||
@@ -15,7 +16,7 @@ class ArtifactKeywordsTest {
|
||||
|
||||
@Test
|
||||
public void testExtractArtifacts() throws IOException, ParserConfigurationException, SAXException {
|
||||
SentenceExtractor se = new SentenceExtractor(new LanguageConfiguration(TestLanguageModels.getLanguageModels()), TestLanguageModels.getLanguageModels());
|
||||
SentenceExtractor se = new SentenceExtractor(new LanguageConfiguration(TestLanguageModels.getLanguageModels(), new LanguageConfigLocation.Experimental()), TestLanguageModels.getLanguageModels());
|
||||
|
||||
var artifacts = new ArtifactKeywords(se.extractSentences("Hello I'm <vlofgren@marginalia.nu>, what's up?", "hello!"));
|
||||
System.out.println(artifacts.getWords());
|
@@ -3,6 +3,7 @@ package nu.marginalia.keyword.extractors;
|
||||
import com.google.common.collect.Sets;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.dom.DomPruningFilter;
|
||||
import nu.marginalia.language.config.LanguageConfigLocation;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.LanguageDefinition;
|
||||
import nu.marginalia.language.model.UnsupportedLanguageException;
|
||||
@@ -55,8 +56,8 @@ class NameLikeKeywordsTest {
|
||||
|
||||
@BeforeAll
|
||||
public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
|
||||
se = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels()), WmsaHome.getLanguageModels());
|
||||
lc = new LanguageConfiguration(WmsaHome.getLanguageModels());
|
||||
se = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental()), WmsaHome.getLanguageModels());
|
||||
lc = new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental());
|
||||
en = lc.getLanguage("en");
|
||||
}
|
||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.keyword.extractors;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.language.config.LanguageConfigLocation;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
@@ -49,7 +50,7 @@ class SubjectLikeKeywordsTest {
|
||||
|
||||
@BeforeAll
|
||||
public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
|
||||
se = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels()), WmsaHome.getLanguageModels());
|
||||
se = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental()), WmsaHome.getLanguageModels());
|
||||
}
|
||||
|
||||
@Test
|
@@ -1,6 +1,7 @@
|
||||
package nu.marginalia.keyword.extractors;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import nu.marginalia.language.config.LanguageConfigLocation;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.UnsupportedLanguageException;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
@@ -192,7 +193,7 @@ class TitleKeywordsTest {
|
||||
|
||||
@Test
|
||||
public void extractTitleWords() throws IOException, ParserConfigurationException, SAXException, UnsupportedLanguageException {
|
||||
var languageConfiguration = new LanguageConfiguration(TestLanguageModels.getLanguageModels());
|
||||
var languageConfiguration = new LanguageConfiguration(TestLanguageModels.getLanguageModels(), new LanguageConfigLocation.Experimental());
|
||||
var se = new SentenceExtractor(languageConfiguration, TestLanguageModels.getLanguageModels());
|
||||
|
||||
var dld = se.extractSentences(Jsoup.parse(document));
|
@@ -13,19 +13,18 @@ import java.io.IOException;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
public class LanguageConfigurationTest {
|
||||
public class LanguageConfigurationTestFile {
|
||||
private static LanguageConfiguration languageConfiguration;
|
||||
|
||||
@BeforeAll
|
||||
public static void setUpAll() throws IOException, SAXException, ParserConfigurationException {
|
||||
languageConfiguration = new LanguageConfiguration(TestLanguageModels.getLanguageModels());
|
||||
languageConfiguration = new LanguageConfiguration(TestLanguageModels.getLanguageModels(), new LanguageConfigLocation.Experimental());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testBasic() {
|
||||
Assertions.assertNotNull(languageConfiguration.getLanguage("en"));
|
||||
Assertions.assertNotNull(languageConfiguration.getLanguage("sv"));
|
||||
Assertions.assertNotNull(languageConfiguration.getLanguage("xx"));
|
||||
Assertions.assertNull(languageConfiguration.getLanguage("!!"));
|
||||
}
|
||||
|
@@ -0,0 +1,41 @@
|
||||
package nu.marginalia.language.encoding;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertSame;
|
||||
|
||||
class UnicodeNormalizationTest {
|
||||
|
||||
UnicodeNormalization unicodeNormalization = new UnicodeNormalization.FlattenAllLatin();
|
||||
|
||||
@Test
|
||||
void flattenUnicodePlainAscii() {
|
||||
String s = "abc";
|
||||
|
||||
// If the string is ascii, we don't want to allocate a copy
|
||||
|
||||
assertSame(s, unicodeNormalization.flattenUnicode(s));
|
||||
}
|
||||
|
||||
@Test
|
||||
void flattenUnicode() {
|
||||
String s = "Stülpnagelstraße";
|
||||
|
||||
assertEquals("Stulpnagelstrasse", unicodeNormalization.flattenUnicode(s));
|
||||
}
|
||||
|
||||
@Test
|
||||
void flattenUnicode2() {
|
||||
String s = "Koncevičius";
|
||||
|
||||
assertEquals("Koncevicius", unicodeNormalization.flattenUnicode(s));
|
||||
}
|
||||
|
||||
@Test
|
||||
void omitNonFlattenable() {
|
||||
String s = "[アグレッシブ烈子]";
|
||||
|
||||
assertEquals("[]", unicodeNormalization.flattenUnicode(s));
|
||||
}
|
||||
}
|
@@ -1,6 +1,7 @@
|
||||
package nu.marginalia.language.sentence;
|
||||
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.language.config.LanguageConfigLocation;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.UnsupportedLanguageException;
|
||||
import nu.marginalia.language.sentence.tag.HtmlTag;
|
||||
@@ -11,6 +12,7 @@ import org.xml.sax.SAXException;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.EnumSet;
|
||||
import java.util.Objects;
|
||||
|
||||
@@ -23,7 +25,7 @@ class SentenceExtractorTest {
|
||||
|
||||
@BeforeAll
|
||||
public static void setUp() throws IOException, ParserConfigurationException, SAXException {
|
||||
languageConfig = new LanguageConfiguration(WmsaHome.getLanguageModels());
|
||||
languageConfig = new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental());
|
||||
sentenceExtractor = new SentenceExtractor(languageConfig, WmsaHome.getLanguageModels());
|
||||
}
|
||||
|
||||
@@ -60,7 +62,8 @@ class SentenceExtractorTest {
|
||||
void testJava() {
|
||||
var dld = sentenceExtractor.extractSentence(languageConfig.getLanguage("en"), "Foreign Function & Memory API", EnumSet.noneOf(HtmlTag.class));
|
||||
|
||||
assertEquals(4, dld.wordsLowerCase.length);
|
||||
System.out.println(Arrays.toString(dld.wordsLowerCase));
|
||||
|
||||
assertArrayEquals(new String[] {"foreign", "function", "memory", "api"}, dld.wordsLowerCase);
|
||||
}
|
||||
|
@@ -3,8 +3,10 @@ package nu.marginalia.domsample;
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import jakarta.inject.Named;
|
||||
import nu.marginalia.coordination.DomainCoordinator;
|
||||
import nu.marginalia.domsample.db.DomSampleDb;
|
||||
import nu.marginalia.livecapture.BrowserlessClient;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
@@ -15,24 +17,32 @@ import java.net.URISyntaxException;
|
||||
import java.time.Duration;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class DomSampleService {
|
||||
private final DomSampleDb db;
|
||||
private final HikariDataSource mariadbDataSource;
|
||||
private final int sampleThreads;
|
||||
private final DomainCoordinator domainCoordinator;
|
||||
private final URI browserlessURI;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(DomSampleService.class);
|
||||
private final ArrayBlockingQueue<EdgeDomain> samplingQueue = new ArrayBlockingQueue<>(4);
|
||||
|
||||
@Inject
|
||||
public DomSampleService(DomSampleDb db,
|
||||
HikariDataSource mariadbDataSource,
|
||||
@Named("browserless-uri") String browserlessAddress,
|
||||
@Named("browserless-sample-threads") int sampleThreads,
|
||||
DomainCoordinator domainCoordinator,
|
||||
ServiceConfiguration serviceConfiguration)
|
||||
throws URISyntaxException
|
||||
{
|
||||
this.db = db;
|
||||
this.mariadbDataSource = mariadbDataSource;
|
||||
this.sampleThreads = sampleThreads;
|
||||
this.domainCoordinator = domainCoordinator;
|
||||
|
||||
if (StringUtils.isEmpty(browserlessAddress) || serviceConfiguration.node() > 1) {
|
||||
logger.warn("Live capture service will not run");
|
||||
@@ -40,6 +50,7 @@ public class DomSampleService {
|
||||
}
|
||||
else {
|
||||
browserlessURI = new URI(browserlessAddress);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -49,7 +60,10 @@ public class DomSampleService {
|
||||
return;
|
||||
}
|
||||
|
||||
Thread.ofPlatform().daemon().start(this::run);
|
||||
Thread.ofPlatform().daemon().start(this::mainThread);
|
||||
for (int i = 0; i < sampleThreads; i++) {
|
||||
Thread.ofPlatform().daemon().start(this::samplingThread);
|
||||
}
|
||||
}
|
||||
|
||||
public void syncDomains() {
|
||||
@@ -61,7 +75,11 @@ public class DomSampleService {
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT DOMAIN_NAME
|
||||
FROM EC_DOMAIN
|
||||
INNER JOIN DOMAIN_AVAILABILITY_INFORMATION
|
||||
ON EC_DOMAIN.ID=DOMAIN_ID
|
||||
WHERE NODE_AFFINITY>0
|
||||
AND BACKOFF_CONSECUTIVE_FAILURES<15
|
||||
AND HTTP_SCHEMA='HTTPS'
|
||||
""")
|
||||
) {
|
||||
var rs = stmt.executeQuery();
|
||||
@@ -79,7 +97,7 @@ public class DomSampleService {
|
||||
logger.info("Synced domains to sqlite");
|
||||
}
|
||||
|
||||
public void run() {
|
||||
public void mainThread() {
|
||||
|
||||
try (var client = new BrowserlessClient(browserlessURI)) {
|
||||
|
||||
@@ -92,8 +110,8 @@ public class DomSampleService {
|
||||
syncDomains();
|
||||
var domains = db.getScheduledDomains();
|
||||
|
||||
for (var domain : domains) {
|
||||
updateDomain(client, domain);
|
||||
for (String domain : domains) {
|
||||
samplingQueue.put(new EdgeDomain(domain));
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
@@ -103,7 +121,26 @@ public class DomSampleService {
|
||||
logger.error("Error in DomSampleService run loop", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void samplingThread() {
|
||||
try (var client = new BrowserlessClient(browserlessURI)) {
|
||||
while (!Thread.currentThread().isInterrupted()) {
|
||||
try {
|
||||
EdgeDomain domain = samplingQueue.take();
|
||||
try (var lock = domainCoordinator.lockDomain(domain)) {
|
||||
updateDomain(client, domain.toString());
|
||||
} catch (Exception e) {
|
||||
logger.error("Error in DomSampleService run loop", e);
|
||||
}
|
||||
}
|
||||
catch (InterruptedException ex) {
|
||||
Thread.currentThread().interrupt();
|
||||
logger.info("DomSampleService interrupted, stopping...");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -41,7 +41,7 @@ public class BrowserlessClient implements AutoCloseable {
|
||||
public Optional<String> content(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
|
||||
Map<String, Object> requestData = Map.of(
|
||||
"url", url,
|
||||
"userAgent", userAgent,
|
||||
"userAgent", Map.of("userAgent", userAgent),
|
||||
"gotoOptions", gotoOptions
|
||||
);
|
||||
|
||||
@@ -69,7 +69,7 @@ public class BrowserlessClient implements AutoCloseable {
|
||||
public Optional<String> annotatedContent(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
|
||||
Map<String, Object> requestData = Map.of(
|
||||
"url", url,
|
||||
"userAgent", userAgent,
|
||||
"userAgent", Map.of("userAgent", userAgent),
|
||||
"gotoOptions", gotoOptions,
|
||||
"waitForSelector", Map.of("selector", "#marginaliahack", "timeout", 15000)
|
||||
);
|
||||
@@ -104,7 +104,7 @@ public class BrowserlessClient implements AutoCloseable {
|
||||
|
||||
Map<String, Object> requestData = Map.of(
|
||||
"url", url,
|
||||
"userAgent", userAgent,
|
||||
"userAgent", Map.of("userAgent", userAgent),
|
||||
"options", screenshotOptions,
|
||||
"gotoOptions", gotoOptions
|
||||
);
|
||||
|
@@ -6,6 +6,7 @@ import io.grpc.stub.StreamObserver;
|
||||
import jakarta.inject.Named;
|
||||
import nu.marginalia.api.livecapture.Empty;
|
||||
import nu.marginalia.api.livecapture.LiveCaptureApiGrpc;
|
||||
import nu.marginalia.coordination.DomainCoordinator;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import nu.marginalia.service.server.DiscoverableService;
|
||||
@@ -33,6 +34,7 @@ public class LiveCaptureGrpcService
|
||||
private final boolean serviceEnabled;
|
||||
private final LinkedBlockingQueue<ScheduledScreenshot> requestedScreenshots = new LinkedBlockingQueue<>(128);
|
||||
private final HikariDataSource dataSource;
|
||||
private final DomainCoordinator domainCoordinator;
|
||||
|
||||
record ScheduledScreenshot(int domainId) {}
|
||||
|
||||
@@ -46,9 +48,11 @@ public class LiveCaptureGrpcService
|
||||
public LiveCaptureGrpcService(HikariDataSource dataSource,
|
||||
@Named("browserless-uri") String browserlessAddress,
|
||||
@Named("browserless-agent-threads") int threads,
|
||||
DomainCoordinator domainCoordinator,
|
||||
ServiceConfiguration serviceConfiguration
|
||||
) throws URISyntaxException {
|
||||
this.dataSource = dataSource;
|
||||
this.domainCoordinator = domainCoordinator;
|
||||
|
||||
if (StringUtils.isEmpty(browserlessAddress) || serviceConfiguration.node() > 1) {
|
||||
logger.warn("Live capture service will not run");
|
||||
@@ -163,7 +167,7 @@ public class LiveCaptureGrpcService
|
||||
}
|
||||
|
||||
private void grab(BrowserlessClient client, Connection conn, EdgeDomain domain) {
|
||||
try {
|
||||
try (var lock = domainCoordinator.lockDomain(domain)) {
|
||||
logger.info("Capturing {}", domain);
|
||||
|
||||
byte[] pngBytes = client.screenshot(domain.toRootUrlHttps().toString(),
|
||||
|
@@ -11,5 +11,8 @@ public class LivecaptureModule extends AbstractModule {
|
||||
bind(Integer.class)
|
||||
.annotatedWith(Names.named("browserless-agent-threads"))
|
||||
.toInstance(Integer.parseInt(System.getProperty("live-capture.browserless-agent-threads", "4")));
|
||||
bind(Integer.class)
|
||||
.annotatedWith(Names.named("browserless-sample-threads"))
|
||||
.toInstance(Integer.parseInt(System.getProperty("live-capture.browserless-sample-threads", "4")));
|
||||
}
|
||||
}
|
||||
|
@@ -110,18 +110,6 @@ public class FeedFetcherService {
|
||||
.build()
|
||||
);
|
||||
|
||||
Thread.ofPlatform().daemon(true).start(() -> {
|
||||
try {
|
||||
for (;;) {
|
||||
TimeUnit.SECONDS.sleep(15);
|
||||
logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
|
||||
}
|
||||
}
|
||||
catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
});
|
||||
|
||||
final RequestConfig defaultRequestConfig = RequestConfig.custom()
|
||||
.setCookieSpec(StandardCookieSpec.IGNORE)
|
||||
.setResponseTimeout(10, TimeUnit.SECONDS)
|
||||
|
@@ -22,8 +22,7 @@ dependencies {
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:index:query')
|
||||
implementation project(':code:libraries:language-processing')
|
||||
implementation project(':code:functions:language-processing')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user