mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
Compare commits
112 Commits
d6cfbceeea
...
master
Author | SHA1 | Date | |
---|---|---|---|
|
f1a71e9033 | ||
|
7b525918c9 | ||
|
0f3aede66f | ||
|
88236f3836 | ||
|
ad31a22fbb | ||
|
2785ae8241 | ||
|
1ed1f2f299 | ||
|
b7d3b67a1d | ||
|
d28010b7e6 | ||
|
2689bd9eaa | ||
|
f6d5d7f196 | ||
|
abf1186fa7 | ||
|
94a77ebddf | ||
|
4e2f76a477 | ||
|
4cd1834938 | ||
|
5cbbea67ed | ||
|
b688f15550 | ||
|
f55af8ef48 | ||
|
adc815e282 | ||
|
ca8455e049 | ||
|
4ea724d2cb | ||
|
40600e7297 | ||
|
7795742538 | ||
|
82d33ce69b | ||
|
e49cc5c244 | ||
|
0af389ad93 | ||
|
48791f56bd | ||
|
be83726427 | ||
|
708caa8791 | ||
|
32394f42b9 | ||
|
b8e3445ce0 | ||
|
17a78a7b7e | ||
|
5a75dd8093 | ||
|
a9713347a0 | ||
|
4694d36ed2 | ||
|
70bdd1f51e | ||
|
187b4828e6 | ||
|
93fc14dc94 | ||
|
fbfea8539b | ||
|
0929d77247 | ||
|
db8f8c1f55 | ||
|
dcb2723386 | ||
|
00c1f495f6 | ||
|
73a923983a | ||
|
e9ed0c5669 | ||
|
5b2bec6144 | ||
|
f26bb8e2b1 | ||
|
4455495dc6 | ||
|
b84d17aa51 | ||
|
9d008390ae | ||
|
a40c2a8146 | ||
|
a3416bf48e | ||
|
ee2461d9fc | ||
|
54c91a84e3 | ||
|
a6371fc54c | ||
|
8faa9a572d | ||
|
fdce940263 | ||
|
af8a13a7fb | ||
|
9e332de6b4 | ||
|
d457bb5d44 | ||
|
c661ebb619 | ||
|
53e744398a | ||
|
1d71baf3e5 | ||
|
bb5fc0f348 | ||
|
c8f112d040 | ||
|
ae31bc8498 | ||
|
da5046c3bf | ||
|
f67257baf2 | ||
|
924fb05661 | ||
|
c231a82062 | ||
|
2c1082d7f0 | ||
|
06947bd026 | ||
|
519aebd7c6 | ||
|
42cc27586e | ||
|
360881fafd | ||
|
4c6fdf6ebe | ||
|
554de21f68 | ||
|
00194acbfe | ||
|
97dabcefaa | ||
|
cc790644d4 | ||
|
8f893ee6c0 | ||
|
938721b793 | ||
|
f68bcefc75 | ||
|
164a646af6 | ||
|
0cfd759f85 | ||
|
b53002200c | ||
|
78246b9a63 | ||
|
b552e79927 | ||
|
bffc159486 | ||
|
b8000721bd | ||
|
2ee0b0e420 | ||
|
1432fc87d7 | ||
|
ec5f32b1d8 | ||
|
edd453531e | ||
|
096496ada1 | ||
|
8ca6209260 | ||
|
673c65d3c9 | ||
|
acb9ec7b15 | ||
|
47079e05db | ||
|
c93056e77f | ||
|
6f7530e807 | ||
|
87ce4a1b52 | ||
|
52194cbe7a | ||
|
fd1ac03c78 | ||
|
5e5b86efb4 | ||
|
f332ec6191 | ||
|
c25c1af437 | ||
|
eb0c911b45 | ||
|
1979870ce4 | ||
|
0ba2ea38e1 | ||
|
b46f2e1407 | ||
|
18aa1b9764 |
9
.gitignore
vendored
9
.gitignore
vendored
@@ -7,4 +7,11 @@ build/
|
||||
lombok.config
|
||||
Dockerfile
|
||||
run
|
||||
jte-classes
|
||||
jte-classes
|
||||
.classpath
|
||||
.project
|
||||
.settings
|
||||
.factorypath
|
||||
bin/
|
||||
*.log
|
||||
*.hprof
|
||||
|
@@ -1,7 +1,7 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
id("org.jetbrains.gradle.plugin.idea-ext") version "1.0"
|
||||
id "me.champeau.jmh" version "0.6.6"
|
||||
id "me.champeau.jmh" version "0.7.3"
|
||||
|
||||
// This is a workaround for a bug in the Jib plugin that causes it to stall randomly
|
||||
// https://github.com/GoogleContainerTools/jib/issues/3347
|
||||
@@ -47,8 +47,8 @@ subprojects.forEach {it ->
|
||||
}
|
||||
|
||||
ext {
|
||||
jvmVersion = 24
|
||||
dockerImageBase='container-registry.oracle.com/graalvm/jdk:24'
|
||||
jvmVersion = 25
|
||||
dockerImageBase='container-registry.oracle.com/graalvm/jdk:25'
|
||||
dockerImageTag='latest'
|
||||
dockerImageRegistry='marginalia'
|
||||
jibVersion = '3.4.5'
|
||||
|
@@ -19,6 +19,7 @@ dependencies {
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.bundles.mariadb
|
||||
implementation libs.bundles.httpcomponents
|
||||
implementation libs.mockito
|
||||
implementation libs.guava
|
||||
implementation dependencies.create(libs.guice.get()) {
|
||||
|
@@ -114,4 +114,7 @@ public class WmsaHome {
|
||||
}
|
||||
|
||||
|
||||
public static Path getLangugeConfig() {
|
||||
return getHomePath().resolve("conf/languages.xml");
|
||||
}
|
||||
}
|
||||
|
@@ -0,0 +1,141 @@
|
||||
package nu.marginalia.proxy;

import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;

/**
 * Configuration for SOCKS proxy settings used by crawlers to distribute IP footprint.
 * <p>
 * Settings are read once from system properties at construction time:
 * <ul>
 *   <li>{@code crawler.socksProxy.enabled} — master switch, defaults to {@code false}</li>
 *   <li>{@code crawler.socksProxy.list} — comma-separated proxy specs, each
 *       {@code host:port} or {@code host:port:username:password}</li>
 *   <li>{@code crawler.socksProxy.strategy} — {@code ROUND_ROBIN} (default) or {@code RANDOM}</li>
 * </ul>
 */
public class SocksProxyConfiguration {

    private final boolean enabled;
    private final List<SocksProxy> proxies;
    private final ProxySelectionStrategy strategy;

    public SocksProxyConfiguration() {
        this.enabled = Boolean.parseBoolean(System.getProperty("crawler.socksProxy.enabled", "false"));
        this.strategy = parseStrategy(System.getProperty("crawler.socksProxy.strategy", "ROUND_ROBIN"));
        this.proxies = parseProxies();
    }

    /**
     * Parses the selection-strategy property, falling back to ROUND_ROBIN on an
     * unrecognized value instead of letting IllegalArgumentException abort construction.
     */
    private static ProxySelectionStrategy parseStrategy(String value) {
        try {
            return ProxySelectionStrategy.valueOf(value.trim());
        } catch (IllegalArgumentException e) {
            return ProxySelectionStrategy.ROUND_ROBIN;
        }
    }

    private List<SocksProxy> parseProxies() {
        String proxyList = System.getProperty("crawler.socksProxy.list", "");
        if (proxyList.isEmpty()) {
            return List.of();
        }

        // Best-effort parsing: malformed entries are skipped (parseProxy returns null)
        // rather than failing the whole configuration.
        return Arrays.stream(proxyList.split(","))
                .map(String::trim)
                .filter(s -> !s.isEmpty())
                .map(this::parseProxy)
                .filter(Objects::nonNull)
                .collect(Collectors.toList());
    }

    /**
     * Parses a single proxy spec.
     * <p>
     * Expected format: {@code "host:port"} or {@code "host:port:username:password"}.
     *
     * @return the parsed proxy, or {@code null} if the spec is malformed
     */
    private SocksProxy parseProxy(String proxyString) {
        try {
            // Limit the split to 4 fields so a password containing ':' survives intact.
            String[] parts = proxyString.split(":", 4);
            if (parts.length < 2) {
                return null;
            }

            String host = parts[0].trim();
            int port = Integer.parseInt(parts[1].trim());

            // Reject empty hosts and out-of-range ports up front.
            if (host.isEmpty() || port < 1 || port > 65535) {
                return null;
            }

            if (parts.length >= 4) {
                String username = parts[2];
                String password = parts[3];
                return new SocksProxy(host, port, username, password);
            } else {
                return new SocksProxy(host, port);
            }
        } catch (Exception e) {
            // Best-effort: an unparseable entry is dropped silently.
            return null;
        }
    }

    /** True only when the feature flag is set AND at least one proxy parsed successfully. */
    public boolean isEnabled() {
        return enabled && !proxies.isEmpty();
    }

    public List<SocksProxy> getProxies() {
        return proxies;
    }

    public ProxySelectionStrategy getStrategy() {
        return strategy;
    }

    /** How a proxy is chosen from the configured list for each request. */
    public enum ProxySelectionStrategy {
        ROUND_ROBIN,
        RANDOM
    }

    /** Immutable value object describing one SOCKS proxy endpoint with optional credentials. */
    public static class SocksProxy {
        private final String host;
        private final int port;
        private final String username;
        private final String password;

        public SocksProxy(String host, int port) {
            this(host, port, null, null);
        }

        public SocksProxy(String host, int port, String username, String password) {
            this.host = host;
            this.port = port;
            this.username = username;
            this.password = password;
        }

        public String getHost() {
            return host;
        }

        public int getPort() {
            return port;
        }

        public String getUsername() {
            return username;
        }

        public String getPassword() {
            return password;
        }

        /** True when both a username and password were supplied. */
        public boolean hasAuthentication() {
            return username != null && password != null;
        }

        @Override
        public String toString() {
            // Deliberately omits the password from the rendered form.
            if (hasAuthentication()) {
                return String.format("%s:%d (auth: %s)", host, port, username);
            } else {
                return String.format("%s:%d", host, port);
            }
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if (o == null || getClass() != o.getClass()) return false;
            SocksProxy that = (SocksProxy) o;
            return port == that.port &&
                    Objects.equals(host, that.host) &&
                    Objects.equals(username, that.username) &&
                    Objects.equals(password, that.password);
        }

        @Override
        public int hashCode() {
            return Objects.hash(host, port, username, password);
        }
    }
}
|
@@ -0,0 +1,79 @@
|
||||
package nu.marginalia.proxy;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nonnull;
import java.util.List;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Manages SOCKS proxy selection and rotation for crawler requests.
 */
public class SocksProxyManager {
    private static final Logger logger = LoggerFactory.getLogger(SocksProxyManager.class);

    private final SocksProxyConfiguration config;

    // Monotonically increasing counter for ROUND_ROBIN selection.  It may wrap past
    // Integer.MAX_VALUE and go negative, which is why selection uses Math.floorMod.
    private final AtomicInteger roundRobinIndex = new AtomicInteger(0);

    public SocksProxyManager(SocksProxyConfiguration config) {
        this.config = config;

        if (config.isEnabled()) {
            logger.info("SOCKS proxy support enabled with {} proxies using {} strategy",
                    config.getProxies().size(), config.getStrategy());
            for (SocksProxyConfiguration.SocksProxy proxy : config.getProxies()) {
                logger.info(" - {}", proxy);
            }
        } else {
            logger.info("SOCKS proxy support disabled");
        }
    }

    /**
     * Selects the next proxy to use based on the configured strategy.
     *
     * @throws IllegalStateException if proxy support is disabled or no proxies are configured
     */
    @Nonnull
    public SocksProxyConfiguration.SocksProxy selectProxy() {
        if (!config.isEnabled()) {
            throw new IllegalStateException("Proxies not configured");
        }

        List<SocksProxyConfiguration.SocksProxy> proxies = config.getProxies();
        if (proxies.isEmpty()) {
            throw new IllegalStateException("Proxies not configured");
        }

        // Exhaustive over the two-constant enum; adding a constant later becomes
        // a compile error here instead of silently falling back to proxies.get(0).
        return switch (config.getStrategy()) {
            // floorMod keeps the index non-negative even after the counter
            // overflows Integer.MAX_VALUE (plain % would yield a negative index
            // and an IndexOutOfBoundsException).
            case ROUND_ROBIN -> proxies.get(Math.floorMod(roundRobinIndex.getAndIncrement(), proxies.size()));
            case RANDOM -> proxies.get(ThreadLocalRandom.current().nextInt(proxies.size()));
        };
    }

    /**
     * Gets the current proxy configuration.
     */
    public SocksProxyConfiguration getConfiguration() {
        return config;
    }

    /**
     * Checks if proxy support is enabled and proxies are available.
     */
    public boolean isProxyEnabled() {
        return config.isEnabled() && !config.getProxies().isEmpty();
    }
}
|
@@ -16,7 +16,7 @@
|
||||
<MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
<RollingFile name="LogToFileService" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<JSONLayout compact="true" eventEol="true" properties="true" stacktraceAsString="true" includeTimeMillis="true"/>
|
||||
<Filters>
|
||||
@@ -28,7 +28,7 @@
|
||||
</Filters>
|
||||
<SizeBasedTriggeringPolicy size="10MB" />
|
||||
</RollingFile>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
<RollingFile name="LogToFileCrawler" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<PatternLayout>
|
||||
<Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
|
||||
@@ -38,7 +38,7 @@
|
||||
<MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</RollingFile>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/converter-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/converter-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
<RollingFile name="LogToFileConverter" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/converter-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/converter-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<PatternLayout>
|
||||
<Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
|
||||
@@ -56,7 +56,9 @@
|
||||
<Root level="info">
|
||||
<AppenderRef ref="Console"/>
|
||||
<AppenderRef ref="ProcessConsole"/>
|
||||
<AppenderRef ref="LogToFile"/>
|
||||
<AppenderRef ref="LogToFileService"/>
|
||||
<AppenderRef ref="LogToFileCrawler"/>
|
||||
<AppenderRef ref="LogToFileConverter"/>
|
||||
</Root>
|
||||
</Loggers>
|
||||
</Configuration>
|
@@ -50,7 +50,7 @@
|
||||
<MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
<RollingFile name="LogToFileService" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<PatternLayout>
|
||||
<Pattern>%-5level %d{yyyy-MM-dd HH:mm:ss,SSS} %-20t %-20c{1}: %msg{nolookups}%n</Pattern>
|
||||
@@ -64,7 +64,7 @@
|
||||
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</RollingFile>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
<RollingFile name="LogToFileCrawler" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<PatternLayout>
|
||||
<Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
|
||||
@@ -74,7 +74,7 @@
|
||||
<MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</RollingFile>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/converter-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/converter-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
<RollingFile name="LogToFileConverter" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/converter-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/converter-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<PatternLayout>
|
||||
<Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
|
||||
@@ -95,7 +95,9 @@
|
||||
<AppenderRef ref="ConsoleError"/>
|
||||
<AppenderRef ref="ConsoleFatal"/>
|
||||
<AppenderRef ref="ProcessConsole"/>
|
||||
<AppenderRef ref="LogToFile"/>
|
||||
<AppenderRef ref="LogToFileService"/>
|
||||
<AppenderRef ref="LogToFileConverter"/>
|
||||
<AppenderRef ref="LogToFileCrawler"/>
|
||||
</Root>
|
||||
</Loggers>
|
||||
</Configuration>
|
@@ -1,13 +1,12 @@
|
||||
package nu.marginalia.model;
|
||||
|
||||
import javax.annotation.Nonnull;
|
||||
import java.io.Serializable;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class EdgeDomain implements Serializable {
|
||||
public class EdgeDomain {
|
||||
|
||||
@Nonnull
|
||||
public final String subDomain;
|
||||
|
@@ -4,13 +4,12 @@ import nu.marginalia.util.QueryParams;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.io.Serializable;
|
||||
import java.net.*;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
public class EdgeUrl implements Serializable {
|
||||
public class EdgeUrl {
|
||||
public final String proto;
|
||||
public final EdgeDomain domain;
|
||||
public final Integer port;
|
||||
|
@@ -95,16 +95,24 @@ public enum HtmlFeature {
|
||||
public static int encode(Collection<HtmlFeature> featuresAll) {
|
||||
int ret = 0;
|
||||
for (var feature : featuresAll) {
|
||||
if (feature.ordinal() >= 32) continue;
|
||||
|
||||
ret |= (1 << (feature.ordinal()));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
public static boolean hasFeature(int value, HtmlFeature feature) {
|
||||
return (value & (1<< feature.ordinal())) != 0;
|
||||
int ord = feature.ordinal();
|
||||
if (ord >= 32) return false;
|
||||
|
||||
return (value & (1<<ord)) != 0;
|
||||
}
|
||||
|
||||
public int getFeatureBit() {
|
||||
return (1<< ordinal());
|
||||
int ord = ordinal();
|
||||
if (ord >= 32) return 0;
|
||||
|
||||
return (1<<ord);
|
||||
}
|
||||
}
|
||||
|
@@ -2,7 +2,6 @@ package nu.marginalia.model.idx;
|
||||
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.EnumSet;
|
||||
import java.util.Set;
|
||||
|
||||
@@ -28,7 +27,6 @@ public record DocumentMetadata(int avgSentLength,
|
||||
int sets,
|
||||
int quality,
|
||||
byte flags)
|
||||
implements Serializable
|
||||
{
|
||||
|
||||
public String toString() {
|
||||
|
@@ -66,7 +66,7 @@ public class NodeStatusWatcher {
|
||||
fileStorageService.createStorageBase("Crawl Data", Path.of("/storage"), nodeId, FileStorageBaseType.STORAGE);
|
||||
fileStorageService.createStorageBase("Work Area", Path.of("/work"), nodeId, FileStorageBaseType.WORK);
|
||||
|
||||
persistence.sendNewMessage("executor-service:"+nodeId,
|
||||
persistence.sendNewMessage("index-service:"+nodeId,
|
||||
null,
|
||||
null,
|
||||
"FIRST-BOOT",
|
||||
|
@@ -22,7 +22,6 @@ dependencies {
|
||||
implementation project(':code:processes:ping-process')
|
||||
implementation project(':code:processes:new-domain-process')
|
||||
implementation project(':code:processes:converting-process')
|
||||
implementation project(':code:processes:index-constructor-process')
|
||||
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:model')
|
||||
@@ -34,7 +33,7 @@ dependencies {
|
||||
implementation project(':third-party:commons-codec')
|
||||
|
||||
implementation project(':code:libraries:message-queue')
|
||||
implementation project(':code:libraries:term-frequency-dict')
|
||||
implementation project(':code:functions:language-processing')
|
||||
|
||||
implementation project(':code:functions:link-graph:api')
|
||||
implementation project(':code:functions:live-capture:api')
|
||||
|
@@ -2,9 +2,8 @@ package nu.marginalia.execution;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
|
||||
import nu.marginalia.actor.ExecutorActorControlService;
|
||||
import nu.marginalia.actor.ExecutorActor;
|
||||
import nu.marginalia.actor.ExecutorActorControlService;
|
||||
|
||||
@Singleton
|
||||
public class ExecutionInit {
|
||||
@@ -22,5 +21,8 @@ public class ExecutionInit {
|
||||
actorControlService.start(ExecutorActor.PROC_CRAWLER_SPAWNER);
|
||||
actorControlService.start(ExecutorActor.PROC_INDEX_CONSTRUCTOR_SPAWNER);
|
||||
actorControlService.start(ExecutorActor.PROC_LOADER_SPAWNER);
|
||||
actorControlService.start(ExecutorActor.PROC_EXPORT_TASKS_SPAWNER);
|
||||
actorControlService.stop(ExecutorActor.PROC_NDP_SPAWNER);
|
||||
actorControlService.stop(ExecutorActor.PROC_PING_SPAWNER);
|
||||
}
|
||||
}
|
||||
|
@@ -5,7 +5,6 @@ import com.google.inject.Singleton;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.converting.ConverterMain;
|
||||
import nu.marginalia.crawl.CrawlerMain;
|
||||
import nu.marginalia.index.IndexConstructorMain;
|
||||
import nu.marginalia.livecrawler.LiveCrawlerMain;
|
||||
import nu.marginalia.loading.LoaderMain;
|
||||
import nu.marginalia.ndp.NdpMain;
|
||||
@@ -57,7 +56,7 @@ public class ProcessSpawnerService {
|
||||
LIVE_CRAWLER(LiveCrawlerMain.class),
|
||||
CONVERTER(ConverterMain.class),
|
||||
LOADER(LoaderMain.class),
|
||||
INDEX_CONSTRUCTOR(IndexConstructorMain.class),
|
||||
INDEX_CONSTRUCTOR("nu.marginalia.index.IndexConstructorMain"),
|
||||
NDP(NdpMain.class),
|
||||
EXPORT_TASKS(ExportTasksMain.class),
|
||||
;
|
||||
@@ -66,6 +65,9 @@ public class ProcessSpawnerService {
|
||||
ProcessId(Class<? extends ProcessMainClass> mainClass) {
|
||||
this.mainClass = mainClass.getName();
|
||||
}
|
||||
ProcessId(String mainClassFullName) {
|
||||
this.mainClass = mainClassFullName;
|
||||
}
|
||||
|
||||
List<String> envOpts() {
|
||||
String variable = switch (this) {
|
||||
@@ -118,6 +120,17 @@ public class ProcessSpawnerService {
|
||||
args.add("-Dsystem.serviceNode=" + System.getProperty("system.serviceNode"));
|
||||
}
|
||||
|
||||
// Add SOCKS proxy properties for crawler processes
|
||||
if (System.getProperty("crawler.socksProxy.enabled") != null) {
|
||||
args.add("-Dcrawler.socksProxy.enabled=" + System.getProperty("crawler.socksProxy.enabled"));
|
||||
}
|
||||
if (System.getProperty("crawler.socksProxy.list") != null) {
|
||||
args.add("-Dcrawler.socksProxy.list=" + System.getProperty("crawler.socksProxy.list"));
|
||||
}
|
||||
if (System.getProperty("crawler.socksProxy.strategy") != null) {
|
||||
args.add("-Dcrawler.socksProxy.strategy=" + System.getProperty("crawler.socksProxy.strategy"));
|
||||
}
|
||||
|
||||
if (Boolean.getBoolean("system.profile")) {
|
||||
// add jfr options
|
||||
args.add("-XX:+FlightRecorder");
|
||||
|
@@ -5,6 +5,7 @@ import com.github.luben.zstd.ZstdOutputStream;
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.IndexLocations;
|
||||
import nu.marginalia.index.journal.IndexJournal;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.linkdb.LinkdbFileNames;
|
||||
import nu.marginalia.service.control.ServiceHeartbeat;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
@@ -13,18 +14,18 @@ import nu.marginalia.storage.model.FileStorageType;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.SQLException;
|
||||
import java.time.LocalDateTime;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.Map;
|
||||
|
||||
public class BackupService {
|
||||
|
||||
private final FileStorageService storageService;
|
||||
private final LanguageConfiguration languageConfiguration;
|
||||
private final ServiceHeartbeat serviceHeartbeat;
|
||||
|
||||
public enum BackupHeartbeatSteps {
|
||||
@@ -36,8 +37,10 @@ public class BackupService {
|
||||
|
||||
@Inject
|
||||
public BackupService(FileStorageService storageService,
|
||||
LanguageConfiguration languageConfiguration,
|
||||
ServiceHeartbeat serviceHeartbeat) {
|
||||
this.storageService = storageService;
|
||||
this.languageConfiguration = languageConfiguration;
|
||||
this.serviceHeartbeat = serviceHeartbeat;
|
||||
}
|
||||
|
||||
@@ -98,22 +101,25 @@ public class BackupService {
|
||||
}
|
||||
|
||||
|
||||
private void backupJournal(Path inputStorage, Path backupStorage) throws IOException
|
||||
{
|
||||
Optional<IndexJournal> journal = IndexJournal.findJournal(inputStorage);
|
||||
if (journal.isEmpty()) {
|
||||
throw new FileNotFoundException("No journal found in input storage");
|
||||
private void backupJournal(Path inputStorage, Path backupStorage) throws IOException {
|
||||
Map<String, IndexJournal> journals = IndexJournal.findJournals(inputStorage, languageConfiguration.languages());
|
||||
for (IndexJournal journal : journals.values()) {
|
||||
FileUtils.copyDirectory(journal.journalDir().toFile(), backupStorage.resolve(journal.journalDir().getFileName()).toFile());
|
||||
}
|
||||
|
||||
FileUtils.copyDirectory(journal.get().journalDir().toFile(), backupStorage.resolve(journal.get().journalDir().getFileName()).toFile());
|
||||
}
|
||||
|
||||
private void restoreJournal(Path destStorage, Path backupStorage) throws IOException {
|
||||
Optional<IndexJournal> journal = IndexJournal.findJournal(backupStorage);
|
||||
if (journal.isEmpty()) {
|
||||
throw new FileNotFoundException("No journal found in backup");
|
||||
Map<String, IndexJournal> journals = IndexJournal.findJournals(backupStorage, languageConfiguration.languages());
|
||||
for (IndexJournal journal : journals.values()) {
|
||||
var journalFileName = journal.journalDir().getFileName();
|
||||
|
||||
// Ensure we delete any previous journal junk
|
||||
if (Files.exists(destStorage.resolve(journalFileName))) {
|
||||
FileUtils.deleteDirectory(destStorage.resolve(journalFileName).toFile());
|
||||
}
|
||||
|
||||
FileUtils.copyDirectory(backupStorage.resolve(journalFileName).toFile(), destStorage.toFile());
|
||||
}
|
||||
FileUtils.copyDirectory(backupStorage.resolve(journal.get().journalDir().getFileName()).toFile(), destStorage.toFile());
|
||||
}
|
||||
|
||||
private void backupFileCompressed(String fileName, Path inputStorage, Path backupStorage) throws IOException
|
||||
|
@@ -2,6 +2,7 @@ plugins {
|
||||
id 'java'
|
||||
id 'jvm-test-suite'
|
||||
id 'gg.jte.gradle' version '3.1.15'
|
||||
id 'application'
|
||||
}
|
||||
|
||||
java {
|
||||
@@ -9,6 +10,10 @@ java {
|
||||
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||
}
|
||||
}
|
||||
application {
|
||||
mainClass = 'nu.marginalia.language.LanguageProcessingTool'
|
||||
applicationName = 'language-processing-tool'
|
||||
}
|
||||
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
@@ -18,10 +23,10 @@ dependencies {
|
||||
implementation project(':third-party:rdrpostagger')
|
||||
implementation project(':third-party:porterstemmer')
|
||||
implementation project(':third-party:commons-codec')
|
||||
implementation project(':third-party:openzim')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:libraries:easy-lsh')
|
||||
implementation project(':code:libraries:term-frequency-dict')
|
||||
implementation project(':code:libraries:coded-sequence')
|
||||
implementation libs.notnull
|
||||
implementation libs.bundles.jooby
|
@@ -163,11 +163,15 @@ public class DocumentPositionMapper {
|
||||
|
||||
int i = 0;
|
||||
|
||||
for (int run = 0; run < 15 && i < s.length(); run++, i++) {
|
||||
char c = s.charAt(i);
|
||||
if (c >= 'a' && c <= 'z') continue;
|
||||
if (c >= 'A' && c <= 'Z') continue;
|
||||
if (c >= '0' && c <= '9') continue;
|
||||
for (int run = 0; run < 15 && i < s.length(); run++) {
|
||||
int cp = s.codePointAt(i);
|
||||
|
||||
|
||||
if (Character.isAlphabetic(cp) || Character.isDigit(cp)) {
|
||||
i += Character.charCount(cp);
|
||||
continue;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -177,17 +181,20 @@ public class DocumentPositionMapper {
|
||||
for (int j = 0; j < 8; j++) {
|
||||
if (i == s.length()) return true;
|
||||
|
||||
if (wordPartSeparator.indexOf(s.charAt(i)) < 0) {
|
||||
if (wordPartSeparator.indexOf(s.codePointAt(i)) < 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
i++;
|
||||
|
||||
for (int run = 0; run < 10 && i < s.length(); run++, i++) {
|
||||
char c = s.charAt(i);
|
||||
if (c >= 'a' && c <= 'z') continue;
|
||||
if (c >= 'A' && c <= 'Z') continue;
|
||||
if (c >= '0' && c <= '9') continue;
|
||||
for (int run = 0; run < 10 && i < s.length(); run++) {
|
||||
int cp = s.codePointAt(i);
|
||||
|
||||
if (Character.isAlphabetic(cp) || Character.isDigit(cp)) {
|
||||
i += Character.charCount(cp);
|
||||
continue;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
@@ -6,15 +6,20 @@ import io.jooby.MapModelAndView;
|
||||
import io.jooby.ModelAndView;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
||||
import nu.marginalia.keyword.LinkTexts;
|
||||
import nu.marginalia.keyword.extractors.*;
|
||||
import nu.marginalia.language.config.LanguageConfigLocation;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.*;
|
||||
@@ -23,6 +28,7 @@ public class LanguageProcessingTool extends Jooby {
|
||||
private static final Logger logger = LoggerFactory.getLogger(LanguageProcessingTool.class);
|
||||
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
|
||||
private final TermFrequencyDict termFrequencyDict;
|
||||
|
||||
static void main(String[] args) {
|
||||
Jooby.runApp(args, LanguageProcessingTool::new);
|
||||
}
|
||||
@@ -33,10 +39,17 @@ public class LanguageProcessingTool extends Jooby {
|
||||
termFrequencyDict = new TermFrequencyDict(languageModels);
|
||||
|
||||
sentenceExtractorProvider = new ThreadLocalSentenceExtractorProvider(
|
||||
new LanguageConfiguration(languageModels),
|
||||
new LanguageConfiguration(languageModels, new LanguageConfigLocation.Experimental()),
|
||||
languageModels
|
||||
);
|
||||
Path basePath = Path.of("code/libraries/language-processing/").toAbsolutePath();
|
||||
|
||||
// Depending on how the tool is started, we may be in the project root, or the module root;
|
||||
// so here's some guesswork to try to suss out which one it is...
|
||||
Path basePath = Path.of("code/functions/language-processing/").toAbsolutePath();
|
||||
if (!Files.exists(basePath)) {
|
||||
basePath = Path.of(".").toAbsolutePath();
|
||||
}
|
||||
|
||||
System.out.println("Base path: " + basePath);
|
||||
|
||||
if (Files.exists(basePath.resolve("resources/ltt/jte")))
|
||||
@@ -55,7 +68,7 @@ public class LanguageProcessingTool extends Jooby {
|
||||
// Assign colors to the POS tags
|
||||
|
||||
@NotNull
|
||||
private ModelAndView<?> handleKeywords(Context context) {
|
||||
private ModelAndView<?> handleKeywords(Context context) throws URISyntaxException {
|
||||
if ("GET".equals(context.getMethod())) {
|
||||
return new MapModelAndView("keywords.jte")
|
||||
.put("textSample", "");
|
||||
@@ -65,28 +78,38 @@ public class LanguageProcessingTool extends Jooby {
|
||||
}
|
||||
|
||||
String textSample = context.form("textSample").value();
|
||||
DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(textSample);
|
||||
Map<Long, String> posStyles = posTagStyles(dld);
|
||||
|
||||
// Run sentende extration on the text as-is
|
||||
DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(textSample);
|
||||
|
||||
// Run individual extraction logic
|
||||
var tfIdfCounts = new WordsTfIdfCounts(termFrequencyDict, dld);
|
||||
var titleKeywords = new TitleKeywords(dld);
|
||||
var nameLikeKeywords = new NameLikeKeywords(dld, 2);
|
||||
var subjectLikeKeywords = new SubjectLikeKeywords(tfIdfCounts, dld);
|
||||
var artifactKeywords = new ArtifactKeywords(dld);
|
||||
// var urlKeywords = new UrlKeywords(url);
|
||||
|
||||
// Run full extraction logic to capture positioning etc
|
||||
var extractedKeywords = new DocumentKeywordExtractor(termFrequencyDict)
|
||||
.extractKeywords(dld, new LinkTexts(), new EdgeUrl("https://www.example.com/"));
|
||||
|
||||
return new MapModelAndView("keywords.jte")
|
||||
.put("textSample", textSample)
|
||||
.put("language", dld.language())
|
||||
.put("tagColors", posStyles)
|
||||
.put("tagColors", posTagStyles(dld))
|
||||
.put("sentences", dld.sentences())
|
||||
.put("tfIdfReps", tfIdfCounts.getReps())
|
||||
.put("titleReps", titleKeywords.getReps())
|
||||
.put("nameLikeReps", nameLikeKeywords.getReps())
|
||||
.put("subjectLikeReps", subjectLikeKeywords.getReps())
|
||||
.put("artifacts", artifactKeywords.getWords());
|
||||
.put("artifacts", artifactKeywords.getWords())
|
||||
.put("importantWords", extractedKeywords.importantWords)
|
||||
.put("positionedWords", extractedKeywords.wordToPos);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate unique colors for each POS tag, to help the UI rendering
|
||||
*/
|
||||
public static Map<Long, String> posTagStyles(DocumentLanguageData dld) {
|
||||
Map<Long, String> styles = new HashMap<>();
|
||||
|
@@ -0,0 +1,43 @@
|
||||
package nu.marginalia.language.config;
|
||||
|
||||
import nu.marginalia.WmsaHome;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
|
||||
sealed public interface LanguageConfigLocation {
|
||||
InputStream findLanguageConfiguration() throws IOException;
|
||||
|
||||
final class Auto implements LanguageConfigLocation {
|
||||
@Override
|
||||
public InputStream findLanguageConfiguration() throws IOException {
|
||||
Path filesystemPath = WmsaHome.getLangugeConfig();
|
||||
if (Files.exists(filesystemPath)) {
|
||||
return Files.newInputStream(filesystemPath, StandardOpenOption.READ);
|
||||
}
|
||||
if (Boolean.getBoolean("language.experimental")) {
|
||||
return ClassLoader.getSystemResourceAsStream("languages-experimental.xml");
|
||||
} else {
|
||||
return ClassLoader.getSystemResourceAsStream("languages-default.xml");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final class Experimental implements LanguageConfigLocation {
|
||||
@Override
|
||||
public InputStream findLanguageConfiguration() throws IOException {
|
||||
return ClassLoader.getSystemResourceAsStream("languages-experimental.xml");
|
||||
}
|
||||
}
|
||||
|
||||
final class Default implements LanguageConfigLocation {
|
||||
|
||||
@Override
|
||||
public InputStream findLanguageConfiguration() throws IOException {
|
||||
return ClassLoader.getSystemResourceAsStream("languages-default.xml");
|
||||
}
|
||||
}
|
||||
}
|
@@ -2,8 +2,10 @@ package nu.marginalia.language.config;
|
||||
|
||||
import com.github.jfasttext.JFastText;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.language.encoding.UnicodeNormalization;
|
||||
import nu.marginalia.language.keywords.KeywordHasher;
|
||||
import nu.marginalia.language.model.LanguageDefinition;
|
||||
import nu.marginalia.language.pos.PosPattern;
|
||||
@@ -38,11 +40,12 @@ import java.security.MessageDigest;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.util.*;
|
||||
|
||||
@Singleton
|
||||
public class LanguageConfiguration {
|
||||
private static final Logger logger = LoggerFactory.getLogger(LanguageConfiguration.class);
|
||||
|
||||
private final Map<String, Path> resources = new HashMap<>();
|
||||
private final Map<String, LanguageDefinition> languages = new HashMap<>();
|
||||
private final Map<String, LanguageDefinition> languages = new LinkedHashMap<>();
|
||||
private final JFastText fastTextLanguageModel = new JFastText();
|
||||
|
||||
public Optional<LanguageDefinition> identifyLanguage(org.jsoup.nodes.Document jsoupDoc) {
|
||||
@@ -78,45 +81,42 @@ public class LanguageConfiguration {
|
||||
public List<LanguageDefinition> languages() {
|
||||
return new ArrayList<>(this.languages.values());
|
||||
}
|
||||
|
||||
public Map<String, LanguageDefinition> languagesMap() {
|
||||
return Collections.unmodifiableMap(languages);
|
||||
}
|
||||
@Nullable
|
||||
public LanguageDefinition getLanguage(String language) {
|
||||
return languages.get(language);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Inject
|
||||
public LanguageConfiguration() throws IOException, ParserConfigurationException, SAXException {
|
||||
this(WmsaHome.getLanguageModels());
|
||||
this(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Auto());
|
||||
}
|
||||
|
||||
public LanguageConfiguration(LanguageModels lm)
|
||||
public LanguageConfiguration(LanguageConfigLocation languageFile) throws IOException, ParserConfigurationException, SAXException {
|
||||
this(WmsaHome.getLanguageModels(), languageFile);
|
||||
}
|
||||
|
||||
public LanguageConfiguration(LanguageModels lm, LanguageConfigLocation languageFile)
|
||||
throws IOException, ParserConfigurationException, SAXException {
|
||||
fastTextLanguageModel.loadModel(lm.fasttextLanguageModel.toString());
|
||||
|
||||
// TODO: Read from data directory
|
||||
|
||||
try (var languagesXmlStream = ClassLoader.getSystemResourceAsStream("languages.xml")) {
|
||||
try (var languagesXmlStream = languageFile.findLanguageConfiguration()) {
|
||||
if (languagesXmlStream == null)
|
||||
throw new IllegalStateException("languages.xml resource not found in classpath");
|
||||
loadConfiguration(languagesXmlStream);
|
||||
throw new IllegalStateException("languages-default.xml resource not found in classpath");
|
||||
|
||||
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
|
||||
DocumentBuilder builder = factory.newDocumentBuilder();
|
||||
Document doc = builder.parse(languagesXmlStream);
|
||||
|
||||
parseResources(doc);
|
||||
parseLanguages(doc);
|
||||
}
|
||||
|
||||
logger.info("Loaded language configuration: {}", languages);
|
||||
}
|
||||
|
||||
private void loadConfiguration(InputStream xmlData)
|
||||
throws ParserConfigurationException, IOException, SAXException {
|
||||
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
|
||||
DocumentBuilder builder = factory.newDocumentBuilder();
|
||||
Document doc = builder.parse(xmlData);
|
||||
|
||||
parseResources(doc);
|
||||
parseLanguages(doc);
|
||||
|
||||
}
|
||||
|
||||
private void parseLanguages(Document doc) {
|
||||
NodeList languageNodes = doc.getElementsByTagName("language");
|
||||
|
||||
@@ -136,9 +136,10 @@ public class LanguageConfiguration {
|
||||
KeywordHasher keywordHasher = parseHasherTag(languageTag, isoCode);
|
||||
Map<PosPatternCategory, List<PosPattern>> posPatterns =
|
||||
parsePosPatterns(posTagger, languageTag, isoCode);
|
||||
UnicodeNormalization unicodeNormalization = parseUnicodeNormalization(languageTag, isoCode);
|
||||
|
||||
languages.put(isoCode,
|
||||
new LanguageDefinition(isoCode, name, stemmer, keywordHasher, posTagger, posPatterns));
|
||||
new LanguageDefinition(isoCode, name, stemmer, unicodeNormalization, keywordHasher, posTagger, posPatterns));
|
||||
}
|
||||
catch (IOException ex) {
|
||||
logger.error("Failed to set up language " + isoCode, ex);
|
||||
@@ -146,6 +147,22 @@ public class LanguageConfiguration {
|
||||
}
|
||||
}
|
||||
|
||||
private UnicodeNormalization parseUnicodeNormalization(Element languageTag, String isoCode) {
|
||||
NodeList normalizationTags = languageTag.getElementsByTagName("unicodeNormalization");
|
||||
if (normalizationTags.getLength() == 0)
|
||||
return new UnicodeNormalization.JustNormalizeQuotes();
|
||||
Element normalizationTag = (Element) normalizationTags.item(0);
|
||||
String algorithm = normalizationTag.getAttribute("algorithm");
|
||||
|
||||
return switch(algorithm) {
|
||||
case "minimal" -> new UnicodeNormalization.JustNormalizeQuotes();
|
||||
case "e-accents" -> new UnicodeNormalization.FlattenEAccents();
|
||||
case "german" -> new UnicodeNormalization.Flattenß();
|
||||
case "maximal-latin" -> new UnicodeNormalization.FlattenAllLatin();
|
||||
default -> throw new IllegalArgumentException("Invalida algorithm " + algorithm + " on language configuration for " + isoCode);
|
||||
};
|
||||
}
|
||||
|
||||
private Map<PosPatternCategory, List<PosPattern>> parsePosPatterns(@Nullable PosTagger posTagger,
|
||||
Element languageTag, String isoCode) {
|
||||
if (null == posTagger)
|
@@ -0,0 +1,227 @@
|
||||
package nu.marginalia.language.encoding;
|
||||
|
||||
public interface UnicodeNormalization {
|
||||
|
||||
String flattenUnicode(String s);
|
||||
|
||||
static final boolean NO_FLATTEN_UNICODE =
|
||||
Boolean.getBoolean("system.noFlattenUnicode");
|
||||
|
||||
class JustNormalizeQuotes implements UnicodeNormalization {
|
||||
public String flattenUnicode(String s) {
|
||||
if (NO_FLATTEN_UNICODE)
|
||||
return s;
|
||||
|
||||
if (isPlainAscii(s)) {
|
||||
return s;
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder(s.length() + 10);
|
||||
|
||||
for (int i = 0; i < s.length(); ) {
|
||||
int c = s.codePointAt(i);
|
||||
i += Character.charCount(c);
|
||||
|
||||
if ("\u201C\u201D".indexOf(c) >= 0) {
|
||||
sb.append('"');
|
||||
}
|
||||
else {
|
||||
sb.appendCodePoint(c);
|
||||
}
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
class FlattenEAccents implements UnicodeNormalization {
|
||||
public String flattenUnicode(String s) {
|
||||
if (NO_FLATTEN_UNICODE)
|
||||
return s;
|
||||
|
||||
if (isPlainAscii(s)) {
|
||||
return s;
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder(s.length() + 10);
|
||||
|
||||
int numCp = s.codePointCount(0, s.length());
|
||||
|
||||
for (int i = 0; i < numCp;) {
|
||||
int c = s.codePointAt(i);
|
||||
i+=Character.charCount(c);
|
||||
|
||||
if ("\u201C\u201D".indexOf(c) >= 0) {
|
||||
sb.append('"');
|
||||
}
|
||||
else if ("é".indexOf(c) >= 0) {
|
||||
sb.append('e');
|
||||
}
|
||||
else {
|
||||
sb.appendCodePoint(c);
|
||||
}
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
class Flattenß implements UnicodeNormalization {
|
||||
public String flattenUnicode(String s) {
|
||||
if (NO_FLATTEN_UNICODE)
|
||||
return s;
|
||||
|
||||
if (isPlainAscii(s)) {
|
||||
return s;
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder(s.length() + 10);
|
||||
|
||||
for (int i = 0; i < s.length(); ) {
|
||||
int c = s.codePointAt(i);
|
||||
i += Character.charCount(c);
|
||||
|
||||
if ("\u201C\u201D".indexOf(c) >= 0) {
|
||||
sb.append('"');
|
||||
} else if ('ß' == c) {
|
||||
sb.append("ss");
|
||||
}
|
||||
else {
|
||||
sb.appendCodePoint(c);
|
||||
}
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
class FlattenAllLatin implements UnicodeNormalization {
|
||||
|
||||
public String flattenUnicode(String s) {
|
||||
if (NO_FLATTEN_UNICODE)
|
||||
return s;
|
||||
|
||||
if (isPlainAscii(s)) {
|
||||
return s;
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder(s.length() + 10);
|
||||
|
||||
// Falsehoods programmers believe about the latin alphabet ;-)
|
||||
for (int i = 0; i < s.length(); ) {
|
||||
int c = s.codePointAt(i);
|
||||
i += Character.charCount(c);
|
||||
|
||||
if ("\u201C\u201D".indexOf(c) >= 0) {
|
||||
sb.append('"');
|
||||
}
|
||||
else if ("áâàȁăåäāǟãąą̊ḁẚⱥ".indexOf(c) >= 0) {
|
||||
sb.append('a');
|
||||
}
|
||||
else if ("ḃḅḇƀɓ".indexOf(c) >= 0) {
|
||||
sb.append('b');
|
||||
}
|
||||
else if ("ćĉčçḉċƈȼ".indexOf(c) >= 0) {
|
||||
sb.append('c');
|
||||
}
|
||||
else if ("ɗḓďḋḍḏḑđðɖḏ".indexOf(c) >= 0) {
|
||||
sb.append('d');
|
||||
}
|
||||
else if ("éêèȅěëēẽĕęėẹȇḕḗḙḛḝɇ".indexOf(c) >= 0) {
|
||||
sb.append('e');
|
||||
}
|
||||
else if ("ḟƒ".indexOf(c) >= 0) {
|
||||
sb.append('f');
|
||||
}
|
||||
else if ("ǵĝǧğġģɠḡǥ".indexOf(c) >= 0) {
|
||||
sb.append('g');
|
||||
}
|
||||
else if ("ĥȟḧḣḥẖḩḫħⱨ".indexOf(c) >= 0) {
|
||||
sb.append('g');
|
||||
}
|
||||
else if ("iıíîìȉïḯīĩįịḭ".indexOf(c) >= 0) {
|
||||
sb.append('i');
|
||||
}
|
||||
else if ("ĵǰɉ".indexOf(c) >= 0) {
|
||||
sb.append('j');
|
||||
}
|
||||
else if ("ḱǩķḳḵƙⱪ".indexOf(c) >= 0) {
|
||||
sb.append('k');
|
||||
}
|
||||
else if ("ĺłḽľļḷḹḻƚɫⱡ".indexOf(c) >= 0) {
|
||||
sb.append('l');
|
||||
}
|
||||
else if ("ḿṁṃ".indexOf(c) >= 0) {
|
||||
sb.append('m');
|
||||
}
|
||||
else if ("ŋńǹñṋňṅṇṉʼnn̈ņ".indexOf(c) >= 0) {
|
||||
sb.append('n');
|
||||
}
|
||||
else if ("óőôòȍŏȯȱöȫōṓṑõṍṏȭøǿǫǭọȏơ".indexOf(c) >= 0) {
|
||||
sb.append('o');
|
||||
}
|
||||
else if ("ṕṗƥᵽ".indexOf(c) >= 0) {
|
||||
sb.append('p');
|
||||
}
|
||||
else if ("ꝗ".indexOf(c) >= 0) {
|
||||
sb.append('q');
|
||||
}
|
||||
else if ("ŕȑřŗṙṛṝṟɍɽ".indexOf(c) >= 0) {
|
||||
sb.append('r');
|
||||
}
|
||||
else if ("śṥŝšṧşșṡṣṩ".indexOf(c) >= 0) {
|
||||
sb.append('s');
|
||||
}
|
||||
else if ("ťṱẗţțŧṫṭṯⱦ".indexOf(c) >= 0) {
|
||||
sb.append('t');
|
||||
}
|
||||
else if ("úùûŭưűüūṻųůũṹụṳṵṷʉ".indexOf(c) >= 0) {
|
||||
sb.append('u');
|
||||
}
|
||||
else if ("ṽṿʋỽ".indexOf(c) >= 0) {
|
||||
sb.append('v');
|
||||
}
|
||||
else if ("ẃŵẁẅẘẇẉⱳ".indexOf(c) >= 0) {
|
||||
sb.append('w');
|
||||
}
|
||||
else if ("x̂ẍẋ".indexOf(c) >= 0) {
|
||||
sb.append('x');
|
||||
}
|
||||
else if ("ƴýŷỳÿȳỹẙẏy̨ɏỿ".indexOf(c) >= 0) {
|
||||
sb.append('y');
|
||||
}
|
||||
else if ("źẑžżẓẕƶȥ".indexOf(c) >= 0) {
|
||||
sb.append('z');
|
||||
}
|
||||
else if ("Þþ".indexOf(c) >= 0) {
|
||||
sb.append("th");
|
||||
}
|
||||
else if ('ß' == c) {
|
||||
sb.append("ss");
|
||||
}
|
||||
else if (isAscii(c)) {
|
||||
sb.append((char) c);
|
||||
}
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static boolean isPlainAscii(String s) {
|
||||
for (int i = 0; i < s.length(); ) {
|
||||
int c = s.codePointAt(i);
|
||||
if (!isAscii(c))
|
||||
return false;
|
||||
i += Character.charCount(c);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private static boolean isAscii(int c) {
|
||||
return (c & ~0x7f) == 0;
|
||||
}
|
||||
|
||||
|
||||
}
|
@@ -1,6 +1,7 @@
|
||||
package nu.marginalia.language.model;
|
||||
|
||||
import nu.marginalia.language.WordPatterns;
|
||||
import nu.marginalia.language.encoding.UnicodeNormalization;
|
||||
import nu.marginalia.language.keywords.KeywordHasher;
|
||||
import nu.marginalia.language.pos.PosPattern;
|
||||
import nu.marginalia.language.pos.PosPatternCategory;
|
||||
@@ -16,6 +17,7 @@ public final class LanguageDefinition {
|
||||
private final String isoCode;
|
||||
private final String name;
|
||||
private final Stemmer stemmer;
|
||||
private final UnicodeNormalization unicodeNormalization;
|
||||
private final KeywordHasher keywordHasher;
|
||||
|
||||
@Nullable
|
||||
@@ -25,12 +27,14 @@ public final class LanguageDefinition {
|
||||
public LanguageDefinition(String isoCode,
|
||||
String name,
|
||||
Stemmer stemmer,
|
||||
UnicodeNormalization unicodeNormalization,
|
||||
KeywordHasher keywordHasher,
|
||||
@Nullable PosTagger posTagger,
|
||||
Map<PosPatternCategory, List<PosPattern>> posPatterns) {
|
||||
this.isoCode = isoCode;
|
||||
this.name = name;
|
||||
this.stemmer = stemmer;
|
||||
this.unicodeNormalization = unicodeNormalization;
|
||||
this.keywordHasher = keywordHasher;
|
||||
this.posTagger = posTagger;
|
||||
this.posPatterns = posPatterns;
|
||||
@@ -57,6 +61,10 @@ public final class LanguageDefinition {
|
||||
return keywordHasher;
|
||||
}
|
||||
|
||||
public UnicodeNormalization unicodeNormalization() {
|
||||
return unicodeNormalization;
|
||||
}
|
||||
|
||||
public long[] posTagSentence(String[] words) {
|
||||
if (posTagger == null) return new long[0];
|
||||
return posTagger.tagSentence(words);
|
@@ -129,7 +129,7 @@ public class SentenceExtractor {
|
||||
EnumSet<HtmlTag> htmlTags) {
|
||||
final Stemmer stemmer = language.stemmer();
|
||||
|
||||
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(text, MAX_SENTENCE_LENGTH);
|
||||
var wordsAndSeps = new SentenceSegmentSplitter(language).splitSegment(text, MAX_SENTENCE_LENGTH);
|
||||
|
||||
String[] words = wordsAndSeps.words();
|
||||
BitSet seps = wordsAndSeps.separators();
|
||||
@@ -218,11 +218,13 @@ public class SentenceExtractor {
|
||||
|
||||
List<DocumentSentence> ret = new ArrayList<>(sentences.length);
|
||||
|
||||
SentenceSegmentSplitter sentenceSegmentSplitter = new SentenceSegmentSplitter(language);
|
||||
|
||||
if (isNaturalLanguage) {
|
||||
// Natural language text; do POS tagging and stemming
|
||||
|
||||
for (String sent : sentences) {
|
||||
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
|
||||
var wordsAndSeps = sentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
|
||||
var tokens = wordsAndSeps.words();
|
||||
var separators = wordsAndSeps.separators();
|
||||
var posTags = language.posTagSentence(tokens);
|
||||
@@ -274,7 +276,7 @@ public class SentenceExtractor {
|
||||
// as this is not likely to be useful
|
||||
|
||||
for (String sent : sentences) {
|
||||
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
|
||||
var wordsAndSeps = sentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
|
||||
var tokens = wordsAndSeps.words();
|
||||
var separators = wordsAndSeps.separators();
|
||||
var posTags = new long[tokens.length];
|
@@ -2,7 +2,8 @@ package nu.marginalia.language.sentence;
|
||||
|
||||
import com.google.common.base.CharMatcher;
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
import nu.marginalia.language.encoding.AsciiFlattener;
|
||||
import nu.marginalia.language.encoding.UnicodeNormalization;
|
||||
import nu.marginalia.language.model.LanguageDefinition;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.BitSet;
|
||||
@@ -13,10 +14,11 @@ import static nu.marginalia.language.WordPatterns.MAX_WORD_LENGTH;
|
||||
|
||||
public class SentenceSegmentSplitter {
|
||||
|
||||
private final UnicodeNormalization unicodeNormalization;
|
||||
|
||||
public record SeparatedSentence(String[] words, BitSet separators) { }
|
||||
|
||||
private static final CharMatcher noiseCharacterMatcher = CharMatcher.anyOf("/*-");
|
||||
|
||||
private static final Pattern wordBreakPattern;
|
||||
|
||||
static {
|
||||
@@ -31,13 +33,17 @@ public class SentenceSegmentSplitter {
|
||||
}
|
||||
}
|
||||
|
||||
SentenceSegmentSplitter(LanguageDefinition languageDefinition) {
|
||||
this.unicodeNormalization = languageDefinition.unicodeNormalization();
|
||||
}
|
||||
|
||||
/** Split a sentence into words and separators.
|
||||
*
|
||||
* @param segment The sentence to split
|
||||
* @return A list of words and separators
|
||||
*/
|
||||
public static SeparatedSentence splitSegment(String segment, int maxLength) {
|
||||
String flatSegment = AsciiFlattener.flattenUnicode(segment);
|
||||
public SeparatedSentence splitSegment(String segment, int maxLength) {
|
||||
String flatSegment = unicodeNormalization.flattenUnicode(segment);
|
||||
|
||||
var matcher = wordBreakPattern.matcher(flatSegment);
|
||||
|
@@ -1,7 +1,6 @@
|
||||
package nu.marginalia.segmentation;
|
||||
|
||||
import it.unimi.dsi.fastutil.longs.*;
|
||||
import nu.marginalia.util.SimpleBlockingThreadPool;
|
||||
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.openzim.ZIMTypes.ZIMFile;
|
||||
@@ -11,7 +10,7 @@ import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.ForkJoinPool;
|
||||
|
||||
public class NgramExtractorMain {
|
||||
public static void main(String... args) throws IOException, InterruptedException {
|
||||
@@ -112,50 +111,45 @@ public class NgramExtractorMain {
|
||||
|
||||
var orderedHasher = HasherGroup.ordered();
|
||||
|
||||
var pool = new SimpleBlockingThreadPool("ngram-extractor",
|
||||
Math.clamp(2, Runtime.getRuntime().availableProcessors(), 32),
|
||||
32
|
||||
);
|
||||
try (var pool = new ForkJoinPool(Math.clamp(2, Runtime.getRuntime().availableProcessors(), 32))) {
|
||||
|
||||
reader.forEachTitles((title) -> {
|
||||
pool.submitQuietly(() -> {
|
||||
LongArrayList orderedHashesTitle = new LongArrayList();
|
||||
reader.forEachTitles((title) -> {
|
||||
pool.submit(() -> {
|
||||
LongArrayList orderedHashesTitle = new LongArrayList();
|
||||
|
||||
String normalizedTitle = title.replace('_', ' ');
|
||||
String normalizedTitle = title.replace('_', ' ');
|
||||
|
||||
for (var sent : getNgramTitleTerms(normalizedTitle)) {
|
||||
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
|
||||
orderedHashesTitle.add(orderedHasher.rollingHash(terms));
|
||||
}
|
||||
synchronized (lexicon) {
|
||||
for (var hash : orderedHashesTitle) {
|
||||
lexicon.incOrderedTitle(hash);
|
||||
for (var sent : getNgramTitleTerms(normalizedTitle)) {
|
||||
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
|
||||
orderedHashesTitle.add(orderedHasher.rollingHash(terms));
|
||||
}
|
||||
}
|
||||
synchronized (lexicon) {
|
||||
for (var hash : orderedHashesTitle) {
|
||||
lexicon.incOrderedTitle(hash);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
});
|
||||
|
||||
});
|
||||
reader.forEachArticles((title, body) -> {
|
||||
pool.submit(() -> {
|
||||
LongArrayList orderedHashesBody = new LongArrayList();
|
||||
|
||||
reader.forEachArticles((title, body) -> {
|
||||
pool.submitQuietly(() -> {
|
||||
LongArrayList orderedHashesBody = new LongArrayList();
|
||||
|
||||
for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
|
||||
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
|
||||
orderedHashesBody.add(orderedHasher.rollingHash(terms));
|
||||
}
|
||||
|
||||
synchronized (lexicon) {
|
||||
for (var hash : orderedHashesBody) {
|
||||
lexicon.incOrderedBody(hash);
|
||||
for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
|
||||
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
|
||||
orderedHashesBody.add(orderedHasher.rollingHash(terms));
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
}, p -> true);
|
||||
synchronized (lexicon) {
|
||||
for (var hash : orderedHashesBody) {
|
||||
lexicon.incOrderedBody(hash);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
pool.shutDown();
|
||||
pool.awaitTermination(10, TimeUnit.DAYS);
|
||||
}, p -> true);
|
||||
}
|
||||
|
||||
lexicon.saveCounts(countsOutputFile);
|
||||
}
|
@@ -5,16 +5,19 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.lang.foreign.Arena;
|
||||
import java.lang.foreign.MemorySegment;
|
||||
import java.lang.foreign.ValueLayout;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
|
||||
/** Dictionary with term frequency information for (stemmed) words.
|
||||
*
|
||||
@@ -38,15 +41,23 @@ public class TermFrequencyDict {
|
||||
}
|
||||
|
||||
private static Long2IntOpenHashMap load(Path file) throws IOException {
|
||||
try (LongArray array = LongArrayFactory.mmapForReadingConfined(file)) {
|
||||
try (Arena arena = Arena.ofConfined();
|
||||
FileChannel fileChannel = (FileChannel) Files.newByteChannel(file, StandardOpenOption.READ)) {
|
||||
|
||||
int size = (int) Files.size(file) / 16;
|
||||
long fileSizeBytes = Files.size(file);
|
||||
MemorySegment mappedFile = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileSizeBytes, arena);
|
||||
|
||||
int size = (int) fileSizeBytes / 16;
|
||||
var ret = new Long2IntOpenHashMap(size, 0.5f);
|
||||
|
||||
ret.defaultReturnValue(0);
|
||||
|
||||
for (int i = 0; i < size; i++) {
|
||||
ret.put(array.get(2 * i), (int) array.get(2 * i + 1));
|
||||
|
||||
long key = mappedFile.getAtIndex(ValueLayout.JAVA_LONG, 2 * i);
|
||||
long val = mappedFile.getAtIndex(ValueLayout.JAVA_LONG, 2 * i + 1);
|
||||
|
||||
ret.put(key, (int) val);
|
||||
}
|
||||
|
||||
return ret;
|
31
code/functions/language-processing/readme.md
Normal file
31
code/functions/language-processing/readme.md
Normal file
@@ -0,0 +1,31 @@
|
||||
# Language Processing
|
||||
|
||||
This function gathers various tools used in language processing,
|
||||
keyword extraction, and so on.
|
||||
|
||||
## Language Configuration
|
||||
|
||||
The files [resources/languages-default.xml](resources/languages-default.xml) and [resources/languages-experimental.xml](resources/languages-experimental.xml) hold the laguage definitions used by the search engine,
|
||||
the former is used in production and the latter in most tests that require language processing.
|
||||
|
||||
The search engine excludes any languages not configured in these files, though it is relatively easy to define a stub
|
||||
configuration that gets a simpler behavior out of the search engine.
|
||||
|
||||
## Language Processing Tool
|
||||
|
||||
It also houses a tool for inspecting the output of keyword extraction,
|
||||
which can be accessed by running the command below from the root of the project.
|
||||
The tool becomes accessible on port 8080.
|
||||
|
||||
```bash
|
||||
$ ./gradlew :code:functions:language-processing:run
|
||||
```
|
||||
|
||||
## Central Classes
|
||||
|
||||
* [SentenceExtractor](java/nu/marginalia/language/sentence/SentenceExtractor.java) -
|
||||
Creates a [DocumentLanguageData](java/nu/marginalia/language/model/DocumentLanguageData.java) from a text, containing
|
||||
its words, how they stem, POS tags, and so on.
|
||||
* [LanguageConfiguration](java/nu/marginalia/language/config/LanguageConfiguration.java) - parses langauge configuration xml files into LanguageDefinition objects
|
||||
* [LanguageDefinition](java/nu/marginalia/language/model/LanguageDefinition.java) - holds all per-language cusotmizations that are fed into the language processing pipeline
|
||||
* [DocumentKeywordExtractor](java/nu/marginalia/keyword/DocumentKeywordExtractor.java) - extracts keywords from documents
|
@@ -0,0 +1,109 @@
|
||||
<?xml version="1.0"?>
|
||||
<!DOCTYPE languages [
|
||||
<!ELEMENT languages (language*,resource*)>
|
||||
<!ELEMENT language (keywordHash,unicodeNormalization,stemmer,sentenceDetector,rdrTagger?,ngrams*)>
|
||||
|
||||
<!ELEMENT resource EMPTY>
|
||||
<!ATTLIST resource
|
||||
id ID #REQUIRED
|
||||
md5 CDATA #REQUIRED
|
||||
path CDATA #REQUIRED
|
||||
href CDATA #REQUIRED
|
||||
>
|
||||
|
||||
<!ATTLIST language
|
||||
isoCode ID #REQUIRED
|
||||
name CDATA #REQUIRED
|
||||
display (rtl|ltr) #REQUIRED
|
||||
disabled (true|false) "false"
|
||||
>
|
||||
|
||||
<!ELEMENT unicodeNormalization EMPTY>
|
||||
<!ATTLIST unicodeNormalization
|
||||
algorithm (minimal|e-accents|german|maximal-latin) #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT stemmer (pospattern?)>
|
||||
<!ATTLIST stemmer
|
||||
algorithm (porter|snowball|none) #REQUIRED
|
||||
variant CDATA #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT keywordHash (#PCDATA)>
|
||||
<!ATTLIST keywordHash
|
||||
algorithm (asciish|utf8) #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT rdrTagger EMPTY>
|
||||
<!ATTLIST rdrTagger
|
||||
dictId IDREF #REQUIRED
|
||||
rdrId IDREF #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT ngrams (pospattern*)>
|
||||
<!ATTLIST ngrams type (noun|name|subject-suffix|title|keyword) #REQUIRED>
|
||||
|
||||
<!ELEMENT pospattern (#PCDATA)>
|
||||
|
||||
<!ELEMENT sentenceDetector EMPTY>
|
||||
<!ATTLIST sentenceDetector algorithm (none|opennlp) #REQUIRED>
|
||||
]>
|
||||
|
||||
<languages>
|
||||
<language isoCode="en" name="English" display="ltr">
|
||||
<keywordHash algorithm="asciish" />
|
||||
<stemmer algorithm="porter">
|
||||
<pospattern>!(IN TO CC DT)</pospattern>
|
||||
</stemmer>
|
||||
<sentenceDetector algorithm="opennlp"/>
|
||||
<unicodeNormalization algorithm="maximal-latin" />
|
||||
<rdrTagger dictId="pos-dict-en" rdrId="pos-rdr-en" />
|
||||
<ngrams type="name">
|
||||
<pospattern>NNP*</pospattern>
|
||||
<pospattern>NNP* NNP*</pospattern>
|
||||
<pospattern>NNP* (NNP* IN DT CC) NNP*</pospattern>
|
||||
<pospattern>NNP* (NNP* IN DT CC) (NNP* IN DT CC) NNP*</pospattern>
|
||||
</ngrams>
|
||||
<ngrams type="noun">
|
||||
<pospattern>VBG</pospattern>
|
||||
<pospattern>RB VBG</pospattern>
|
||||
<pospattern>(NNP* JJ)</pospattern>
|
||||
<pospattern>(NN* JJ) NN*</pospattern>
|
||||
<pospattern>(NN* JJ) (NN* JJ) NN*</pospattern>
|
||||
<pospattern>(NN* JJ) (NN* JJ) (NN* JJ) NN*</pospattern>
|
||||
<pospattern>(NNP* JJ) (NNP* IN TO CC) NNP*</pospattern>
|
||||
<pospattern>(NNP* JJ) (NNP* IN TO CC) DT NNP*</pospattern>
|
||||
<pospattern>(NNP* JJ) (NNP* IN TO CC) (NNP* IN TO CC) NNP*</pospattern>
|
||||
</ngrams>
|
||||
<ngrams type="subject-suffix">
|
||||
<pospattern>(VBD VBZ)</pospattern>
|
||||
<pospattern>MD VB</pospattern>
|
||||
<pospattern>VBZ DT</pospattern>
|
||||
<pospattern>(DT RB VBD VBP VBN JJ*) (VBD VBG VBP VBN VBZ)</pospattern>
|
||||
</ngrams>
|
||||
<ngrams type="title">
|
||||
<pospattern>!(CC IN DT TO)</pospattern>
|
||||
<pospattern>!CC !(IN DT TO)</pospattern>
|
||||
<pospattern>!CC * !(IN DT TO)</pospattern>
|
||||
<pospattern>!CC * * !(IN DT TO)</pospattern>
|
||||
</ngrams>
|
||||
<ngrams type="keyword">
|
||||
<!-- length = 1 -->
|
||||
<pospattern>(N* VBG VBN JJ* R* VBG)</pospattern>
|
||||
<!-- length = 2 -->
|
||||
<pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
|
||||
<pospattern>(N* VBG VBN) CD</pospattern>
|
||||
<!-- length = 3 -->
|
||||
<pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
|
||||
<pospattern>NNP* (IN TO CC NNP*) (N* VBG VBN)</pospattern>
|
||||
<pospattern>(N* VBG VBN) (N* VBG VBN) CD</pospattern>
|
||||
<!-- length = 4 -->
|
||||
<pospattern>(N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN JJ* R* VBG) (N* VBG VBN)</pospattern>
|
||||
<pospattern>NNP* (DT IN TO CC) (IN TO CC) NNP*</pospattern>
|
||||
</ngrams>
|
||||
</language>
|
||||
|
||||
<resource id="pos-dict-en" md5="" path="rdr/English.DICT" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.DICT" />
|
||||
<resource id="pos-rdr-en" md5="" path="rdr/English.RDR" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.RDR" />
|
||||
|
||||
</languages>
|
@@ -1,7 +1,7 @@
|
||||
<?xml version="1.0"?>
|
||||
<!DOCTYPE languages [
|
||||
<!ELEMENT languages (language*,resource*)>
|
||||
<!ELEMENT language (keywordHash,stemmer,sentenceDetector,rdrTagger?,ngrams*)>
|
||||
<!ELEMENT language (keywordHash,unicodeNormalization,stemmer,sentenceDetector,rdrTagger?,ngrams*)>
|
||||
|
||||
<!ELEMENT resource EMPTY>
|
||||
<!ATTLIST resource
|
||||
@@ -18,6 +18,11 @@
|
||||
disabled (true|false) "false"
|
||||
>
|
||||
|
||||
<!ELEMENT unicodeNormalization EMPTY>
|
||||
<!ATTLIST unicodeNormalization
|
||||
algorithm (minimal|e-accents|german|maximal-latin) #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT stemmer (pospattern?)>
|
||||
<!ATTLIST stemmer
|
||||
algorithm (porter|snowball|none) #REQUIRED
|
||||
@@ -37,6 +42,7 @@
|
||||
|
||||
<!ELEMENT ngrams (pospattern*)>
|
||||
<!ATTLIST ngrams type (noun|name|subject-suffix|title|keyword) #REQUIRED>
|
||||
|
||||
<!ELEMENT pospattern (#PCDATA)>
|
||||
|
||||
<!ELEMENT sentenceDetector EMPTY>
|
||||
@@ -44,18 +50,13 @@
|
||||
]>
|
||||
|
||||
<languages>
|
||||
<language isoCode="xx" name="Undefined" display="ltr" >
|
||||
<keywordHash algorithm="asciish" />
|
||||
<stemmer algorithm="none" />
|
||||
<sentenceDetector algorithm="none"/>
|
||||
<rdrTagger dictId="pos-dict-en" rdrId="pos-rdr-en" />
|
||||
</language>
|
||||
<language isoCode="en" name="English" display="ltr">
|
||||
<keywordHash algorithm="asciish" />
|
||||
<stemmer algorithm="porter">
|
||||
<pospattern>!(IN TO CC DT)</pospattern>
|
||||
</stemmer>
|
||||
<sentenceDetector algorithm="opennlp"/>
|
||||
<unicodeNormalization algorithm="maximal-latin" />
|
||||
<rdrTagger dictId="pos-dict-en" rdrId="pos-rdr-en" />
|
||||
<ngrams type="name">
|
||||
<pospattern>NNP*</pospattern>
|
||||
@@ -106,6 +107,7 @@
|
||||
<keywordHash algorithm="asciish" />
|
||||
<stemmer algorithm="snowball" variant="SWEDISH" />
|
||||
<sentenceDetector algorithm="opennlp"/>
|
||||
<unicodeNormalization algorithm="e-accents" />
|
||||
<rdrTagger dictId="pos-dict-sv" rdrId="pos-rdr-sv" />
|
||||
<ngrams type="name">
|
||||
<pospattern>PROPN</pospattern>
|
||||
@@ -119,6 +121,12 @@
|
||||
<stemmer algorithm="snowball" variant="FRENCH" />
|
||||
<sentenceDetector algorithm="opennlp"/>
|
||||
</language>
|
||||
<language isoCode="de" name="German" display="ltr">
|
||||
<keywordHash algorithm="asciish" />
|
||||
<stemmer algorithm="snowball" variant="GERMAN" />
|
||||
<sentenceDetector algorithm="opennlp"/>
|
||||
<unicodeNormalization algorithm="german" />
|
||||
</language>
|
||||
<resource id="pos-dict-en" md5="" path="rdr/English.DICT" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.DICT" />
|
||||
<resource id="pos-rdr-en" md5="" path="rdr/English.RDR" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/POS/English.RDR" />
|
||||
<resource id="pos-dict-sv" md5="" path="rdr/Swedish.DICT" href="https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/refs/heads/master/Models/ud-treebanks-v2.4/UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu.UPOS.DICT" />
|
@@ -1,7 +1,9 @@
|
||||
@import it.unimi.dsi.fastutil.ints.IntList
|
||||
@import nu.marginalia.language.model.WordRep
|
||||
@import nu.marginalia.language.model.DocumentSentence
|
||||
@import nu.marginalia.language.model.LanguageDefinition
|
||||
@import java.util.*
|
||||
@import java.util.stream.Collectors
|
||||
@import java.util.stream.IntStream
|
||||
|
||||
@param String textSample
|
||||
@@ -13,6 +15,8 @@
|
||||
@param Collection<WordRep> nameLikeReps
|
||||
@param Collection<WordRep> subjectLikeReps
|
||||
@param Collection<String> artifacts
|
||||
@param Collection<String> importantWords
|
||||
@param Map<String, IntList> positionedWords
|
||||
|
||||
|
||||
<!DOCTYPE html>
|
||||
@@ -209,7 +213,7 @@
|
||||
<div>
|
||||
<h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
|
||||
<i class="fas fa-star text-yellow-500 mr-2"></i>
|
||||
Title
|
||||
Artifacts
|
||||
</h3>
|
||||
<div class="space-y-2">
|
||||
@for (String word : artifacts)
|
||||
@@ -220,9 +224,53 @@
|
||||
</div>
|
||||
</div>
|
||||
@endif
|
||||
|
||||
@if (importantWords != null && !importantWords.isEmpty())
|
||||
<div>
|
||||
<h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
|
||||
<i class="fas fa-star text-yellow-500 mr-2"></i>
|
||||
Important Words
|
||||
</h3>
|
||||
<div class="space-y-2">
|
||||
@for (String word : importantWords)
|
||||
<div class="flex justify-between items-center p-2 bg-gray-50 rounded">
|
||||
<span class="text-sm font-medium">${word}</span>
|
||||
</div>
|
||||
@endfor
|
||||
</div>
|
||||
</div>
|
||||
@endif
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<!-- Full simulation outcome from keyword extraction -->
|
||||
<div class="bg-white rounded-lg shadow-sm border border-gray-200">
|
||||
<div class="p-4 border-b border-gray-200">
|
||||
<h2 class="text-lg font-semibold text-gray-900">
|
||||
<i class="fas fa-list-ol text-purple-600 mr-2"></i>
|
||||
Outcome
|
||||
</h2>
|
||||
</div>
|
||||
<div class="p-4">
|
||||
@if (positionedWords != null && !positionedWords.isEmpty())
|
||||
<div>
|
||||
<h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
|
||||
<i class="fas fa-star text-yellow-500 mr-2"></i>
|
||||
Positioned Words
|
||||
</h3>
|
||||
<div class="space-y-2">
|
||||
@for (String word : positionedWords.keySet())
|
||||
<div class="flex justify-between items-center p-2 bg-gray-50 rounded">
|
||||
<span class="text-sm font-medium">${word}</span>
|
||||
<span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">${positionedWords.get(word).stream().map(Object::toString).collect(Collectors.joining(", "))}</span>
|
||||
</div>
|
||||
@endfor
|
||||
</div>
|
||||
</div>
|
||||
@endif
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
@@ -2,6 +2,7 @@ package nu.marginalia.keyword;
|
||||
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.dom.DomPruningFilter;
|
||||
import nu.marginalia.language.config.LanguageConfigLocation;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.UnsupportedLanguageException;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
@@ -31,7 +32,7 @@ class DocumentKeywordExtractorTest {
|
||||
|
||||
@BeforeAll
|
||||
public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
|
||||
se = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels()), WmsaHome.getLanguageModels());
|
||||
se = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental()), WmsaHome.getLanguageModels());
|
||||
}
|
||||
|
||||
@Test
|
@@ -6,6 +6,7 @@ import it.unimi.dsi.fastutil.ints.IntList;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
import nu.marginalia.keyword.model.DocumentWordSpan;
|
||||
import nu.marginalia.language.config.LanguageConfigLocation;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.model.DocumentSentence;
|
||||
@@ -34,7 +35,7 @@ class DocumentPositionMapperTest {
|
||||
|
||||
@BeforeAll
|
||||
public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
|
||||
var config = new LanguageConfiguration(WmsaHome.getLanguageModels());
|
||||
var config = new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental());
|
||||
se = new SentenceExtractor(config, WmsaHome.getLanguageModels());
|
||||
english = config.getLanguage("en");
|
||||
}
|
@@ -2,6 +2,7 @@ package nu.marginalia.keyword;
|
||||
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.language.config.LanguageConfigLocation;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.LanguageDefinition;
|
||||
import nu.marginalia.language.model.UnsupportedLanguageException;
|
||||
@@ -35,7 +36,7 @@ class SentenceExtractorTest {
|
||||
|
||||
@BeforeAll
|
||||
public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
|
||||
var config = new LanguageConfiguration(WmsaHome.getLanguageModels());
|
||||
var config = new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental());
|
||||
se = new SentenceExtractor(config, WmsaHome.getLanguageModels());
|
||||
english = config.getLanguage("en");
|
||||
|
@@ -1,5 +1,6 @@
|
||||
package nu.marginalia.keyword.extractors;
|
||||
|
||||
import nu.marginalia.language.config.LanguageConfigLocation;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.util.TestLanguageModels;
|
||||
@@ -15,7 +16,7 @@ class ArtifactKeywordsTest {
|
||||
|
||||
@Test
|
||||
public void testExtractArtifacts() throws IOException, ParserConfigurationException, SAXException {
|
||||
SentenceExtractor se = new SentenceExtractor(new LanguageConfiguration(TestLanguageModels.getLanguageModels()), TestLanguageModels.getLanguageModels());
|
||||
SentenceExtractor se = new SentenceExtractor(new LanguageConfiguration(TestLanguageModels.getLanguageModels(), new LanguageConfigLocation.Experimental()), TestLanguageModels.getLanguageModels());
|
||||
|
||||
var artifacts = new ArtifactKeywords(se.extractSentences("Hello I'm <vlofgren@marginalia.nu>, what's up?", "hello!"));
|
||||
System.out.println(artifacts.getWords());
|
@@ -3,6 +3,7 @@ package nu.marginalia.keyword.extractors;
|
||||
import com.google.common.collect.Sets;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.dom.DomPruningFilter;
|
||||
import nu.marginalia.language.config.LanguageConfigLocation;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.LanguageDefinition;
|
||||
import nu.marginalia.language.model.UnsupportedLanguageException;
|
||||
@@ -55,8 +56,8 @@ class NameLikeKeywordsTest {
|
||||
|
||||
@BeforeAll
|
||||
public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
|
||||
se = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels()), WmsaHome.getLanguageModels());
|
||||
lc = new LanguageConfiguration(WmsaHome.getLanguageModels());
|
||||
se = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental()), WmsaHome.getLanguageModels());
|
||||
lc = new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental());
|
||||
en = lc.getLanguage("en");
|
||||
}
|
||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.keyword.extractors;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.language.config.LanguageConfigLocation;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
@@ -49,7 +50,7 @@ class SubjectLikeKeywordsTest {
|
||||
|
||||
@BeforeAll
|
||||
public static void setUpAll() throws IOException, ParserConfigurationException, SAXException {
|
||||
se = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels()), WmsaHome.getLanguageModels());
|
||||
se = new SentenceExtractor(new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental()), WmsaHome.getLanguageModels());
|
||||
}
|
||||
|
||||
@Test
|
@@ -1,6 +1,7 @@
|
||||
package nu.marginalia.keyword.extractors;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import nu.marginalia.language.config.LanguageConfigLocation;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.UnsupportedLanguageException;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
@@ -192,7 +193,7 @@ class TitleKeywordsTest {
|
||||
|
||||
@Test
|
||||
public void extractTitleWords() throws IOException, ParserConfigurationException, SAXException, UnsupportedLanguageException {
|
||||
var languageConfiguration = new LanguageConfiguration(TestLanguageModels.getLanguageModels());
|
||||
var languageConfiguration = new LanguageConfiguration(TestLanguageModels.getLanguageModels(), new LanguageConfigLocation.Experimental());
|
||||
var se = new SentenceExtractor(languageConfiguration, TestLanguageModels.getLanguageModels());
|
||||
|
||||
var dld = se.extractSentences(Jsoup.parse(document));
|
@@ -13,19 +13,18 @@ import java.io.IOException;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
public class LanguageConfigurationTest {
|
||||
public class LanguageConfigurationTestFile {
|
||||
private static LanguageConfiguration languageConfiguration;
|
||||
|
||||
@BeforeAll
|
||||
public static void setUpAll() throws IOException, SAXException, ParserConfigurationException {
|
||||
languageConfiguration = new LanguageConfiguration(TestLanguageModels.getLanguageModels());
|
||||
languageConfiguration = new LanguageConfiguration(TestLanguageModels.getLanguageModels(), new LanguageConfigLocation.Experimental());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testBasic() {
|
||||
Assertions.assertNotNull(languageConfiguration.getLanguage("en"));
|
||||
Assertions.assertNotNull(languageConfiguration.getLanguage("sv"));
|
||||
Assertions.assertNotNull(languageConfiguration.getLanguage("xx"));
|
||||
Assertions.assertNull(languageConfiguration.getLanguage("!!"));
|
||||
}
|
||||
|
@@ -0,0 +1,41 @@
|
||||
package nu.marginalia.language.encoding;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertSame;
|
||||
|
||||
class UnicodeNormalizationTest {
|
||||
|
||||
UnicodeNormalization unicodeNormalization = new UnicodeNormalization.FlattenAllLatin();
|
||||
|
||||
@Test
|
||||
void flattenUnicodePlainAscii() {
|
||||
String s = "abc";
|
||||
|
||||
// If the string is ascii, we don't want to allocate a copy
|
||||
|
||||
assertSame(s, unicodeNormalization.flattenUnicode(s));
|
||||
}
|
||||
|
||||
@Test
|
||||
void flattenUnicode() {
|
||||
String s = "Stülpnagelstraße";
|
||||
|
||||
assertEquals("Stulpnagelstrasse", unicodeNormalization.flattenUnicode(s));
|
||||
}
|
||||
|
||||
@Test
|
||||
void flattenUnicode2() {
|
||||
String s = "Koncevičius";
|
||||
|
||||
assertEquals("Koncevicius", unicodeNormalization.flattenUnicode(s));
|
||||
}
|
||||
|
||||
@Test
|
||||
void omitNonFlattenable() {
|
||||
String s = "[アグレッシブ烈子]";
|
||||
|
||||
assertEquals("[]", unicodeNormalization.flattenUnicode(s));
|
||||
}
|
||||
}
|
@@ -1,6 +1,7 @@
|
||||
package nu.marginalia.language.sentence;
|
||||
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.language.config.LanguageConfigLocation;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.UnsupportedLanguageException;
|
||||
import nu.marginalia.language.sentence.tag.HtmlTag;
|
||||
@@ -11,6 +12,7 @@ import org.xml.sax.SAXException;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.EnumSet;
|
||||
import java.util.Objects;
|
||||
|
||||
@@ -23,7 +25,7 @@ class SentenceExtractorTest {
|
||||
|
||||
@BeforeAll
|
||||
public static void setUp() throws IOException, ParserConfigurationException, SAXException {
|
||||
languageConfig = new LanguageConfiguration(WmsaHome.getLanguageModels());
|
||||
languageConfig = new LanguageConfiguration(WmsaHome.getLanguageModels(), new LanguageConfigLocation.Experimental());
|
||||
sentenceExtractor = new SentenceExtractor(languageConfig, WmsaHome.getLanguageModels());
|
||||
}
|
||||
|
||||
@@ -60,7 +62,8 @@ class SentenceExtractorTest {
|
||||
void testJava() {
|
||||
var dld = sentenceExtractor.extractSentence(languageConfig.getLanguage("en"), "Foreign Function & Memory API", EnumSet.noneOf(HtmlTag.class));
|
||||
|
||||
assertEquals(4, dld.wordsLowerCase.length);
|
||||
System.out.println(Arrays.toString(dld.wordsLowerCase));
|
||||
|
||||
assertArrayEquals(new String[] {"foreign", "function", "memory", "api"}, dld.wordsLowerCase);
|
||||
}
|
||||
|
@@ -3,8 +3,10 @@ package nu.marginalia.domsample;
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import jakarta.inject.Named;
|
||||
import nu.marginalia.coordination.DomainCoordinator;
|
||||
import nu.marginalia.domsample.db.DomSampleDb;
|
||||
import nu.marginalia.livecapture.BrowserlessClient;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
@@ -15,24 +17,32 @@ import java.net.URISyntaxException;
|
||||
import java.time.Duration;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class DomSampleService {
|
||||
private final DomSampleDb db;
|
||||
private final HikariDataSource mariadbDataSource;
|
||||
private final int sampleThreads;
|
||||
private final DomainCoordinator domainCoordinator;
|
||||
private final URI browserlessURI;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(DomSampleService.class);
|
||||
private final ArrayBlockingQueue<EdgeDomain> samplingQueue = new ArrayBlockingQueue<>(4);
|
||||
|
||||
@Inject
|
||||
public DomSampleService(DomSampleDb db,
|
||||
HikariDataSource mariadbDataSource,
|
||||
@Named("browserless-uri") String browserlessAddress,
|
||||
@Named("browserless-sample-threads") int sampleThreads,
|
||||
DomainCoordinator domainCoordinator,
|
||||
ServiceConfiguration serviceConfiguration)
|
||||
throws URISyntaxException
|
||||
{
|
||||
this.db = db;
|
||||
this.mariadbDataSource = mariadbDataSource;
|
||||
this.sampleThreads = sampleThreads;
|
||||
this.domainCoordinator = domainCoordinator;
|
||||
|
||||
if (StringUtils.isEmpty(browserlessAddress) || serviceConfiguration.node() > 1) {
|
||||
logger.warn("Live capture service will not run");
|
||||
@@ -40,6 +50,7 @@ public class DomSampleService {
|
||||
}
|
||||
else {
|
||||
browserlessURI = new URI(browserlessAddress);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -49,7 +60,10 @@ public class DomSampleService {
|
||||
return;
|
||||
}
|
||||
|
||||
Thread.ofPlatform().daemon().start(this::run);
|
||||
Thread.ofPlatform().daemon().start(this::mainThread);
|
||||
for (int i = 0; i < sampleThreads; i++) {
|
||||
Thread.ofPlatform().daemon().start(this::samplingThread);
|
||||
}
|
||||
}
|
||||
|
||||
public void syncDomains() {
|
||||
@@ -61,7 +75,11 @@ public class DomSampleService {
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT DOMAIN_NAME
|
||||
FROM EC_DOMAIN
|
||||
INNER JOIN DOMAIN_AVAILABILITY_INFORMATION
|
||||
ON EC_DOMAIN.ID=DOMAIN_ID
|
||||
WHERE NODE_AFFINITY>0
|
||||
AND BACKOFF_CONSECUTIVE_FAILURES<15
|
||||
AND HTTP_SCHEMA='HTTPS'
|
||||
""")
|
||||
) {
|
||||
var rs = stmt.executeQuery();
|
||||
@@ -79,7 +97,7 @@ public class DomSampleService {
|
||||
logger.info("Synced domains to sqlite");
|
||||
}
|
||||
|
||||
public void run() {
|
||||
public void mainThread() {
|
||||
|
||||
try (var client = new BrowserlessClient(browserlessURI)) {
|
||||
|
||||
@@ -92,8 +110,8 @@ public class DomSampleService {
|
||||
syncDomains();
|
||||
var domains = db.getScheduledDomains();
|
||||
|
||||
for (var domain : domains) {
|
||||
updateDomain(client, domain);
|
||||
for (String domain : domains) {
|
||||
samplingQueue.put(new EdgeDomain(domain));
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
@@ -103,7 +121,26 @@ public class DomSampleService {
|
||||
logger.error("Error in DomSampleService run loop", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void samplingThread() {
|
||||
try (var client = new BrowserlessClient(browserlessURI)) {
|
||||
while (!Thread.currentThread().isInterrupted()) {
|
||||
try {
|
||||
EdgeDomain domain = samplingQueue.take();
|
||||
try (var lock = domainCoordinator.lockDomain(domain)) {
|
||||
updateDomain(client, domain.toString());
|
||||
} catch (Exception e) {
|
||||
logger.error("Error in DomSampleService run loop", e);
|
||||
}
|
||||
}
|
||||
catch (InterruptedException ex) {
|
||||
Thread.currentThread().interrupt();
|
||||
logger.info("DomSampleService interrupted, stopping...");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -41,7 +41,7 @@ public class BrowserlessClient implements AutoCloseable {
|
||||
public Optional<String> content(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
|
||||
Map<String, Object> requestData = Map.of(
|
||||
"url", url,
|
||||
"userAgent", userAgent,
|
||||
"userAgent", Map.of("userAgent", userAgent),
|
||||
"gotoOptions", gotoOptions
|
||||
);
|
||||
|
||||
@@ -69,7 +69,7 @@ public class BrowserlessClient implements AutoCloseable {
|
||||
public Optional<String> annotatedContent(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
|
||||
Map<String, Object> requestData = Map.of(
|
||||
"url", url,
|
||||
"userAgent", userAgent,
|
||||
"userAgent", Map.of("userAgent", userAgent),
|
||||
"gotoOptions", gotoOptions,
|
||||
"waitForSelector", Map.of("selector", "#marginaliahack", "timeout", 15000)
|
||||
);
|
||||
@@ -104,7 +104,7 @@ public class BrowserlessClient implements AutoCloseable {
|
||||
|
||||
Map<String, Object> requestData = Map.of(
|
||||
"url", url,
|
||||
"userAgent", userAgent,
|
||||
"userAgent", Map.of("userAgent", userAgent),
|
||||
"options", screenshotOptions,
|
||||
"gotoOptions", gotoOptions
|
||||
);
|
||||
|
@@ -6,6 +6,7 @@ import io.grpc.stub.StreamObserver;
|
||||
import jakarta.inject.Named;
|
||||
import nu.marginalia.api.livecapture.Empty;
|
||||
import nu.marginalia.api.livecapture.LiveCaptureApiGrpc;
|
||||
import nu.marginalia.coordination.DomainCoordinator;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import nu.marginalia.service.server.DiscoverableService;
|
||||
@@ -33,6 +34,7 @@ public class LiveCaptureGrpcService
|
||||
private final boolean serviceEnabled;
|
||||
private final LinkedBlockingQueue<ScheduledScreenshot> requestedScreenshots = new LinkedBlockingQueue<>(128);
|
||||
private final HikariDataSource dataSource;
|
||||
private final DomainCoordinator domainCoordinator;
|
||||
|
||||
record ScheduledScreenshot(int domainId) {}
|
||||
|
||||
@@ -46,9 +48,11 @@ public class LiveCaptureGrpcService
|
||||
public LiveCaptureGrpcService(HikariDataSource dataSource,
|
||||
@Named("browserless-uri") String browserlessAddress,
|
||||
@Named("browserless-agent-threads") int threads,
|
||||
DomainCoordinator domainCoordinator,
|
||||
ServiceConfiguration serviceConfiguration
|
||||
) throws URISyntaxException {
|
||||
this.dataSource = dataSource;
|
||||
this.domainCoordinator = domainCoordinator;
|
||||
|
||||
if (StringUtils.isEmpty(browserlessAddress) || serviceConfiguration.node() > 1) {
|
||||
logger.warn("Live capture service will not run");
|
||||
@@ -163,7 +167,7 @@ public class LiveCaptureGrpcService
|
||||
}
|
||||
|
||||
private void grab(BrowserlessClient client, Connection conn, EdgeDomain domain) {
|
||||
try {
|
||||
try (var lock = domainCoordinator.lockDomain(domain)) {
|
||||
logger.info("Capturing {}", domain);
|
||||
|
||||
byte[] pngBytes = client.screenshot(domain.toRootUrlHttps().toString(),
|
||||
|
@@ -11,5 +11,8 @@ public class LivecaptureModule extends AbstractModule {
|
||||
bind(Integer.class)
|
||||
.annotatedWith(Names.named("browserless-agent-threads"))
|
||||
.toInstance(Integer.parseInt(System.getProperty("live-capture.browserless-agent-threads", "4")));
|
||||
bind(Integer.class)
|
||||
.annotatedWith(Names.named("browserless-sample-threads"))
|
||||
.toInstance(Integer.parseInt(System.getProperty("live-capture.browserless-sample-threads", "4")));
|
||||
}
|
||||
}
|
||||
|
@@ -110,18 +110,6 @@ public class FeedFetcherService {
|
||||
.build()
|
||||
);
|
||||
|
||||
Thread.ofPlatform().daemon(true).start(() -> {
|
||||
try {
|
||||
for (;;) {
|
||||
TimeUnit.SECONDS.sleep(15);
|
||||
logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
|
||||
}
|
||||
}
|
||||
catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
});
|
||||
|
||||
final RequestConfig defaultRequestConfig = RequestConfig.custom()
|
||||
.setCookieSpec(StandardCookieSpec.IGNORE)
|
||||
.setResponseTimeout(10, TimeUnit.SECONDS)
|
||||
|
@@ -22,8 +22,7 @@ dependencies {
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:index:query')
|
||||
implementation project(':code:libraries:language-processing')
|
||||
implementation project(':code:functions:language-processing')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user