1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00
Files
MarginaliaSearch/code/functions/language-processing/java/nu/marginalia/language/LanguageProcessingTool.java
Viktor Lofgren c661ebb619 (refac) Move language-processing into functions
It's long surpassed the single-responsibility library it once was, and is as such out of place in its original location, and fits better among the function-type modules.
2025-09-18 10:30:40 +02:00

158 lines
6.2 KiB
Java

package nu.marginalia.language;
import io.jooby.Context;
import io.jooby.Jooby;
import io.jooby.MapModelAndView;
import io.jooby.ModelAndView;
import nu.marginalia.LanguageModels;
import nu.marginalia.WmsaHome;
import nu.marginalia.keyword.extractors.*;
import nu.marginalia.language.config.LanguageConfigLocation;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
/**
 * Small Jooby web application for experimenting with the language processing code.
 *
 * <p>It serves a single route, {@code /}, for GET and POST.  A POST carries a
 * {@code textSample} form field; the sample is run through sentence extraction and
 * the keyword extractors, and the results are handed to the {@code keywords.jte}
 * template.
 */
public class LanguageProcessingTool extends Jooby {
    private static final Logger logger = LoggerFactory.getLogger(LanguageProcessingTool.class);

    /** Color names used by {@link #posTagStyles}, indexed by {@code rank & 7}. */
    private static final String[] POS_COLOR_NAMES = {
            "red", "green", "blue", "yellow", "purple", "cyan", "pink", "gray"
    };

    /**
     * Shade suffixes used by {@link #posTagStyles}, indexed by {@code (rank / 8) & 3}.
     *
     * <p>NOTE(review): the generated class names look like Tailwind text color utilities,
     * and 750 is not part of Tailwind's default shade scale -- confirm the stylesheet
     * actually defines it.
     */
    private static final String[] POS_COLOR_SHADES = { "900", "750", "400", "500" };

    private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
    private final TermFrequencyDict termFrequencyDict;

    /**
     * Starts the application through {@link Jooby#runApp}.
     *
     * <p>Declared {@code public} so that the class can also be launched by Java launchers
     * that require the classic {@code public static void main(String[])} signature.
     *
     * @param args command line arguments, passed on to Jooby unchanged
     */
    public static void main(String[] args) {
        Jooby.runApp(args, LanguageProcessingTool::new);
    }

    /**
     * Loads the language models, sets up sentence extraction, and registers the
     * template module, static assets and routes.
     *
     * @throws RuntimeException wrapping any exception raised during initialization;
     *                          the failure is logged before it is rethrown
     */
    public LanguageProcessingTool() {
        try {
            LanguageModels languageModels = getLanguageModels();

            termFrequencyDict = new TermFrequencyDict(languageModels);
            sentenceExtractorProvider = new ThreadLocalSentenceExtractorProvider(
                    new LanguageConfiguration(languageModels, new LanguageConfigLocation.Experimental()),
                    languageModels
            );

            // Relative path, so it is resolved against the current working directory.
            Path basePath = Path.of("code/functions/language-processing/").toAbsolutePath();
            logger.info("Base path: {}", basePath);

            Path templateDir = basePath.resolve("resources/ltt/jte");
            if (Files.exists(templateDir)) {
                install(new nu.marginalia.service.server.jte.JteModule(templateDir));
            }
            else {
                logger.warn("Template directory {} does not exist, JTE module not installed", templateDir);
            }

            Path staticDir = basePath.resolve("resources/ltt/static");
            if (Files.exists(staticDir)) {
                assets("/*", staticDir);
            }
            else {
                logger.warn("Static asset directory {} does not exist, assets not registered", staticDir);
            }

            get("/", this::handleKeywords);
            post("/", this::handleKeywords);
        }
        catch (Exception ex) {
            logger.error("Failed to initialize LanguageProcessingTool", ex);
            throw new RuntimeException(ex);
        }
    }

    /**
     * Handles both routes registered on {@code /}.
     *
     * <p>A GET renders the template with an empty text sample.  A POST reads the
     * {@code textSample} form field, extracts sentences from it, runs the keyword
     * extractors, and renders the template with the results.
     *
     * @param context the current request
     * @return the {@code keywords.jte} view together with its model
     * @throws IllegalArgumentException if the request method is neither GET nor POST
     */
    @NotNull
    private ModelAndView<?> handleKeywords(Context context) {
        String method = context.getMethod();

        if ("GET".equals(method)) {
            return new MapModelAndView("keywords.jte")
                    .put("textSample", "");
        }
        if (!"POST".equals(method)) {
            throw new IllegalArgumentException("Invalid method");
        }

        String textSample = context.form("textSample").value();
        DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(textSample);

        Map<Long, String> posStyles = posTagStyles(dld);

        var tfIdfCounts = new WordsTfIdfCounts(termFrequencyDict, dld);
        var titleKeywords = new TitleKeywords(dld);
        var nameLikeKeywords = new NameLikeKeywords(dld, 2);
        var subjectLikeKeywords = new SubjectLikeKeywords(tfIdfCounts, dld);
        var artifactKeywords = new ArtifactKeywords(dld);
        // var urlKeywords = new UrlKeywords(url);

        return new MapModelAndView("keywords.jte")
                .put("textSample", textSample)
                .put("language", dld.language())
                .put("tagColors", posStyles)
                .put("sentences", dld.sentences())
                .put("tfIdfReps", tfIdfCounts.getReps())
                .put("titleReps", titleKeywords.getReps())
                .put("nameLikeReps", nameLikeKeywords.getReps())
                .put("subjectLikeReps", subjectLikeKeywords.getReps())
                .put("artifacts", artifactKeywords.getWords());
    }

    /**
     * Assigns a CSS class name to each POS tag occurring in the document.
     *
     * <p>Tags are ranked by how often they occur, most frequent first, with ties broken
     * by ascending tag value so that the assignment does not depend on hash map
     * iteration order.  The rank selects a color name ({@code rank & 7}) and a shade
     * ({@code (rank / 8) & 3}), so the eight most frequent tags each get a different
     * color name in the first shade, and the combinations repeat after 32 tags.
     *
     * @param dld the document whose sentences supply the POS tags
     * @return a mutable map from POS tag to a class name of the form
     *         {@code text-<color>-<shade>}; empty if no tags are present
     */
    public static Map<Long, String> posTagStyles(DocumentLanguageData dld) {
        Map<Long, Integer> counts = new HashMap<>();
        for (var sentence : dld.sentences()) {
            for (var tag : sentence.posTags) {
                counts.merge(tag, 1, Integer::sum);
            }
        }

        // Rank by frequency first so that the most common tags are the ones that
        // receive the most clearly distinguishable colors.
        Comparator<Map.Entry<Long, Integer>> mostFrequentFirst =
                Map.Entry.<Long, Integer>comparingByValue(Comparator.reverseOrder())
                        .thenComparing(Map.Entry.<Long, Integer>comparingByKey());

        List<Long> posTagsByCount = counts.entrySet().stream()
                .sorted(mostFrequentFirst)
                .map(Map.Entry::getKey)
                .toList();

        Map<Long, String> styles = new HashMap<>();
        for (int i = 0; i < posTagsByCount.size(); i++) {
            String style = "text-" + POS_COLOR_NAMES[i & 0x7] + "-" + POS_COLOR_SHADES[(i / 8) & 3];
            styles.put(posTagsByCount.get(i), style);
        }
        return styles;
    }

    /**
     * Locates the directory holding the language model files.
     *
     * <p>The {@code LANGUAGE_MODELS_HOME} environment variable takes precedence.  Only
     * when it is unset is the default, the {@code model} directory under the WMSA home
     * path, looked up.
     *
     * @return the language models directory
     * @throws IllegalStateException if the chosen path is not a directory
     */
    private static Path getLanguageModelsPath() {
        final Path languageModelsHome = Optional.ofNullable(System.getenv("LANGUAGE_MODELS_HOME"))
                .map(Path::of)
                .orElseGet(() -> WmsaHome.getHomePath().resolve("model"));

        if (!Files.isDirectory(languageModelsHome)) {
            throw new IllegalStateException("Could not find $LANGUAGE_MODELS_HOME, see doc/language-models.md");
        }

        return languageModelsHome;
    }

    /**
     * Builds the {@link LanguageModels} descriptor from fixed file names inside the
     * directory returned by {@link #getLanguageModelsPath()}.
     *
     * @return the language model paths
     * @throws IllegalStateException if the language models directory cannot be found
     */
    private static LanguageModels getLanguageModels() {
        var languageModelsHome = getLanguageModelsPath();

        return new LanguageModels(
                languageModelsHome.resolve("tfreq-new-algo3.bin"),
                languageModelsHome.resolve("opennlp-sentence.bin"),
                languageModelsHome.resolve("English.RDR"),
                languageModelsHome.resolve("English.DICT"),
                languageModelsHome.resolve("lid.176.ftz"),
                languageModelsHome.resolve("segments.bin")
        );
    }
}