mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 17:32:39 +02:00
180 lines
7.2 KiB
Java
180 lines
7.2 KiB
Java
package nu.marginalia.language;
|
|
|
|
import io.jooby.Context;
|
|
import io.jooby.Jooby;
|
|
import io.jooby.MapModelAndView;
|
|
import io.jooby.ModelAndView;
|
|
import nu.marginalia.LanguageModels;
|
|
import nu.marginalia.WmsaHome;
|
|
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
|
import nu.marginalia.keyword.LinkTexts;
|
|
import nu.marginalia.keyword.extractors.*;
|
|
import nu.marginalia.language.config.LanguageConfigLocation;
|
|
import nu.marginalia.language.config.LanguageConfiguration;
|
|
import nu.marginalia.language.model.DocumentLanguageData;
|
|
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
|
import nu.marginalia.model.EdgeUrl;
|
|
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
|
import org.jetbrains.annotations.NotNull;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
import java.net.URISyntaxException;
|
|
import java.nio.file.Files;
|
|
import java.nio.file.Path;
|
|
import java.util.*;
|
|
|
|
public class LanguageProcessingTool extends Jooby {
|
|
private static final Logger logger = LoggerFactory.getLogger(LanguageProcessingTool.class);
|
|
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
|
|
private final TermFrequencyDict termFrequencyDict;
|
|
|
|
static void main(String[] args) {
|
|
Jooby.runApp(args, LanguageProcessingTool::new);
|
|
}
|
|
|
|
public LanguageProcessingTool() {
|
|
try {
|
|
LanguageModels languageModels = getLanguageModels();
|
|
termFrequencyDict = new TermFrequencyDict(languageModels);
|
|
|
|
sentenceExtractorProvider = new ThreadLocalSentenceExtractorProvider(
|
|
new LanguageConfiguration(languageModels, new LanguageConfigLocation.Experimental()),
|
|
languageModels
|
|
);
|
|
|
|
// Depending on how the tool is started, we may be in the project root, or the module root;
|
|
// so here's some guesswork to try to suss out which one it is...
|
|
Path basePath = Path.of("code/functions/language-processing/").toAbsolutePath();
|
|
if (!Files.exists(basePath)) {
|
|
basePath = Path.of(".").toAbsolutePath();
|
|
}
|
|
|
|
System.out.println("Base path: " + basePath);
|
|
|
|
if (Files.exists(basePath.resolve("resources/ltt/jte")))
|
|
install(new nu.marginalia.service.server.jte.JteModule(basePath.resolve("resources/ltt/jte")));
|
|
if (Files.exists(basePath.resolve("resources/ltt/static")))
|
|
assets("/*", basePath.resolve("resources/ltt/static"));
|
|
|
|
get("/", this::handleKeywords);
|
|
post("/", this::handleKeywords);
|
|
}
|
|
catch (Exception ex) {
|
|
logger.error("Failed to initialize LanguageProcessingTool", ex);
|
|
throw new RuntimeException(ex);
|
|
}
|
|
}
|
|
// Assign colors to the POS tags
|
|
|
|
@NotNull
|
|
private ModelAndView<?> handleKeywords(Context context) throws URISyntaxException {
|
|
if ("GET".equals(context.getMethod())) {
|
|
return new MapModelAndView("keywords.jte")
|
|
.put("textSample", "");
|
|
}
|
|
else if (!"POST".equals(context.getMethod())) {
|
|
throw new IllegalArgumentException("Invalid method");
|
|
}
|
|
|
|
String textSample = context.form("textSample").value();
|
|
|
|
// Run sentende extration on the text as-is
|
|
DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(textSample);
|
|
|
|
// Run individual extraction logic
|
|
var tfIdfCounts = new WordsTfIdfCounts(termFrequencyDict, dld);
|
|
var titleKeywords = new TitleKeywords(dld);
|
|
var nameLikeKeywords = new NameLikeKeywords(dld, 2);
|
|
var subjectLikeKeywords = new SubjectLikeKeywords(tfIdfCounts, dld);
|
|
var artifactKeywords = new ArtifactKeywords(dld);
|
|
|
|
// Run full extraction logic to capture positioning etc
|
|
var extractedKeywords = new DocumentKeywordExtractor(termFrequencyDict)
|
|
.extractKeywords(dld, new LinkTexts(), new EdgeUrl("https://www.example.com/"));
|
|
|
|
return new MapModelAndView("keywords.jte")
|
|
.put("textSample", textSample)
|
|
.put("language", dld.language())
|
|
.put("tagColors", posTagStyles(dld))
|
|
.put("sentences", dld.sentences())
|
|
.put("tfIdfReps", tfIdfCounts.getReps())
|
|
.put("titleReps", titleKeywords.getReps())
|
|
.put("nameLikeReps", nameLikeKeywords.getReps())
|
|
.put("subjectLikeReps", subjectLikeKeywords.getReps())
|
|
.put("artifacts", artifactKeywords.getWords())
|
|
.put("importantWords", extractedKeywords.importantWords)
|
|
.put("positionedWords", extractedKeywords.wordToPos);
|
|
}
|
|
|
|
/**
|
|
* Generate unique colors for each POS tag, to help the UI rendering
|
|
*/
|
|
public static Map<Long, String> posTagStyles(DocumentLanguageData dld) {
|
|
Map<Long, String> styles = new HashMap<>();
|
|
|
|
// we sort them first to ensure the most common tags are guaranteed to have
|
|
// the largest difference between colors
|
|
|
|
Map<Long, Integer> counts = new HashMap<>();
|
|
for (var sentence : dld.sentences()) {
|
|
for (var tag : sentence.posTags) {
|
|
counts.merge(tag, 1, Integer::sum);
|
|
}
|
|
}
|
|
|
|
List<Long> posTagsByCount = counts
|
|
.entrySet().stream()
|
|
.sorted(Map.Entry.comparingByValue(Comparator.reverseOrder()))
|
|
.map(Map.Entry::getKey)
|
|
.toList();
|
|
|
|
|
|
for (int i = 0; i < posTagsByCount.size(); i++) {
|
|
String style = "text-" + switch (i&0x7) {
|
|
case 0 -> "red";
|
|
case 1 -> "green";
|
|
case 2 -> "blue";
|
|
case 3 -> "yellow";
|
|
case 4 -> "purple";
|
|
case 5 -> "cyan";
|
|
case 6 -> "pink";
|
|
default -> "gray";
|
|
}+"-"+switch((i/8) & 3) {
|
|
case 0 -> "900";
|
|
case 3 -> "500";
|
|
case 1 -> "750";
|
|
case 2 -> "400";
|
|
default -> "300";
|
|
};
|
|
styles.put(posTagsByCount.get(i), style);
|
|
}
|
|
return styles;
|
|
}
|
|
|
|
private static final Path LANGUAGE_MODELS_DEFAULT = WmsaHome.getHomePath().resolve("model");
|
|
private static Path getLanguageModelsPath() {
|
|
final Path languageModelsHome = Optional.ofNullable(System.getenv("LANGUAGE_MODELS_HOME"))
|
|
.map(Path::of)
|
|
.orElse(LANGUAGE_MODELS_DEFAULT);
|
|
|
|
if (!Files.isDirectory(languageModelsHome)) {
|
|
throw new IllegalStateException("Could not find $LANGUAGE_MODELS_HOME, see doc/language-models.md");
|
|
}
|
|
return languageModelsHome;
|
|
}
|
|
private static LanguageModels getLanguageModels() {
|
|
|
|
var languageModelsHome = getLanguageModelsPath();
|
|
|
|
return new LanguageModels(
|
|
languageModelsHome.resolve("tfreq-new-algo3.bin"),
|
|
languageModelsHome.resolve("opennlp-sentence.bin"),
|
|
languageModelsHome.resolve("English.RDR"),
|
|
languageModelsHome.resolve("English.DICT"),
|
|
languageModelsHome.resolve("lid.176.ftz"),
|
|
languageModelsHome.resolve("segments.bin")
|
|
);
|
|
}
|
|
}
|