1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

(language) Integrate old keyword extraction logic with new test tool

This commit is contained in:
Viktor Lofgren
2025-08-25 09:50:05 +02:00
parent f0741142a3
commit ce221d3a0e
2 changed files with 91 additions and 60 deletions

View File

@@ -6,9 +6,12 @@ import io.jooby.MapModelAndView;
import io.jooby.ModelAndView;
import nu.marginalia.LanguageModels;
import nu.marginalia.WmsaHome;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.keyword.extractors.*;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -20,7 +23,7 @@ import java.util.*;
public class LanguageProcessingTool extends Jooby {
private static final Logger logger = LoggerFactory.getLogger(LanguageProcessingTool.class);
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
private final TermFrequencyDict termFrequencyDict;
/**
 * Application entry point. Passes the command-line arguments and a reference
 * to the no-argument {@code LanguageProcessingTool} constructor to
 * {@code Jooby.runApp}.
 *
 * @param args command-line arguments, forwarded unchanged to {@code Jooby.runApp}
 */
static void main(String[] args) {
Jooby.runApp(args, LanguageProcessingTool::new);
}
@@ -28,6 +31,7 @@ public class LanguageProcessingTool extends Jooby {
public LanguageProcessingTool() {
try {
LanguageModels languageModels = getLanguageModels();
termFrequencyDict = new TermFrequencyDict(languageModels);
sentenceExtractorProvider = new ThreadLocalSentenceExtractorProvider(
new LanguageConfiguration(languageModels),
@@ -65,13 +69,24 @@ public class LanguageProcessingTool extends Jooby {
var dld = sentenceExtractorProvider.get().extractSentences(textSample);
Map<String, String> posStyles = posTagStyles(dld);
System.out.println(posStyles);
KeywordExtractor keywordExtractor = new KeywordExtractor();
var tfIdfCounts = new WordsTfIdfCounts(termFrequencyDict, keywordExtractor, dld);
var titleKeywords = new TitleKeywords(keywordExtractor, dld);
var nameLikeKeywords = new NameLikeKeywords(keywordExtractor, dld, 2);
var subjectLikeKeywords = new SubjectLikeKeywords(keywordExtractor, dld.language().isoCode(), tfIdfCounts, dld);
var artifactKeywords = new ArtifactKeywords(dld);
// var urlKeywords = new UrlKeywords(url);
return new MapModelAndView("keywords.jte")
.put("textSample", textSample)
.put("language", dld.language())
.put("tagColors", posStyles)
.put("sentences", dld.sentences());
.put("sentences", dld.sentences())
.put("tfIdfReps", tfIdfCounts.getReps())
.put("titleReps", titleKeywords.getReps())
.put("nameLikeReps", nameLikeKeywords.getReps())
.put("subjectLikeReps", subjectLikeKeywords.getReps())
.put("artifacts", artifactKeywords.getWords());
}
public static Map<String, String> posTagStyles(DocumentLanguageData dld) {

View File

@@ -1,14 +1,19 @@
@import nu.marginalia.language.LanguageProcessingTool
@import nu.marginalia.language.model.WordRep
@import nu.marginalia.language.model.DocumentSentence
@import nu.marginalia.language.model.LanguageDefinition
@import java.util.List
@import java.util.Map
@import java.util.*
@import java.util.stream.IntStream
@param String textSample
@param LanguageDefinition language
@param List<DocumentSentence> sentences
@param Map<String, String> tagColors
@param Collection<WordRep> tfIdfReps
@param Collection<WordRep> titleReps
@param Collection<WordRep> nameLikeReps
@param Collection<WordRep> subjectLikeReps
@param Collection<String> artifacts
<!DOCTYPE html>
<html lang="en">
@@ -115,74 +120,85 @@
<div class="p-4">
<div class="grid grid-cols-1 md:grid-cols-3 gap-6">
<!-- Keywords -->
@if (tfIdfReps != null && !tfIdfReps.isEmpty())
<div>
<h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
<i class="fas fa-star text-yellow-500 mr-2"></i>
Keywords (TF-IDF)
</h3>
<div class="space-y-2">
@for (WordRep rep : tfIdfReps)
<div class="flex justify-between items-center p-2 bg-gray-50 rounded">
<span class="text-sm font-medium">language</span>
<span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">0.87</span>
</div>
<div class="flex justify-between items-center p-2 bg-gray-50 rounded">
<span class="text-sm font-medium">processing</span>
<span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">0.82</span>
</div>
<div class="flex justify-between items-center p-2 bg-gray-50 rounded">
<span class="text-sm font-medium">fascinating</span>
<span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">0.75</span>
</div>
<div class="flex justify-between items-center p-2 bg-gray-50 rounded">
<span class="text-sm font-medium">natural</span>
<span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">0.71</span>
<span class="text-sm font-medium">${rep.word}</span>
<span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">${rep.length}</span>
</div>
@endfor
</div>
</div>
<!-- Bigrams -->
<div>
<h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
<i class="fas fa-link text-blue-500 mr-2"></i>
Bigrams
</h3>
<div class="space-y-2">
<div class="p-2 bg-blue-50 border border-blue-200 rounded">
<span class="text-sm font-medium">natural language</span>
</div>
<div class="p-2 bg-blue-50 border border-blue-200 rounded">
<span class="text-sm font-medium">language processing</span>
</div>
<div class="p-2 bg-blue-50 border border-blue-200 rounded">
<span class="text-sm font-medium">quick brown</span>
</div>
<div class="p-2 bg-blue-50 border border-blue-200 rounded">
<span class="text-sm font-medium">brown fox</span>
@endif
@if (nameLikeReps != null && !nameLikeReps.isEmpty())
<div>
<h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
<i class="fas fa-star text-yellow-500 mr-2"></i>
Name-Like
</h3>
<div class="space-y-2">
@for (WordRep rep : nameLikeReps)
<div class="flex justify-between items-center p-2 bg-gray-50 rounded">
<span class="text-sm font-medium">${rep.word}</span>
<span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">${rep.length}</span>
</div>
@endfor
</div>
</div>
</div>
<!-- Trigrams -->
<div>
<h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
<i class="fas fa-sitemap text-green-500 mr-2"></i>
Trigrams
</h3>
<div class="space-y-2">
<div class="p-2 bg-green-50 border border-green-200 rounded">
<span class="text-sm font-medium">natural language processing</span>
</div>
<div class="p-2 bg-green-50 border border-green-200 rounded">
<span class="text-sm font-medium">the quick brown</span>
</div>
<div class="p-2 bg-green-50 border border-green-200 rounded">
<span class="text-sm font-medium">quick brown fox</span>
</div>
<div class="p-2 bg-green-50 border border-green-200 rounded">
<span class="text-sm font-medium">over the lazy</span>
@endif
@if (subjectLikeReps != null && !subjectLikeReps.isEmpty())
<div>
<h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
<i class="fas fa-star text-yellow-500 mr-2"></i>
Subject-Like
</h3>
<div class="space-y-2">
@for (WordRep rep : subjectLikeReps)
<div class="flex justify-between items-center p-2 bg-gray-50 rounded">
<span class="text-sm font-medium">${rep.word}</span>
<span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">${rep.length}</span>
</div>
@endfor
</div>
</div>
</div>
@endif
<%-- Title keywords: the section is omitted entirely when titleReps is null or empty. --%>
@if (titleReps != null && !titleReps.isEmpty())
<div>
<h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
<i class="fas fa-star text-yellow-500 mr-2"></i>
Title
</h3>
<div class="space-y-2">
<%-- One row per WordRep: the word on the left, its length field in the badge on the right. --%>
@for (WordRep rep : titleReps)
<div class="flex justify-between items-center p-2 bg-gray-50 rounded">
<span class="text-sm font-medium">${rep.word}</span>
<span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">${rep.length}</span>
</div>
@endfor
</div>
</div>
@endif
<%-- Artifact keywords: the section is omitted entirely when artifacts is null or empty. --%>
@if (artifacts != null && !artifacts.isEmpty())
<div>
<h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
<i class="fas fa-star text-yellow-500 mr-2"></i>
Artifacts
</h3>
<div class="space-y-2">
<%-- artifacts is a Collection<String>, so each row shows only the word (no badge). --%>
@for (String word : artifacts)
<div class="flex justify-between items-center p-2 bg-gray-50 rounded">
<span class="text-sm font-medium">${word}</span>
</div>
@endfor
</div>
</div>
@endif
</div>
</div>
</div>