mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
(language) Integrate old keyword extraction logic with new test tool
This commit is contained in:
@@ -6,9 +6,12 @@ import io.jooby.MapModelAndView;
|
||||
import io.jooby.ModelAndView;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.keyword.KeywordExtractor;
|
||||
import nu.marginalia.keyword.extractors.*;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@@ -20,7 +23,7 @@ import java.util.*;
|
||||
public class LanguageProcessingTool extends Jooby {
|
||||
private static final Logger logger = LoggerFactory.getLogger(LanguageProcessingTool.class);
|
||||
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
|
||||
|
||||
private final TermFrequencyDict termFrequencyDict;
|
||||
/**
 * Program entry point: hands the command-line arguments to
 * {@code Jooby.runApp}, using the {@code LanguageProcessingTool}
 * constructor reference as the application factory.
 *
 * @param args command-line arguments, forwarded unchanged to {@code Jooby.runApp}
 */
static void main(String[] args) {
|
||||
Jooby.runApp(args, LanguageProcessingTool::new);
|
||||
}
|
||||
@@ -28,6 +31,7 @@ public class LanguageProcessingTool extends Jooby {
|
||||
public LanguageProcessingTool() {
|
||||
try {
|
||||
LanguageModels languageModels = getLanguageModels();
|
||||
termFrequencyDict = new TermFrequencyDict(languageModels);
|
||||
|
||||
sentenceExtractorProvider = new ThreadLocalSentenceExtractorProvider(
|
||||
new LanguageConfiguration(languageModels),
|
||||
@@ -65,13 +69,24 @@ public class LanguageProcessingTool extends Jooby {
|
||||
var dld = sentenceExtractorProvider.get().extractSentences(textSample);
|
||||
Map<String, String> posStyles = posTagStyles(dld);
|
||||
|
||||
System.out.println(posStyles);
|
||||
KeywordExtractor keywordExtractor = new KeywordExtractor();
|
||||
var tfIdfCounts = new WordsTfIdfCounts(termFrequencyDict, keywordExtractor, dld);
|
||||
var titleKeywords = new TitleKeywords(keywordExtractor, dld);
|
||||
var nameLikeKeywords = new NameLikeKeywords(keywordExtractor, dld, 2);
|
||||
var subjectLikeKeywords = new SubjectLikeKeywords(keywordExtractor, dld.language().isoCode(), tfIdfCounts, dld);
|
||||
var artifactKeywords = new ArtifactKeywords(dld);
|
||||
// var urlKeywords = new UrlKeywords(url);
|
||||
|
||||
return new MapModelAndView("keywords.jte")
|
||||
.put("textSample", textSample)
|
||||
.put("language", dld.language())
|
||||
.put("tagColors", posStyles)
|
||||
.put("sentences", dld.sentences());
|
||||
.put("sentences", dld.sentences())
|
||||
.put("tfIdfReps", tfIdfCounts.getReps())
|
||||
.put("titleReps", titleKeywords.getReps())
|
||||
.put("nameLikeReps", nameLikeKeywords.getReps())
|
||||
.put("subjectLikeReps", subjectLikeKeywords.getReps())
|
||||
.put("artifacts", artifactKeywords.getWords());
|
||||
}
|
||||
|
||||
public static Map<String, String> posTagStyles(DocumentLanguageData dld) {
|
||||
|
@@ -1,14 +1,19 @@
|
||||
@import nu.marginalia.language.LanguageProcessingTool
|
||||
@import nu.marginalia.language.model.WordRep
|
||||
@import nu.marginalia.language.model.DocumentSentence
|
||||
@import nu.marginalia.language.model.LanguageDefinition
|
||||
@import java.util.List
|
||||
@import java.util.Map
|
||||
@import java.util.*
|
||||
@import java.util.stream.IntStream
|
||||
|
||||
@param String textSample
|
||||
@param LanguageDefinition language
|
||||
@param List<DocumentSentence> sentences
|
||||
@param Map<String, String> tagColors
|
||||
@param Collection<WordRep> tfIdfReps
|
||||
@param Collection<WordRep> titleReps
|
||||
@param Collection<WordRep> nameLikeReps
|
||||
@param Collection<WordRep> subjectLikeReps
|
||||
@param Collection<String> artifacts
|
||||
|
||||
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
@@ -115,74 +120,85 @@
|
||||
<div class="p-4">
|
||||
<div class="grid grid-cols-1 md:grid-cols-3 gap-6">
|
||||
<!-- Keywords -->
|
||||
@if (tfIdfReps != null && !tfIdfReps.isEmpty())
|
||||
<div>
|
||||
<h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
|
||||
<i class="fas fa-star text-yellow-500 mr-2"></i>
|
||||
Keywords (TF-IDF)
|
||||
</h3>
|
||||
<div class="space-y-2">
|
||||
@for (WordRep rep : tfIdfReps)
|
||||
<div class="flex justify-between items-center p-2 bg-gray-50 rounded">
|
||||
<span class="text-sm font-medium">language</span>
|
||||
<span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">0.87</span>
|
||||
</div>
|
||||
<div class="flex justify-between items-center p-2 bg-gray-50 rounded">
|
||||
<span class="text-sm font-medium">processing</span>
|
||||
<span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">0.82</span>
|
||||
</div>
|
||||
<div class="flex justify-between items-center p-2 bg-gray-50 rounded">
|
||||
<span class="text-sm font-medium">fascinating</span>
|
||||
<span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">0.75</span>
|
||||
</div>
|
||||
<div class="flex justify-between items-center p-2 bg-gray-50 rounded">
|
||||
<span class="text-sm font-medium">natural</span>
|
||||
<span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">0.71</span>
|
||||
<span class="text-sm font-medium">${rep.word}</span>
|
||||
<span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">${rep.length}</span>
|
||||
</div>
|
||||
@endfor
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Bigrams -->
|
||||
<div>
|
||||
<h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
|
||||
<i class="fas fa-link text-blue-500 mr-2"></i>
|
||||
Bigrams
|
||||
</h3>
|
||||
<div class="space-y-2">
|
||||
<div class="p-2 bg-blue-50 border border-blue-200 rounded">
|
||||
<span class="text-sm font-medium">natural language</span>
|
||||
</div>
|
||||
<div class="p-2 bg-blue-50 border border-blue-200 rounded">
|
||||
<span class="text-sm font-medium">language processing</span>
|
||||
</div>
|
||||
<div class="p-2 bg-blue-50 border border-blue-200 rounded">
|
||||
<span class="text-sm font-medium">quick brown</span>
|
||||
</div>
|
||||
<div class="p-2 bg-blue-50 border border-blue-200 rounded">
|
||||
<span class="text-sm font-medium">brown fox</span>
|
||||
@endif
|
||||
@if (nameLikeReps != null && !nameLikeReps.isEmpty())
|
||||
<div>
|
||||
<h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
|
||||
<i class="fas fa-star text-yellow-500 mr-2"></i>
|
||||
Name-Like
|
||||
</h3>
|
||||
<div class="space-y-2">
|
||||
@for (WordRep rep : nameLikeReps)
|
||||
<div class="flex justify-between items-center p-2 bg-gray-50 rounded">
|
||||
<span class="text-sm font-medium">${rep.word}</span>
|
||||
<span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">${rep.length}</span>
|
||||
</div>
|
||||
@endfor
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Trigrams -->
|
||||
<div>
|
||||
<h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
|
||||
<i class="fas fa-sitemap text-green-500 mr-2"></i>
|
||||
Trigrams
|
||||
</h3>
|
||||
<div class="space-y-2">
|
||||
<div class="p-2 bg-green-50 border border-green-200 rounded">
|
||||
<span class="text-sm font-medium">natural language processing</span>
|
||||
</div>
|
||||
<div class="p-2 bg-green-50 border border-green-200 rounded">
|
||||
<span class="text-sm font-medium">the quick brown</span>
|
||||
</div>
|
||||
<div class="p-2 bg-green-50 border border-green-200 rounded">
|
||||
<span class="text-sm font-medium">quick brown fox</span>
|
||||
</div>
|
||||
<div class="p-2 bg-green-50 border border-green-200 rounded">
|
||||
<span class="text-sm font-medium">over the lazy</span>
|
||||
@endif
|
||||
@if (subjectLikeReps != null && !subjectLikeReps.isEmpty())
|
||||
<div>
|
||||
<h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
|
||||
<i class="fas fa-star text-yellow-500 mr-2"></i>
|
||||
Subject-Like
|
||||
</h3>
|
||||
<div class="space-y-2">
|
||||
@for (WordRep rep : subjectLikeReps)
|
||||
<div class="flex justify-between items-center p-2 bg-gray-50 rounded">
|
||||
<span class="text-sm font-medium">${rep.word}</span>
|
||||
<span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">${rep.length}</span>
|
||||
</div>
|
||||
@endfor
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
@endif
|
||||
@if (titleReps != null && !titleReps.isEmpty())
|
||||
<div>
|
||||
<h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
|
||||
<i class="fas fa-star text-yellow-500 mr-2"></i>
|
||||
Title
|
||||
</h3>
|
||||
<div class="space-y-2">
|
||||
@for (WordRep rep : titleReps)
|
||||
<div class="flex justify-between items-center p-2 bg-gray-50 rounded">
|
||||
<span class="text-sm font-medium">${rep.word}</span>
|
||||
<span class="text-xs text-gray-600 bg-yellow-100 px-2 py-1 rounded">${rep.length}</span>
|
||||
</div>
|
||||
@endfor
|
||||
</div>
|
||||
</div>
|
||||
@endif
|
||||
@if (artifacts != null && !artifacts.isEmpty())
|
||||
<div>
|
||||
<h3 class="text-sm font-semibold text-gray-700 mb-3 flex items-center">
|
||||
<i class="fas fa-star text-yellow-500 mr-2"></i>
|
||||
Artifacts
|
||||
</h3>
|
||||
<div class="space-y-2">
|
||||
@for (String word : artifacts)
|
||||
<div class="flex justify-between items-center p-2 bg-gray-50 rounded">
|
||||
<span class="text-sm font-medium">${word}</span>
|
||||
</div>
|
||||
@endfor
|
||||
</div>
|
||||
</div>
|
||||
@endif
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
Reference in New Issue
Block a user